Hi,
Jiufu Guo via Gcc-patches <gcc-patches@gcc.gnu.org> writes: > Hi, > > Richard Biener <rguent...@suse.de> writes: > >> On Mon, 24 Jul 2023, Jiufu Guo wrote: >> >>> >>> Hi Martin, >>> >>> Not sure about your current option about re-using the ipa-sra code >>> in the light-expander-sra. And if anything I could input please >>> let me know. >>> ... >> >> What I was hoping for is shared stmt-level analysis and a shared >> data structure for the "access"(es) a stmt performs. Because that >> can come up handy in multiple places. The existing SRA data >> structures could easily embed that subset for example if sharing >> the whole data structure of [IPA] SRA seems too unwieldly. > > Understand. > The stmt-level analysis and "access" data structure are similar > between ipa-sra/tree-sra and the expander-sra. > > I just update the patch, this version does not change the behaviors of > the previous version. It is just cleaning/merging some functions only. > The patch is attached. > > This version (and tree-sra/ipa-sra) is still using the similar > "stmt analyze" and "access struct"". This could be extracted as > shared code. > I'm thinking to update the code to use the same "base_access" and > "walk function". I'm drafting code for the shared stmt-analyze and access-structure. The code may like below. BR, Jeff (Jiufu Guo) ----------------------- struct base_access { /* Values returned by get_ref_base_and_extent, indicates the OFFSET, SIZE and BASE of the access. */ HOST_WIDE_INT offset; HOST_WIDE_INT size; tree base; /* The context expression of this access. */ tree expr; /* Indicates this is a write access. */ bool write : 1; /* Indicates if this access is made in reverse storage order. */ bool reverse : 1; }; /* Default template for sra_scan_function. */ struct default_analyzer { /* Template analyze functions. 
*/ void analyze_phi (gphi *){}; void pre_analyze_stmt (gimple *){}; void analyze_return (greturn *){}; void analyze_assign (gassign *){}; void analyze_call (gcall *){}; void analyze_asm (gasm *){}; void analyze_default_stmt (gimple *){}; }; /* Scan function and look for interesting expressions. */ template <typename analyzer> void sra_scan_function (struct function *fun, analyzer &a) { basic_block bb; FOR_EACH_BB_FN (bb, fun) { for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) a.analyze_phi (gsi.phi ()); for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) { gimple *stmt = gsi_stmt (gsi); a.pre_analyze_stmt (stmt); switch (gimple_code (stmt)) { case GIMPLE_RETURN: a.analyze_return (as_a<greturn *> (stmt)); break; case GIMPLE_ASSIGN: a.analyze_assign (as_a<gassign *> (stmt)); break; case GIMPLE_CALL: a.analyze_call (as_a<gcall *> (stmt)); break; case GIMPLE_ASM: a.analyze_asm (as_a<gasm *> (stmt)); break; default: a.analyze_default_stmt (stmt); break; } } } } struct access : public base_access { /* The rtx for the access: link to incoming/returning register(s). */ rtx rtx_val; }; struct expand_access_analyzer : public default_analyzer { /* Now use default APIs, no actions for pre_analyze_stmt, analyze_return. */ /* overwrite analyze_default_stmt. */ void analyze_default_stmt (gimple *); /* overwrite analyze phi,call,asm . */ void analyze_phi (gphi *stmt) { analyze_default_stmt (stmt); }; void analyze_call (gcall *stmt) { analyze_default_stmt (stmt); }; void analyze_asm (gasm *stmt) { analyze_default_stmt (stmt); }; /* overwrite analyze_assign. */ void analyze_assign (gassign *); }; > >> >> With a stmt-leve API using FOR_EACH_IMM_USE_STMT would still be >> possible (though RTL expansion pre-walks all stmts anyway). > > Yeap, I also notice that "FOR_EACH_IMM_USE_STMT" is not enough. > For struct parameters, walking stmt is needed. 
> > > BR, > Jeff (Jiufu Guo) > > ----------------------------- > diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc > index edf292cfbe9..8c36ad5df79 100644 > --- a/gcc/cfgexpand.cc > +++ b/gcc/cfgexpand.cc > @@ -97,6 +97,502 @@ static bool defer_stack_allocation (tree, bool); > > static void record_alignment_for_reg_var (unsigned int); > > +extern rtx > +expand_shift (enum tree_code, machine_mode, rtx, poly_int64, rtx, int); > + > +/* For light SRA in expander about paramaters and returns. */ > +namespace > +{ > + > +struct access > +{ > + /* Each accessing on the aggragate is about OFFSET/SIZE. */ > + HOST_WIDE_INT offset; > + HOST_WIDE_INT size; > + > + bool writing; > + > + /* The context expression of this access. */ > + tree expr; > + > + /* The rtx for the access: link to incoming/returning register(s). */ > + rtx rtx_val; > +}; > + > +typedef struct access *access_p; > + > +/* Expr (tree) -> Scalarized value (rtx) map. */ > +static hash_map<tree, rtx> *expr_rtx_vec; > + > +/* Base (tree) -> Vector (vec<access_p> *) map. */ > +static hash_map<tree, auto_vec<access_p> > *base_access_vec; > + > +/* Return true if EXPR has interesting access to the sra candidates, > + and created access, return false otherwise. */ > + > +static struct access * > +build_access (tree expr, bool write) > +{ > + enum tree_code code = TREE_CODE (expr); > + if (code != VAR_DECL && code != PARM_DECL && code != COMPONENT_REF > + && code != ARRAY_REF && code != ARRAY_RANGE_REF) > + return NULL; > + > + HOST_WIDE_INT offset, size; > + bool reverse; > + tree base = get_ref_base_and_extent_hwi (expr, &offset, &size, &reverse); > + if (!base || !DECL_P (base)) > + return NULL; > + > + vec<access_p> *access_vec = base_access_vec->get (base); > + if (!access_vec) > + return NULL; > + > + /* TODO: support reverse. 
*/ > + if (reverse || size <= 0 || offset + size > tree_to_shwi (DECL_SIZE > (base))) > + { > + base_access_vec->remove (base); > + return NULL; > + } > + > + struct access *access = XNEWVEC (struct access, 1); > + > + memset (access, 0, sizeof (struct access)); > + access->offset = offset; > + access->size = size; > + access->expr = expr; > + access->writing = write; > + access->rtx_val = NULL_RTX; > + > + access_vec->safe_push (access); > + > + return access; > +} > + > +/* Callback of walk_stmt_load_store_addr_ops visit_base used to remove > + operands with address taken. */ > + > +static bool > +visit_base (gimple *, tree op, tree, void *) > +{ > + op = get_base_address (op); > + if (op && DECL_P (op)) > + base_access_vec->remove (op); > + > + return false; > +} > + > +/* Scan function and look for interesting expressions and create access > + structures for them. */ > + > +static void > +collect_acccesses (void) > +{ > + basic_block bb; > + > + FOR_EACH_BB_FN (bb, cfun) > + { > + for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi); > + gsi_next (&gsi)) > + walk_stmt_load_store_addr_ops (gsi.phi (), NULL, NULL, NULL, > + visit_base); > + > + for (gimple_stmt_iterator gsi = gsi_start_nondebug_after_labels_bb > (bb); > + !gsi_end_p (gsi); gsi_next_nondebug (&gsi)) > + { > + gimple *stmt = gsi_stmt (gsi); > + switch (gimple_code (stmt)) > + { > + case GIMPLE_RETURN: > + continue; > + case GIMPLE_ASSIGN: > + if (gimple_assign_single_p (stmt) && !gimple_clobber_p (stmt)) > + { > + tree rhs = gimple_assign_rhs1 (stmt); > + tree lhs = gimple_assign_lhs (stmt); > + bool res_r = build_access (rhs, false); > + bool res_l = build_access (lhs, true); > + if (res_l && TREE_CODE (rhs) != CONSTRUCTOR) > + base_access_vec->remove (get_base_address (lhs)); > + > + if (res_l || res_r) > + continue; > + } > + break; > + default: > + break; > + } > + > + walk_stmt_load_store_addr_ops (stmt, NULL, visit_base, visit_base, > + visit_base); > + } > + } > +} > + > +/* Return 
true if VAR is a candidate for SRA. */ > + > +static bool > +add_sra_candidate (tree var) > +{ > + tree type = TREE_TYPE (var); > + > + if (!AGGREGATE_TYPE_P (type) || !tree_fits_shwi_p (TYPE_SIZE (type)) > + || tree_to_shwi (TYPE_SIZE (type)) == 0 || TREE_THIS_VOLATILE (var) > + || TYPE_MAIN_VARIANT (type) == TYPE_MAIN_VARIANT (va_list_type_node)) > + return false; > + gcc_assert (COMPLETE_TYPE_P (type)); > + > + base_access_vec->get_or_insert (var); > + > + return true; > +} > + > +/* Collect the parameter and returns with type which is suitable for > + * scalarization. */ > + > +static bool > +collect_sra_candidates (void) > +{ > + bool ret = false; > + > + /* Collect parameters. */ > + for (tree parm = DECL_ARGUMENTS (current_function_decl); parm; > + parm = DECL_CHAIN (parm)) > + ret |= add_sra_candidate (parm); > + > + /* Collect VARs on returns. */ > + if (DECL_RESULT (current_function_decl)) > + { > + edge_iterator ei; > + edge e; > + FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) > + if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb (e->src))) > + { > + tree val = gimple_return_retval (r); > + /* To sclaraized the return, the return value should be only > + writen, except this return stmt. > + Then using 'true(write)' to create the access. */ > + if (val && VAR_P (val)) > + ret |= add_sra_candidate (val) && build_access (val, true); > + } > + } > + > + return ret; > +} > + > +/* Check if the accesses of BASE are scalarizbale. > + Now, only scalarize the parms only with reading > + or returns only with writing. */ > + > +static bool > +with_scalariable_accesses (vec<access_p> *access_vec, bool is_parm) > +{ > + if (access_vec->is_empty ()) > + return false; > + > + for (unsigned int j = 0; j < access_vec->length (); j++) > + { > + struct access *access = (*access_vec)[j]; > + /* Writing to a field of parameter. */ > + if (is_parm && access->writing) > + return false; > + > + /* Writing to a field of parameter. 
*/ > + if (!is_parm && !access->writing) > + return false; > + } > + > + return true; > +} > + > +static void > +prepare_expander_sra () > +{ > + if (optimize <= 0) > + return; > + > + base_access_vec = new hash_map<tree, auto_vec<access_p> >; > + expr_rtx_vec = new hash_map<tree, rtx>; > + > + if (collect_sra_candidates ()) > + collect_acccesses (); > +} > + > +static void > +free_expander_sra () > +{ > + if (optimize <= 0) > + return; > + delete expr_rtx_vec; > + expr_rtx_vec = 0; > + delete base_access_vec; > + base_access_vec = 0; > +} > +} /* namespace */ > + > +namespace > +{ > +/* Get the register at INDEX from a parallel REGS. */ > + > +static rtx > +extract_parallel_reg (rtx regs, int index) > +{ > + rtx orig_reg = XEXP (XVECEXP (regs, 0, index), 0); > + if (!HARD_REGISTER_P (orig_reg)) > + return orig_reg; > + > + /* Reading from param hard reg need to be moved to a temp. */ > + rtx reg = gen_reg_rtx (GET_MODE (orig_reg)); > + emit_move_insn (reg, orig_reg); > + return reg; > +} > + > +/* Get IMODE part from REG at OFF_BITS. 
*/ > + > +static rtx > +extract_sub_reg (rtx orig_reg, int off_bits, machine_mode mode) > +{ > + scalar_int_mode imode; > + if (!int_mode_for_mode (mode).exists (&imode)) > + return NULL_RTX; > + > + machine_mode orig_mode = GET_MODE (orig_reg); > + gcc_assert (GET_MODE_CLASS (orig_mode) == MODE_INT); > + > + poly_uint64 lowpart_off = subreg_lowpart_offset (imode, orig_mode); > + int lowpart_off_bits = lowpart_off.to_constant () * BITS_PER_UNIT; > + int shift_bits; > + if (lowpart_off_bits >= off_bits) > + shift_bits = lowpart_off_bits - off_bits; > + else > + shift_bits = off_bits - lowpart_off_bits; > + > + rtx reg = orig_reg; > + if (shift_bits > 0) > + reg = expand_shift (RSHIFT_EXPR, orig_mode, reg, shift_bits, NULL, 1); > + > + rtx subreg = gen_lowpart (imode, reg); > + rtx result = gen_reg_rtx (imode); > + emit_move_insn (result, subreg); > + > + if (mode != imode) > + result = gen_lowpart (mode, result); > + > + return result; > +} > + > +/* Extract subfields from the REG at START bits to TARGET at OFF, > + BITS parameter is the total number extract bits. */ > + > +static int > +extract_fields_from_reg (rtx reg, int bits, int start, rtx *target, > + HOST_WIDE_INT off) > +{ > + machine_mode mode_aux[] = {SImode, HImode, QImode}; > + int margins = sizeof (mode_aux) / sizeof (mode_aux[0]); > + HOST_WIDE_INT off_bits = start; > + rtx *p = target; > + for (int n = 0; n < margins; n++) > + { > + machine_mode mode = mode_aux[n]; > + HOST_WIDE_INT bitsize = GET_MODE_BITSIZE (mode).to_constant (); > + if (bits < bitsize) > + continue; > + > + rtx subreg = extract_sub_reg (reg, off_bits, mode); > + *p++ = gen_rtx_EXPR_LIST (mode, subreg, GEN_INT (off)); > + off += bitsize / BITS_PER_UNIT; > + off_bits += bitsize; > + bits -= bitsize; > + } > + > + return p - target; > +} > +} /* namespace */ > + > +/* Check If there is an sra access for the expr. > + Return the correspond scalar sym for the access. 
*/ > + > +rtx > +get_scalar_rtx_for_aggregate_expr (tree expr) > +{ > + if (!expr_rtx_vec) > + return NULL_RTX; > + rtx *val = expr_rtx_vec->get (expr); > + return val ? *val : NULL_RTX; > +} > + > +/* Compute/Set RTX registers for those accesses on BASE. */ > + > +void > +set_scalar_rtx_for_aggregate_access (tree base, rtx regs) > +{ > + if (!base_access_vec) > + return; > + vec<access_p> *access_vec = base_access_vec->get (base); > + if (!access_vec) > + return; > + bool is_parm = TREE_CODE (base) == PARM_DECL; > + if (!with_scalariable_accesses (access_vec, is_parm)) > + return; > + > + /* Go through each access, compute corresponding rtx(regs or subregs) > + for the expression. */ > + int n = access_vec->length (); > + int cur_access_index = 0; > + for (; cur_access_index < n; cur_access_index++) > + { > + access_p acc = (*access_vec)[cur_access_index]; > + machine_mode expr_mode = TYPE_MODE (TREE_TYPE (acc->expr)); > + > + /* mode of mult registers. */ > + if (expr_mode != BLKmode > + && known_gt (acc->size, GET_MODE_BITSIZE (word_mode))) > + break; > + > + /* Compute the position of the access in the whole parallel rtx. */ > + int start_index = -1; > + int end_index = -1; > + HOST_WIDE_INT left_bits = 0; > + HOST_WIDE_INT right_bits = 0; > + int cur_index = XEXP (XVECEXP (regs, 0, 0), 0) ? 0 : 1; > + for (; cur_index < XVECLEN (regs, 0); cur_index++) > + { > + rtx slot = XVECEXP (regs, 0, cur_index); > + HOST_WIDE_INT off = UINTVAL (XEXP (slot, 1)) * BITS_PER_UNIT; > + machine_mode mode = GET_MODE (XEXP (slot, 0)); > + HOST_WIDE_INT size = GET_MODE_BITSIZE (mode).to_constant (); > + if (off <= acc->offset && off + size > acc->offset) > + { > + start_index = cur_index; > + left_bits = acc->offset - off; > + } > + if (off + size >= acc->offset + acc->size) > + { > + end_index = cur_index; > + right_bits = off + size - (acc->offset + acc->size); > + break; > + } > + } > + /* Invalid access possition: padding or outof bound. 
*/ > + if (start_index < 0 || end_index < 0) > + break; > + > + /* Need a parallel for possible multi-registers. */ > + if (expr_mode == BLKmode || end_index > start_index) > + { > + /* More possible space for SI, HI, QI. */ > + machine_mode mode_aux[] = {SImode, HImode, QImode}; > + int margins = sizeof (mode_aux) / sizeof (mode_aux[0]); > + int extra = (right_bits ? margins : 0) + (left_bits ? margins : 0); > + int num_words = end_index - start_index + 1; > + num_words -= (right_bits ? 1 : 0); > + num_words -= (left_bits ? 1 : 0); > + rtx *tmps = XALLOCAVEC (rtx, num_words + extra); > + > + int pos = 0; > + /* There are extra fields from the left part of the start reg. */ > + if (left_bits) > + { > + gcc_assert (!acc->writing); > + gcc_assert ((left_bits % BITS_PER_UNIT) == 0); > + > + rtx reg = XEXP (XVECEXP (regs, 0, start_index), 0); > + machine_mode mode = GET_MODE (reg); > + int reg_size = GET_MODE_BITSIZE (mode).to_constant (); > + int bits = reg_size - left_bits; > + pos = extract_fields_from_reg (reg, bits, left_bits, tmps, 0); > + } > + > + HOST_WIDE_INT start; > + start = UINTVAL (XEXP (XVECEXP (regs, 0, start_index), 1)); > + start -= left_bits / BITS_PER_UNIT; > + /* Extract whole registers. */ > + for (; pos < num_words; pos++) > + { > + int index = start_index + pos; > + rtx reg = extract_parallel_reg (regs, index); > + machine_mode mode = GET_MODE (reg); > + HOST_WIDE_INT off; > + off = UINTVAL (XEXP (XVECEXP (regs, 0, index), 1)) - start; > + tmps[pos] = gen_rtx_EXPR_LIST (mode, reg, GEN_INT (off)); > + } > + > + /* No more fields. */ > + if (right_bits == 0) > + { > + rtx reg = gen_rtx_PARALLEL (expr_mode, gen_rtvec_v (pos, tmps)); > + acc->rtx_val = reg; > + continue; > + } > + > + /* There are extra fields from the part of register. 
*/ > + gcc_assert (!acc->writing); > + gcc_assert ((right_bits % BITS_PER_UNIT) == 0); > + > + HOST_WIDE_INT off; > + off = UINTVAL (XEXP (XVECEXP (regs, 0, end_index), 1)) - start; > + rtx reg = XEXP (XVECEXP (regs, 0, end_index), 0); > + machine_mode mode = GET_MODE (reg); > + int reg_size = GET_MODE_BITSIZE (mode).to_constant (); > + int bits = reg_size - right_bits; > + pos += extract_fields_from_reg (reg, bits, 0, tmps + pos, off); > + > + /* Currently, PARALLELs with register elements for param/returns > + are using BLKmode. */ > + acc->rtx_val = gen_rtx_PARALLEL (expr_mode, gen_rtvec_v (pos, tmps)); > + continue; > + } > + > + /* Just need one reg for the correspond access. */ > + if (end_index == start_index && left_bits == 0 && right_bits == 0) > + { > + rtx reg = extract_parallel_reg (regs, start_index); > + if (GET_MODE (reg) != expr_mode) > + reg = gen_lowpart (expr_mode, reg); > + > + acc->rtx_val = reg; > + continue; > + } > + > + /* Need to shift to extract a part reg for the access. */ > + if (!acc->writing && end_index == start_index) > + { > + rtx orig_reg = XEXP (XVECEXP (regs, 0, start_index), 0); > + acc->rtx_val = extract_sub_reg (orig_reg, left_bits, expr_mode); > + if (acc->rtx_val) > + continue; > + } > + > + break; > + } > + > + /* If all access expr(s) are not scalarized, > + bind/map all expr(tree) to sclarized rtx. 
*/ > + if (cur_access_index == n) > + for (int j = 0; j < n; j++) > + { > + access_p access = (*access_vec)[j]; > + expr_rtx_vec->put (access->expr, access->rtx_val); > + } > +} > + > +void > +set_scalar_rtx_for_returns () > +{ > + tree res = DECL_RESULT (current_function_decl); > + gcc_assert (res); > + edge_iterator ei; > + edge e; > + FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) > + if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb (e->src))) > + { > + tree val = gimple_return_retval (r); > + if (val && VAR_P (val)) > + set_scalar_rtx_for_aggregate_access (val, DECL_RTL (res)); > + } > +} > + > /* Return an expression tree corresponding to the RHS of GIMPLE > statement STMT. */ > > @@ -3778,7 +4274,8 @@ expand_return (tree retval) > > /* If we are returning the RESULT_DECL, then the value has already > been stored into it, so we don't have to do anything special. */ > - if (TREE_CODE (retval_rhs) == RESULT_DECL) > + if (TREE_CODE (retval_rhs) == RESULT_DECL > + || get_scalar_rtx_for_aggregate_expr (retval_rhs)) > expand_value_return (result_rtl); > > /* If the result is an aggregate that is being returned in one (or more) > @@ -4422,6 +4919,9 @@ expand_debug_expr (tree exp) > int unsignedp = TYPE_UNSIGNED (TREE_TYPE (exp)); > addr_space_t as; > scalar_int_mode op0_mode, op1_mode, addr_mode; > + rtx x = get_scalar_rtx_for_aggregate_expr (exp); > + if (x) > + return NULL_RTX;/* optimized out. */ > > switch (TREE_CODE_CLASS (TREE_CODE (exp))) > { > @@ -6620,6 +7120,8 @@ pass_expand::execute (function *fun) > avoid_deep_ter_for_debug (gsi_stmt (gsi), 0); > } > > + prepare_expander_sra (); > + > /* Mark arrays indexed with non-constant indices with TREE_ADDRESSABLE. 
*/ > auto_bitmap forced_stack_vars; > discover_nonconstant_array_refs (forced_stack_vars); > @@ -7052,6 +7554,7 @@ pass_expand::execute (function *fun) > loop_optimizer_finalize (); > } > > + free_expander_sra (); > timevar_pop (TV_POST_EXPAND); > > return 0; > diff --git a/gcc/expr.cc b/gcc/expr.cc > index fff09dc9951..d487fe3b53b 100644 > --- a/gcc/expr.cc > +++ b/gcc/expr.cc > @@ -100,6 +100,7 @@ static void do_tablejump (rtx, machine_mode, rtx, rtx, > rtx, > static rtx const_vector_from_tree (tree); > static tree tree_expr_size (const_tree); > static void convert_mode_scalar (rtx, rtx, int); > +rtx get_scalar_rtx_for_aggregate_expr (tree); > > > /* This is run to set up which modes can be used > @@ -5623,11 +5624,12 @@ expand_assignment (tree to, tree from, bool > nontemporal) > Assignment of an array element at a constant index, and assignment of > an array element in an unaligned packed structure field, has the same > problem. Same for (partially) storing into a non-memory object. */ > - if (handled_component_p (to) > - || (TREE_CODE (to) == MEM_REF > - && (REF_REVERSE_STORAGE_ORDER (to) > - || mem_ref_refers_to_non_mem_p (to))) > - || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE) > + if (!get_scalar_rtx_for_aggregate_expr (to) > + && (handled_component_p (to) > + || (TREE_CODE (to) == MEM_REF > + && (REF_REVERSE_STORAGE_ORDER (to) > + || mem_ref_refers_to_non_mem_p (to))) > + || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE)) > { > machine_mode mode1; > poly_int64 bitsize, bitpos; > @@ -9011,6 +9013,9 @@ expand_expr_real (tree exp, rtx target, machine_mode > tmode, > ret = CONST0_RTX (tmode); > return ret ? 
ret : const0_rtx; > } > + rtx x = get_scalar_rtx_for_aggregate_expr (exp); > + if (x) > + return x; > > ret = expand_expr_real_1 (exp, target, tmode, modifier, alt_rtl, > inner_reference_p); > diff --git a/gcc/function.cc b/gcc/function.cc > index dd2c1136e07..7fe927bd36b 100644 > --- a/gcc/function.cc > +++ b/gcc/function.cc > @@ -2740,6 +2740,9 @@ assign_parm_find_stack_rtl (tree parm, struct > assign_parm_data_one *data) > data->stack_parm = stack_parm; > } > > +extern void set_scalar_rtx_for_aggregate_access (tree, rtx); > +extern void set_scalar_rtx_for_returns (); > + > /* A subroutine of assign_parms. Adjust DATA->ENTRY_RTL such that it's > always valid and contiguous. */ > > @@ -3115,8 +3118,24 @@ assign_parm_setup_block (struct assign_parm_data_all > *all, > emit_move_insn (mem, entry_parm); > } > else > - move_block_from_reg (REGNO (entry_parm), mem, > - size_stored / UNITS_PER_WORD); > + { > + int regno = REGNO (entry_parm); > + int nregs = size_stored / UNITS_PER_WORD; > + move_block_from_reg (regno, mem, nregs); > + > + rtx *tmps = XALLOCAVEC (rtx, nregs); > + machine_mode mode = word_mode; > + HOST_WIDE_INT word_size = GET_MODE_SIZE (mode).to_constant (); > + for (int i = 0; i < nregs; i++) > + { > + rtx reg = gen_rtx_REG (mode, regno + i); > + rtx off = GEN_INT (word_size * i); > + tmps[i] = gen_rtx_EXPR_LIST (VOIDmode, reg, off); > + } > + > + rtx regs = gen_rtx_PARALLEL (BLKmode, gen_rtvec_v (nregs, tmps)); > + set_scalar_rtx_for_aggregate_access (parm, regs); > + } > } > else if (data->stack_parm == 0 && !TYPE_EMPTY_P (data->arg.type)) > { > @@ -3716,6 +3735,10 @@ assign_parms (tree fndecl) > else > set_decl_incoming_rtl (parm, data.entry_parm, false); > > + rtx incoming = DECL_INCOMING_RTL (parm); > + if (GET_CODE (incoming) == PARALLEL) > + set_scalar_rtx_for_aggregate_access (parm, incoming); > + > assign_parm_adjust_stack_rtl (&data); > > if (assign_parm_setup_block_p (&data)) > @@ -5136,6 +5159,7 @@ expand_function_start (tree subr) > { > 
gcc_assert (GET_CODE (hard_reg) == PARALLEL); > set_parm_rtl (res, gen_group_rtx (hard_reg)); > + set_scalar_rtx_for_returns (); > } > } > > diff --git a/gcc/testsuite/g++.target/powerpc/pr102024.C > b/gcc/testsuite/g++.target/powerpc/pr102024.C > index 769585052b5..c8995cae707 100644 > --- a/gcc/testsuite/g++.target/powerpc/pr102024.C > +++ b/gcc/testsuite/g++.target/powerpc/pr102024.C > @@ -5,7 +5,7 @@ > // Test that a zero-width bit field in an otherwise homogeneous aggregate > // generates a psabi warning and passes arguments in GPRs. > > -// { dg-final { scan-assembler-times {\mstd\M} 4 } } > +// { dg-final { scan-assembler-times {\mmtvsrd\M} 4 } } > > struct a_thing > { > diff --git a/gcc/testsuite/gcc.target/powerpc/pr108073.c > b/gcc/testsuite/gcc.target/powerpc/pr108073.c > new file mode 100644 > index 00000000000..7dd1a4a326a > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/pr108073.c > @@ -0,0 +1,29 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -save-temps" } */ > + > +typedef struct DF {double a[4]; short s1; short s2; short s3; short s4; } DF; > +typedef struct SF {float a[4]; int i1; int i2; } SF; > + > +/* { dg-final { scan-assembler-times {\mmtvsrd\M} 3 {target { has_arch_ppc64 > && has_arch_pwr8 } } } } */ > +/* { dg-final { scan-assembler-not {\mlwz\M} {target { has_arch_ppc64 && > has_arch_pwr8 } } } } */ > +/* { dg-final { scan-assembler-not {\mlhz\M} {target { has_arch_ppc64 && > has_arch_pwr8 } } } } */ > +short __attribute__ ((noipa)) foo_hi (DF a, int flag){if (flag == 2)return > a.s2+a.s3;return 0;} > +int __attribute__ ((noipa)) foo_si (SF a, int flag){if (flag == 2)return > a.i2+a.i1;return 0;} > +double __attribute__ ((noipa)) foo_df (DF arg, int flag){if (flag == > 2)return arg.a[3];else return 0.0;} > +float __attribute__ ((noipa)) foo_sf (SF arg, int flag){if (flag == > 2)return arg.a[2]; return 0;} > +float __attribute__ ((noipa)) foo_sf1 (SF arg, int flag){if (flag == > 2)return arg.a[1];return 0;} > + > +DF gdf = 
{{1.0,2.0,3.0,4.0}, 1, 2, 3, 4}; > +SF gsf = {{1.0f,2.0f,3.0f,4.0f}, 1, 2}; > + > +int main() > +{ > + if (!(foo_hi (gdf, 2) == 5 && foo_si (gsf, 2) == 3 && foo_df (gdf, 2) == > 4.0 > + && foo_sf (gsf, 2) == 3.0 && foo_sf1 (gsf, 2) == 2.0)) > + __builtin_abort (); > + if (!(foo_hi (gdf, 1) == 0 && foo_si (gsf, 1) == 0 && foo_df (gdf, 1) == 0 > + && foo_sf (gsf, 1) == 0 && foo_sf1 (gsf, 1) == 0)) > + __builtin_abort (); > + return 0; > +} > + > diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-1.c > b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c > new file mode 100644 > index 00000000000..4e1f87f7939 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c > @@ -0,0 +1,6 @@ > +/* PR target/65421 */ > +/* { dg-options "-O2" } */ > + > +typedef struct LARGE {double a[4]; int arr[32];} LARGE; > +LARGE foo (LARGE a){return a;} > +/* { dg-final { scan-assembler-times {\mmemcpy\M} 1 } } */ > diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-2.c > b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c > new file mode 100644 > index 00000000000..8a8e1a0e996 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c > @@ -0,0 +1,32 @@ > +/* PR target/65421 */ > +/* { dg-options "-O2" } */ > +/* { dg-require-effective-target powerpc_elfv2 } */ > +/* { dg-require-effective-target has_arch_ppc64 } */ > + > +typedef struct FLOATS > +{ > + double a[3]; > +} FLOATS; > + > +/* 3 lfd after returns also optimized */ > +/* FLOATS ret_arg_pt (FLOATS *a){return *a;} */ > + > +/* 3 stfd */ > +void st_arg (FLOATS a, FLOATS *p) {*p = a;} > +/* { dg-final { scan-assembler-times {\mstfd\M} 3 } } */ > + > +/* blr */ > +FLOATS ret_arg (FLOATS a) {return a;} > + > +typedef struct MIX > +{ > + double a[2]; > + long l; > +} MIX; > + > +/* std 3 param regs to return slot */ > +MIX ret_arg1 (MIX a) {return a;} > +/* { dg-final { scan-assembler-times {\mstd\M} 3 } } */ > + > +/* count insns */ > +/* { dg-final { scan-assembler-times {(?n)^\s+[a-z]} 9 } } */ > >> >> Richard. 
>> >>> >>> BR, >>> Jeff (Jiufu Guo) >>> >>> >>> Jiufu Guo via Gcc-patches <gcc-patches@gcc.gnu.org> writes: >>> >>> > Hi Martin, >>> > >>> > Jiufu Guo via Gcc-patches <gcc-patches@gcc.gnu.org> writes: >>> > >>> >> Hi, >>> >> >>> >> Martin Jambor <mjam...@suse.cz> writes: >>> >> >>> >>> Hi, >>> >>> >>> >>> On Tue, May 30 2023, Richard Biener wrote: >>> >>>> On Mon, 29 May 2023, Jiufu Guo wrote: >>> >>>> >>> >>>>> Hi, >>> >>>>> >>> >>>>> Previously, I was investigating some struct parameters and returns >>> >>>>> related >>> >>>>> PRs 69143/65421/108073. >>> >>>>> >>> >>>>> Investigating the issues case by case, and drafting patches for each >>> >>>>> of >>> >>>>> them one by one. This would help us to enhance code incrementally. >>> >>>>> While, this way, patches would interact with each other and implement >>> >>>>> different codes for similar issues (because of the different paths in >>> >>>>> gimple/rtl). We may have a common fix for those issues. >>> >>>>> >>> >>>>> We know a few other related PRs(such as meta-bug PR101926) exist. For >>> >>>>> those >>> >>>>> PRs in different targets with different symptoms (and also different >>> >>>>> root >>> >>>>> cause), I would expect a method could help some of them, but it may >>> >>>>> be hard to handle all of them in one fix. >>> >>>>> >>> >>>>> With investigation and check discussion for the issues, I remember a >>> >>>>> suggestion from Richard: it would be nice to perform some SRA-like >>> >>>>> analysis >>> >>>>> for the accesses on the structs (parameter/returns). >>> >>>>> https://gcc.gnu.org/pipermail/gcc-patches/2022-November/605117.html >>> >>>>> This may be a 'fairly common method' for those issues. With this idea, >>> >>>>> I drafted a patch as below in this mail. >>> >>>>> >>> >>>>> I also thought about directly using tree-sra.cc, e.g. enhance it and >>> >>>>> rerun it >>> >>>>> at the end of GIMPLE passes. 
While since some issues are introduced >>> >>>>> inside >>> >>>>> the expander, so below patch also co-works with other parts of the >>> >>>>> expander. >>> >>>>> And since we already have tree-sra in gimple pass, we only need to >>> >>>>> take more >>> >>>>> care on parameter and return in this patch: other decls could be >>> >>>>> handled >>> >>>>> well in tree-sra. >>> >>>>> >>> >>>>> The steps of this patch are: >>> >>>>> 1. Collect struct type parameters and returns, and then scan the >>> >>>>> function to >>> >>>>> get the accesses on them. And figure out the accesses which would be >>> >>>>> profitable >>> >>>>> to be scalarized (using registers of the parameter/return ). Now, >>> >>>>> reading on >>> >>>>> parameter and writing on returns are checked in the current patch. >>> >>>>> 2. When/after the scalar registers are determined/expanded for the >>> >>>>> return or >>> >>>>> parameters, compute the corresponding scalar register(s) for each >>> >>>>> accesses of >>> >>>>> the return/parameter, and prepare the scalar RTLs for those accesses. >>> >>>>> 3. When using/expanding the accesses expression, leverage the >>> >>>>> computed/prepared >>> >>>>> scalars directly. >>> >>>>> >>> >>>>> This patch is tested on ppc64 both LE and BE. >>> >>>>> To continue, I would ask for comments and suggestions first. And then >>> >>>>> I would >>> >>>>> update/enhance accordingly. Thanks in advance! >>> >>>> >>> >>>> Thanks for working on this - the description above sounds exactly like >>> >>>> what should be done. >>> >>>> >>> >>>> Now - I'd like the code to re-use the access tree data structure from >>> >>>> SRA plus at least the worker creating the accesses from a stmt. >>> >>> >>> > >>> > I'm thinking about which part of the code can be re-used from >>> > ipa-sra and tree-sra. >>> > It seems there are some similar concepts between them: >>> > "access with offset/size", "collect and check candidates", >>> > "analyze accesses"... 
>>> > >>> > However, because the purposes are different, the logic and behavior >>> > between them (ipa-sra, tree-sra, and expander-sra) are different, >>> > even for similar concepts. >>> > >>> > The same behavior and similar concept may be reusable. The list below >>> > may be part of them. >>> > *. allocate and maintain access >>> > basic access structure: offset, size, reverse >>> > *. type or expr checking >>> > *. disqualify >>> > *. scan and build expr access >>> > *. scan and walk stmts (return/assign/call/asm) >>> > *. collect candidates >>> > *. initialize/deinitialize >>> > *. access dump >>> > >>> > There are different behaviors for a similar concept. >>> > For example: >>> > *. Access has grg/queues in tree-sra, access has nonarg in ipa-sra, >>> > and expander-sra does not check access's child/sibling yet. >>> > *. for same stmt(assign/call), different sra checks different logic. >>> > *. candidates have different checking logic: ipa-sra checks more stuff. >>> > >>> > Does this align with your thoughts? Thanks for your comments! >>> > >>> > BR, >>> > Jeff (Jiufu Guo) >>> > >>> >> Thanks Martin for your reply and thanks for your time! >>> >> >>> >>> I have had a first look at the patch but still need to look into it more >>> >>> to understand how it uses the information it gathers. >>> >>> >>> >>> My plan is to make the access-tree infrastructure of IPA-SRA more >>> >>> generic and hopefully usable even for this purpose, rather than the one >>> >>> in tree-sra.cc. But that really builds a tree of accesses, bailing out >>> >>> on any partial overlaps, for example, which may not be the right thing >>> >>> here since I don't see any tree-building here. >>> >> >>> >> Yeap, both in tree-sra and ipa-sra, there are concepts about >>> >> "access" and "scan functions/stmts". In this light-sra, these concepts >>> >> are also used. And you may notice that ipa-sra and tree-sra have more >>> >> logic than the current 'light-expand-sra'. 
>>> >> >>> >> Currently, the 'light-expand-sra' just takes care few things: reading >>> >> from parameter, writing to returns, and disabling sra if address-taken. >>> >> As you notice, now the "access" in this patch is not in a 'tree-struct', >>> >> it is just a 'flat' (or say map & vector). And overlaps between >>> >> accesses are not checked because they are all just reading (for parm). >>> >> >>> >> When we take care of more stuff: passing to call argument, occur in >>> >> memory assignment, occur in line asm... This light-expander-sra would be >>> >> more and more like tee-sra and ipa-sra. And it would be good to leverage >>> >> more capabilities from tree-sra and ipa-sra. So, I agree that it would be >>> >> a great idea to share and reuse the same struct. >>> >> >>> >>> But I still need to >>> >>> properly read set_scalar_rtx_for_aggregate_access function in the patch, >>> >>> which I plan to do next week. >>> >> >>> >> set_scalar_rtx_for_aggregate_access is another key part of this patch. >>> >> Different from tree-sra/ipa-sra (which creates new scalars SSA for each >>> >> access), this patch invokes "set_scalar_rtx_for_aggregate_access" to >>> >> create an rtx expression for each access. Now, this part may not common >>> >> with tree-sra and ipa-sra. >>> >> >>> >> This function is invoked for each parameter if the parameter is >>> >> aggregate type and passed via registers. >>> >> For each access about this parameter, the function creates an rtx >>> >> according to the offset/size/mode of the access. The created rtx maybe: >>> >> 1. one rtx pseudo corresponds to an incoming reg, >>> >> 2. one rtx pseudo which is assigned by a part of incoming reg after >>> >> shift and mode adjust, >>> >> 3. a parallel rtx contains a few rtx pseudos corresponding to the >>> >> incoming registers. >>> >> For return, only 1 and 3 are ok. 
>>> >> >>> >> BR, >>> >> Jeff (Jiufu Guo) >>> >> >>> >>> >>> >>> Thanks, >>> >>> >>> >>> Martin >>> >>> >>> >>>> >>> >>>> The RTL expansion code already does a sweep over stmts in >>> >>>> discover_nonconstant_array_refs which makes sure RTL expansion doesn't >>> >>>> scalarize (aka assign non-stack) to variables which have accesses >>> >>>> that would later eventually FAIL to expand when operating on registers. >>> >>>> That's very much related to the task at hand so we should try to >>> >>>> at least merge the CFG walks of both (it produces a forced_stack_vars >>> >>>> bitmap). >>> >>>> >>> >>>> Can you work together with Martin to split out the access tree >>> >>>> data structure and share it? >>> >>>> >>> >>>> I didn't look in detail as of how you make use of the information >>> >>>> yet. >>> >>>> >>> >>>> Thanks, >>> >>>> Richard. >>> >>>> >>> >>>>> >>> >>>>> BR, >>> >>>>> Jeff (Jiufu) >>> >>>>> >>> >>>>> >>> >>>>> --- >>> >>>>> gcc/cfgexpand.cc | 567 >>> >>>>> ++++++++++++++++++- >>> >>>>> gcc/expr.cc | 15 +- >>> >>>>> gcc/function.cc | 26 +- >>> >>>>> gcc/opts.cc | 8 +- >>> >>>>> gcc/testsuite/g++.target/powerpc/pr102024.C | 2 +- >>> >>>>> gcc/testsuite/gcc.target/powerpc/pr108073.c | 29 + >>> >>>>> gcc/testsuite/gcc.target/powerpc/pr65421-1.c | 6 + >>> >>>>> gcc/testsuite/gcc.target/powerpc/pr65421-2.c | 32 ++ >>> >>>>> 8 files changed, 675 insertions(+), 10 deletions(-) >>> >>>>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108073.c >>> >>>>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421-1.c >>> >>>>> create mode 100644 gcc/testsuite/gcc.target/powerpc/pr65421-2.c >>> >>>>> >>> >>>>> diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc >>> >>>>> index 85a93a547c0..95c29b6b6fe 100644 >>> >>>>> --- a/gcc/cfgexpand.cc >>> >>>>> +++ b/gcc/cfgexpand.cc >>> >>>>> @@ -97,6 +97,564 @@ static bool defer_stack_allocation (tree, bool); >>> >>>>> >>> >>>>> static void record_alignment_for_reg_var (unsigned int); >>> >>>>> >>> >>>>> +/* For light SRA 
in expander about paramaters and returns. */ >>> >>>>> +namespace { >>> >>>>> + >>> >>>>> +struct access >>> >>>>> +{ >>> >>>>> + /* Each accessing on the aggragate is about OFFSET/SIZE and BASE. >>> >>>>> */ >>> >>>>> + HOST_WIDE_INT offset; >>> >>>>> + HOST_WIDE_INT size; >>> >>>>> + tree base; >>> >>>>> + bool writing; >>> >>>>> + >>> >>>>> + /* The context expression of this access. */ >>> >>>>> + tree expr; >>> >>>>> + >>> >>>>> + /* The rtx for the access: link to incoming/returning register(s). >>> >>>>> */ >>> >>>>> + rtx rtx_val; >>> >>>>> +}; >>> >>>>> + >>> >>>>> +typedef struct access *access_p; >>> >>>>> + >>> >>>>> +/* Expr (tree) -> Acess (access_p) map. */ >>> >>>>> +static hash_map<tree, access_p> *expr_access_vec; >>> >>>>> + >>> >>>>> +/* Base (tree) -> Vector (vec<access_p> *) map. */ >>> >>>>> +static hash_map<tree, auto_vec<access_p> > *base_access_vec; >>> >>>>> + >>> >>>>> +/* Return a vector of pointers to accesses for the variable given in >>> >>>>> BASE or >>> >>>>> + NULL if there is none. */ >>> >>>>> + >>> >>>>> +static vec<access_p> * >>> >>>>> +get_base_access_vector (tree base) >>> >>>>> +{ >>> >>>>> + return base_access_vec->get (base); >>> >>>>> +} >>> >>>>> + >>> >>>>> +/* Remove DECL from candidates for SRA. */ >>> >>>>> +static void >>> >>>>> +disqualify_candidate (tree decl) >>> >>>>> +{ >>> >>>>> + decl = get_base_address (decl); >>> >>>>> + base_access_vec->remove (decl); >>> >>>>> +} >>> >>>>> + >>> >>>>> +/* Create and insert access for EXPR. Return created access, or NULL >>> >>>>> if it is >>> >>>>> + not possible. 
*/ >>> >>>>> +static struct access * >>> >>>>> +create_access (tree expr, bool write) >>> >>>>> +{ >>> >>>>> + poly_int64 poffset, psize, pmax_size; >>> >>>>> + bool reverse; >>> >>>>> + >>> >>>>> + tree base >>> >>>>> + = get_ref_base_and_extent (expr, &poffset, &psize, &pmax_size, >>> >>>>> &reverse); >>> >>>>> + >>> >>>>> + if (!DECL_P (base)) >>> >>>>> + return NULL; >>> >>>>> + >>> >>>>> + vec<access_p> *access_vec = get_base_access_vector (base); >>> >>>>> + if (!access_vec) >>> >>>>> + return NULL; >>> >>>>> + >>> >>>>> + /* TODO: support reverse. */ >>> >>>>> + if (reverse) >>> >>>>> + { >>> >>>>> + disqualify_candidate (expr); >>> >>>>> + return NULL; >>> >>>>> + } >>> >>>>> + >>> >>>>> + HOST_WIDE_INT offset, size, max_size; >>> >>>>> + if (!poffset.is_constant (&offset) || !psize.is_constant (&size) >>> >>>>> + || !pmax_size.is_constant (&max_size)) >>> >>>>> + return NULL; >>> >>>>> + >>> >>>>> + if (size != max_size || size == 0 || offset < 0 || size < 0 >>> >>>>> + || offset + size > tree_to_shwi (DECL_SIZE (base))) >>> >>>>> + return NULL; >>> >>>>> + >>> >>>>> + struct access *access = XNEWVEC (struct access, 1); >>> >>>>> + >>> >>>>> + memset (access, 0, sizeof (struct access)); >>> >>>>> + access->base = base; >>> >>>>> + access->offset = offset; >>> >>>>> + access->size = size; >>> >>>>> + access->expr = expr; >>> >>>>> + access->writing = write; >>> >>>>> + access->rtx_val = NULL_RTX; >>> >>>>> + >>> >>>>> + access_vec->safe_push (access); >>> >>>>> + >>> >>>>> + return access; >>> >>>>> +} >>> >>>>> + >>> >>>>> +/* Return true if VAR is a candidate for SRA. 
*/ >>> >>>>> +static bool >>> >>>>> +add_sra_candidate (tree var) >>> >>>>> +{ >>> >>>>> + tree type = TREE_TYPE (var); >>> >>>>> + >>> >>>>> + if (!AGGREGATE_TYPE_P (type) || TREE_THIS_VOLATILE (var) >>> >>>>> + || !COMPLETE_TYPE_P (type) || !tree_fits_shwi_p (TYPE_SIZE >>> >>>>> (type)) >>> >>>>> + || tree_to_shwi (TYPE_SIZE (type)) == 0 >>> >>>>> + || TYPE_MAIN_VARIANT (type) == TYPE_MAIN_VARIANT >>> >>>>> (va_list_type_node)) >>> >>>>> + return false; >>> >>>>> + >>> >>>>> + base_access_vec->get_or_insert (var); >>> >>>>> + >>> >>>>> + return true; >>> >>>>> +} >>> >>>>> + >>> >>>>> +/* Callback of walk_stmt_load_store_addr_ops visit_addr used to >>> >>>>> remove >>> >>>>> + operands with address taken. */ >>> >>>>> +static tree >>> >>>>> +visit_addr (tree *tp, int *, void *) >>> >>>>> +{ >>> >>>>> + tree op = *tp; >>> >>>>> + if (op && DECL_P (op)) >>> >>>>> + disqualify_candidate (op); >>> >>>>> + >>> >>>>> + return NULL; >>> >>>>> +} >>> >>>>> + >>> >>>>> +/* Scan expression EXPR and create access structures for all >>> >>>>> accesses to >>> >>>>> + candidates for scalarization. Return the created access or NULL >>> >>>>> if none is >>> >>>>> + created. 
*/ >>> >>>>> +static struct access * >>> >>>>> +build_access_from_expr (tree expr, bool write) >>> >>>>> +{ >>> >>>>> + if (TREE_CODE (expr) == VIEW_CONVERT_EXPR) >>> >>>>> + expr = TREE_OPERAND (expr, 0); >>> >>>>> + >>> >>>>> + if (TREE_CODE (expr) == BIT_FIELD_REF || storage_order_barrier_p >>> >>>>> (expr) >>> >>>>> + || TREE_THIS_VOLATILE (expr)) >>> >>>>> + { >>> >>>>> + disqualify_candidate (expr); >>> >>>>> + return NULL; >>> >>>>> + } >>> >>>>> + >>> >>>>> + switch (TREE_CODE (expr)) >>> >>>>> + { >>> >>>>> + case MEM_REF: { >>> >>>>> + tree op = TREE_OPERAND (expr, 0); >>> >>>>> + if (TREE_CODE (op) == ADDR_EXPR) >>> >>>>> + disqualify_candidate (TREE_OPERAND (op, 0)); >>> >>>>> + break; >>> >>>>> + } >>> >>>>> + case ADDR_EXPR: >>> >>>>> + case IMAGPART_EXPR: >>> >>>>> + case REALPART_EXPR: >>> >>>>> + disqualify_candidate (TREE_OPERAND (expr, 0)); >>> >>>>> + break; >>> >>>>> + case VAR_DECL: >>> >>>>> + case PARM_DECL: >>> >>>>> + case RESULT_DECL: >>> >>>>> + case COMPONENT_REF: >>> >>>>> + case ARRAY_REF: >>> >>>>> + case ARRAY_RANGE_REF: >>> >>>>> + return create_access (expr, write); >>> >>>>> + break; >>> >>>>> + default: >>> >>>>> + break; >>> >>>>> + } >>> >>>>> + >>> >>>>> + return NULL; >>> >>>>> +} >>> >>>>> + >>> >>>>> +/* Scan function and look for interesting expressions and create >>> >>>>> access >>> >>>>> + structures for them. 
*/ >>> >>>>> +static void >>> >>>>> +scan_function (void) >>> >>>>> +{ >>> >>>>> + basic_block bb; >>> >>>>> + >>> >>>>> + FOR_EACH_BB_FN (bb, cfun) >>> >>>>> + { >>> >>>>> + for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi); >>> >>>>> + gsi_next (&gsi)) >>> >>>>> + { >>> >>>>> + gphi *phi = gsi.phi (); >>> >>>>> + for (size_t i = 0; i < gimple_phi_num_args (phi); i++) >>> >>>>> + { >>> >>>>> + tree t = gimple_phi_arg_def (phi, i); >>> >>>>> + walk_tree (&t, visit_addr, NULL, NULL); >>> >>>>> + } >>> >>>>> + } >>> >>>>> + >>> >>>>> + for (gimple_stmt_iterator gsi = >>> >>>>> gsi_start_nondebug_after_labels_bb (bb); >>> >>>>> + !gsi_end_p (gsi); gsi_next_nondebug (&gsi)) >>> >>>>> + { >>> >>>>> + gimple *stmt = gsi_stmt (gsi); >>> >>>>> + switch (gimple_code (stmt)) >>> >>>>> + { >>> >>>>> + case GIMPLE_RETURN: { >>> >>>>> + tree r = gimple_return_retval (as_a<greturn *> (stmt)); >>> >>>>> + if (r && VAR_P (r) && r != DECL_RESULT >>> >>>>> (current_function_decl)) >>> >>>>> + build_access_from_expr (r, true); >>> >>>>> + } >>> >>>>> + break; >>> >>>>> + case GIMPLE_ASSIGN: >>> >>>>> + if (gimple_assign_single_p (stmt) && !gimple_clobber_p >>> >>>>> (stmt)) >>> >>>>> + { >>> >>>>> + tree lhs = gimple_assign_lhs (stmt); >>> >>>>> + tree rhs = gimple_assign_rhs1 (stmt); >>> >>>>> + if (TREE_CODE (rhs) == CONSTRUCTOR) >>> >>>>> + disqualify_candidate (lhs); >>> >>>>> + else >>> >>>>> + { >>> >>>>> + build_access_from_expr (rhs, false); >>> >>>>> + build_access_from_expr (lhs, true); >>> >>>>> + } >>> >>>>> + } >>> >>>>> + break; >>> >>>>> + default: >>> >>>>> + walk_gimple_op (stmt, visit_addr, NULL); >>> >>>>> + break; >>> >>>>> + } >>> >>>>> + } >>> >>>>> + } >>> >>>>> +} >>> >>>>> + >>> >>>>> +/* Collect the parameter and returns with type which is suitable for >>> >>>>> + * scalarization. */ >>> >>>>> +static bool >>> >>>>> +collect_light_sra_candidates (void) >>> >>>>> +{ >>> >>>>> + bool ret = false; >>> >>>>> + >>> >>>>> + /* Collect parameters. 
*/ >>> >>>>> + for (tree parm = DECL_ARGUMENTS (current_function_decl); parm; >>> >>>>> + parm = DECL_CHAIN (parm)) >>> >>>>> + ret |= add_sra_candidate (parm); >>> >>>>> + >>> >>>>> + /* Collect VARs on returns. */ >>> >>>>> + if (DECL_RESULT (current_function_decl)) >>> >>>>> + { >>> >>>>> + edge_iterator ei; >>> >>>>> + edge e; >>> >>>>> + FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) >>> >>>>> + if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb >>> >>>>> (e->src))) >>> >>>>> + { >>> >>>>> + tree val = gimple_return_retval (r); >>> >>>>> + if (val && VAR_P (val)) >>> >>>>> + ret |= add_sra_candidate (val); >>> >>>>> + } >>> >>>>> + } >>> >>>>> + >>> >>>>> + return ret; >>> >>>>> +} >>> >>>>> + >>> >>>>> +/* Now, only scalarize the parms only with reading >>> >>>>> + or returns only with writing. */ >>> >>>>> +bool >>> >>>>> +check_access_vec (tree const &base, auto_vec<access_p> const >>> >>>>> &access_vec, >>> >>>>> + auto_vec<tree> *unqualify_vec) >>> >>>>> +{ >>> >>>>> + bool read = false; >>> >>>>> + bool write = false; >>> >>>>> + for (unsigned int j = 0; j < access_vec.length (); j++) >>> >>>>> + { >>> >>>>> + struct access *access = access_vec[j]; >>> >>>>> + if (access->writing) >>> >>>>> + write = true; >>> >>>>> + else >>> >>>>> + read = true; >>> >>>>> + >>> >>>>> + if (write && read) >>> >>>>> + break; >>> >>>>> + } >>> >>>>> + if ((write && read) || (!write && !read)) >>> >>>>> + unqualify_vec->safe_push (base); >>> >>>>> + >>> >>>>> + return true; >>> >>>>> +} >>> >>>>> + >>> >>>>> +/* Analyze all the accesses, remove those inprofitable candidates. >>> >>>>> + And build the expr->access map. 
*/ >>> >>>>> +static void >>> >>>>> +analyze_accesses () >>> >>>>> +{ >>> >>>>> + auto_vec<tree> unqualify_vec; >>> >>>>> + base_access_vec->traverse<auto_vec<tree> *, check_access_vec> ( >>> >>>>> + &unqualify_vec); >>> >>>>> + >>> >>>>> + tree base; >>> >>>>> + unsigned i; >>> >>>>> + FOR_EACH_VEC_ELT (unqualify_vec, i, base) >>> >>>>> + disqualify_candidate (base); >>> >>>>> +} >>> >>>>> + >>> >>>>> +static void >>> >>>>> +prepare_expander_sra () >>> >>>>> +{ >>> >>>>> + if (optimize <= 0) >>> >>>>> + return; >>> >>>>> + >>> >>>>> + base_access_vec = new hash_map<tree, auto_vec<access_p> >; >>> >>>>> + expr_access_vec = new hash_map<tree, access_p>; >>> >>>>> + >>> >>>>> + if (collect_light_sra_candidates ()) >>> >>>>> + { >>> >>>>> + scan_function (); >>> >>>>> + analyze_accesses (); >>> >>>>> + } >>> >>>>> +} >>> >>>>> + >>> >>>>> +static void >>> >>>>> +free_expander_sra () >>> >>>>> +{ >>> >>>>> + if (optimize <= 0 || !expr_access_vec) >>> >>>>> + return; >>> >>>>> + delete expr_access_vec; >>> >>>>> + expr_access_vec = 0; >>> >>>>> + delete base_access_vec; >>> >>>>> + base_access_vec = 0; >>> >>>>> +} >>> >>>>> +} /* namespace */ >>> >>>>> + >>> >>>>> +/* Check If there is an sra access for the expr. >>> >>>>> + Return the correspond scalar sym for the access. */ >>> >>>>> +rtx >>> >>>>> +get_scalar_rtx_for_aggregate_expr (tree expr) >>> >>>>> +{ >>> >>>>> + if (!expr_access_vec) >>> >>>>> + return NULL_RTX; >>> >>>>> + access_p *access = expr_access_vec->get (expr); >>> >>>>> + return access ? (*access)->rtx_val : NULL_RTX; >>> >>>>> +} >>> >>>>> + >>> >>>>> +extern rtx >>> >>>>> +expand_shift (enum tree_code, machine_mode, rtx, poly_int64, rtx, >>> >>>>> int); >>> >>>>> + >>> >>>>> +/* Compute/Set RTX registers for those accesses on BASE. 
*/ >>> >>>>> +void >>> >>>>> +set_scalar_rtx_for_aggregate_access (tree base, rtx regs) >>> >>>>> +{ >>> >>>>> + if (!base_access_vec) >>> >>>>> + return; >>> >>>>> + vec<access_p> *access_vec = get_base_access_vector (base); >>> >>>>> + if (!access_vec) >>> >>>>> + return; >>> >>>>> + >>> >>>>> + /* Go through each access, compute corresponding rtx(regs or >>> >>>>> subregs) >>> >>>>> + for the expression. */ >>> >>>>> + int n = access_vec->length (); >>> >>>>> + int cur_access_index = 0; >>> >>>>> + for (; cur_access_index < n; cur_access_index++) >>> >>>>> + { >>> >>>>> + access_p acc = (*access_vec)[cur_access_index]; >>> >>>>> + machine_mode expr_mode = TYPE_MODE (TREE_TYPE (acc->expr)); >>> >>>>> + /* non BLK in mult registers*/ >>> >>>>> + if (expr_mode != BLKmode >>> >>>>> + && known_gt (acc->size, GET_MODE_BITSIZE (word_mode))) >>> >>>>> + break; >>> >>>>> + >>> >>>>> + int start_index = -1; >>> >>>>> + int end_index = -1; >>> >>>>> + HOST_WIDE_INT left_margin_bits = 0; >>> >>>>> + HOST_WIDE_INT right_margin_bits = 0; >>> >>>>> + int cur_index = XEXP (XVECEXP (regs, 0, 0), 0) ? 0 : 1; >>> >>>>> + for (; cur_index < XVECLEN (regs, 0); cur_index++) >>> >>>>> + { >>> >>>>> + rtx slot = XVECEXP (regs, 0, cur_index); >>> >>>>> + HOST_WIDE_INT off = UINTVAL (XEXP (slot, 1)) * BITS_PER_UNIT; >>> >>>>> + HOST_WIDE_INT size >>> >>>>> + = GET_MODE_BITSIZE (GET_MODE (XEXP (slot, 0))).to_constant >>> >>>>> (); >>> >>>>> + if (off <= acc->offset && off + size > acc->offset) >>> >>>>> + { >>> >>>>> + start_index = cur_index; >>> >>>>> + left_margin_bits = acc->offset - off; >>> >>>>> + } >>> >>>>> + if (off + size >= acc->offset + acc->size) >>> >>>>> + { >>> >>>>> + end_index = cur_index; >>> >>>>> + right_margin_bits = off + size - (acc->offset + >>> >>>>> acc->size); >>> >>>>> + break; >>> >>>>> + } >>> >>>>> + } >>> >>>>> + /* accessing pading and outof bound. 
*/ >>> >>>>> + if (start_index < 0 || end_index < 0) >>> >>>>> + break; >>> >>>>> + >>> >>>>> + /* Need a parallel for possible multi-registers. */ >>> >>>>> + if (expr_mode == BLKmode || end_index > start_index) >>> >>>>> + { >>> >>>>> + /* Can not support start from middle of a register. */ >>> >>>>> + if (left_margin_bits != 0) >>> >>>>> + break; >>> >>>>> + >>> >>>>> + int len = end_index - start_index + 1; >>> >>>>> + const int margin = 3; /* more space for SI, HI, QI. */ >>> >>>>> + rtx *tmps = XALLOCAVEC (rtx, len + (right_margin_bits ? >>> >>>>> margin : 0)); >>> >>>>> + >>> >>>>> + HOST_WIDE_INT start_off >>> >>>>> + = UINTVAL (XEXP (XVECEXP (regs, 0, start_index), 1)); >>> >>>>> + int pos = 0; >>> >>>>> + for (; pos < len - (right_margin_bits ? 1 : 0); pos++) >>> >>>>> + { >>> >>>>> + int index = start_index + pos; >>> >>>>> + rtx orig_reg = XEXP (XVECEXP (regs, 0, index), 0); >>> >>>>> + machine_mode mode = GET_MODE (orig_reg); >>> >>>>> + rtx reg = NULL_RTX; >>> >>>>> + if (HARD_REGISTER_P (orig_reg)) >>> >>>>> + { >>> >>>>> + /* Reading from param hard reg need to be moved to a >>> >>>>> temp. */ >>> >>>>> + gcc_assert (!acc->writing); >>> >>>>> + reg = gen_reg_rtx (mode); >>> >>>>> + emit_move_insn (reg, orig_reg); >>> >>>>> + } >>> >>>>> + else >>> >>>>> + reg = orig_reg; >>> >>>>> + >>> >>>>> + HOST_WIDE_INT off = UINTVAL (XEXP (XVECEXP (regs, 0, >>> >>>>> index), 1)); >>> >>>>> + tmps[pos] >>> >>>>> + = gen_rtx_EXPR_LIST (mode, reg, GEN_INT (off - >>> >>>>> start_off)); >>> >>>>> + } >>> >>>>> + >>> >>>>> + /* There are some fields are in part of registers. 
*/ >>> >>>>> + if (right_margin_bits != 0) >>> >>>>> + { >>> >>>>> + if (acc->writing) >>> >>>>> + break; >>> >>>>> + >>> >>>>> + gcc_assert ((right_margin_bits % BITS_PER_UNIT) == 0); >>> >>>>> + HOST_WIDE_INT off_byte >>> >>>>> + = UINTVAL (XEXP (XVECEXP (regs, 0, end_index), 1)) - >>> >>>>> start_off; >>> >>>>> + rtx orig_reg = XEXP (XVECEXP (regs, 0, end_index), 0); >>> >>>>> + machine_mode orig_mode = GET_MODE (orig_reg); >>> >>>>> + gcc_assert (GET_MODE_CLASS (orig_mode) == MODE_INT); >>> >>>>> + >>> >>>>> + machine_mode mode_aux[] = {SImode, HImode, QImode}; >>> >>>>> + HOST_WIDE_INT reg_size >>> >>>>> + = GET_MODE_BITSIZE (orig_mode).to_constant (); >>> >>>>> + HOST_WIDE_INT off_bits = 0; >>> >>>>> + for (unsigned long j = 0; >>> >>>>> + j < sizeof (mode_aux) / sizeof (mode_aux[0]); j++) >>> >>>>> + { >>> >>>>> + HOST_WIDE_INT submode_bitsize >>> >>>>> + = GET_MODE_BITSIZE (mode_aux[j]).to_constant (); >>> >>>>> + if (reg_size - right_margin_bits - off_bits >>> >>>>> + >= submode_bitsize) >>> >>>>> + { >>> >>>>> + rtx reg = gen_reg_rtx (orig_mode); >>> >>>>> + emit_move_insn (reg, orig_reg); >>> >>>>> + >>> >>>>> + poly_uint64 lowpart_off >>> >>>>> + = subreg_lowpart_offset (mode_aux[j], >>> >>>>> orig_mode); >>> >>>>> + int lowpart_off_bits >>> >>>>> + = lowpart_off.to_constant () * BITS_PER_UNIT; >>> >>>>> + int shift_bits = lowpart_off_bits >= off_bits >>> >>>>> + ? 
(lowpart_off_bits - off_bits) >>> >>>>> + : (off_bits - >>> >>>>> lowpart_off_bits); >>> >>>>> + if (shift_bits > 0) >>> >>>>> + reg = expand_shift (RSHIFT_EXPR, orig_mode, reg, >>> >>>>> + shift_bits, NULL, 1); >>> >>>>> + rtx subreg = gen_lowpart (mode_aux[j], reg); >>> >>>>> + rtx off = GEN_INT (off_byte); >>> >>>>> + tmps[pos++] >>> >>>>> + = gen_rtx_EXPR_LIST (mode_aux[j], subreg, off); >>> >>>>> + off_byte += submode_bitsize / BITS_PER_UNIT; >>> >>>>> + off_bits += submode_bitsize; >>> >>>>> + } >>> >>>>> + } >>> >>>>> + } >>> >>>>> + >>> >>>>> + /* Currently, PARALLELs with register elements for >>> >>>>> param/returns >>> >>>>> + are using BLKmode. */ >>> >>>>> + acc->rtx_val = gen_rtx_PARALLEL (TYPE_MODE (TREE_TYPE >>> >>>>> (acc->expr)), >>> >>>>> + gen_rtvec_v (pos, tmps)); >>> >>>>> + continue; >>> >>>>> + } >>> >>>>> + >>> >>>>> + /* The access corresponds to one reg. */ >>> >>>>> + if (end_index == start_index && left_margin_bits == 0 >>> >>>>> + && right_margin_bits == 0) >>> >>>>> + { >>> >>>>> + rtx orig_reg = XEXP (XVECEXP (regs, 0, start_index), 0); >>> >>>>> + rtx reg = NULL_RTX; >>> >>>>> + if (HARD_REGISTER_P (orig_reg)) >>> >>>>> + { >>> >>>>> + /* Reading from param hard reg need to be moved to a >>> >>>>> temp. */ >>> >>>>> + gcc_assert (!acc->writing); >>> >>>>> + reg = gen_reg_rtx (GET_MODE (orig_reg)); >>> >>>>> + emit_move_insn (reg, orig_reg); >>> >>>>> + } >>> >>>>> + else >>> >>>>> + reg = orig_reg; >>> >>>>> + if (GET_MODE (orig_reg) != expr_mode) >>> >>>>> + reg = gen_lowpart (expr_mode, reg); >>> >>>>> + >>> >>>>> + acc->rtx_val = reg; >>> >>>>> + continue; >>> >>>>> + } >>> >>>>> + >>> >>>>> + /* It is accessing a filed which is part of a register. */ >>> >>>>> + scalar_int_mode imode; >>> >>>>> + if (!acc->writing && end_index == start_index >>> >>>>> + && int_mode_for_size (acc->size, 1).exists (&imode)) >>> >>>>> + { >>> >>>>> + /* get and copy original register inside the param. 
*/ >>> >>>>> + rtx orig_reg = XEXP (XVECEXP (regs, 0, start_index), 0); >>> >>>>> + machine_mode mode = GET_MODE (orig_reg); >>> >>>>> + gcc_assert (GET_MODE_CLASS (mode) == MODE_INT); >>> >>>>> + rtx reg = gen_reg_rtx (mode); >>> >>>>> + emit_move_insn (reg, orig_reg); >>> >>>>> + >>> >>>>> + /* shift to expect part. */ >>> >>>>> + poly_uint64 lowpart_off = subreg_lowpart_offset (imode, mode); >>> >>>>> + int lowpart_off_bits = lowpart_off.to_constant () * >>> >>>>> BITS_PER_UNIT; >>> >>>>> + int shift_bits = lowpart_off_bits >= left_margin_bits >>> >>>>> + ? (lowpart_off_bits - left_margin_bits) >>> >>>>> + : (left_margin_bits - lowpart_off_bits); >>> >>>>> + if (shift_bits > 0) >>> >>>>> + reg = expand_shift (RSHIFT_EXPR, mode, reg, shift_bits, >>> >>>>> NULL, 1); >>> >>>>> + >>> >>>>> + /* move corresond part subreg to result. */ >>> >>>>> + rtx subreg = gen_lowpart (imode, reg); >>> >>>>> + rtx result = gen_reg_rtx (imode); >>> >>>>> + emit_move_insn (result, subreg); >>> >>>>> + >>> >>>>> + if (expr_mode != imode) >>> >>>>> + result = gen_lowpart (expr_mode, result); >>> >>>>> + >>> >>>>> + acc->rtx_val = result; >>> >>>>> + continue; >>> >>>>> + } >>> >>>>> + >>> >>>>> + break; >>> >>>>> + } >>> >>>>> + >>> >>>>> + /* Some access expr(s) are not scalarized. */ >>> >>>>> + if (cur_access_index != n) >>> >>>>> + disqualify_candidate (base); >>> >>>>> + else >>> >>>>> + { >>> >>>>> + /* Add elements to expr->access map. 
*/ >>> >>>>> + for (int j = 0; j < n; j++) >>> >>>>> + { >>> >>>>> + access_p access = (*access_vec)[j]; >>> >>>>> + expr_access_vec->put (access->expr, access); >>> >>>>> + } >>> >>>>> + } >>> >>>>> +} >>> >>>>> + >>> >>>>> +void >>> >>>>> +set_scalar_rtx_for_returns () >>> >>>>> +{ >>> >>>>> + tree res = DECL_RESULT (current_function_decl); >>> >>>>> + gcc_assert (res); >>> >>>>> + edge_iterator ei; >>> >>>>> + edge e; >>> >>>>> + FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds) >>> >>>>> + if (greturn *r = safe_dyn_cast<greturn *> (*gsi_last_bb >>> >>>>> (e->src))) >>> >>>>> + { >>> >>>>> + tree val = gimple_return_retval (r); >>> >>>>> + if (val && VAR_P (val)) >>> >>>>> + set_scalar_rtx_for_aggregate_access (val, DECL_RTL (res)); >>> >>>>> + } >>> >>>>> +} >>> >>>>> + >>> >>>>> /* Return an expression tree corresponding to the RHS of GIMPLE >>> >>>>> statement STMT. */ >>> >>>>> >>> >>>>> @@ -3778,7 +4336,8 @@ expand_return (tree retval) >>> >>>>> >>> >>>>> /* If we are returning the RESULT_DECL, then the value has already >>> >>>>> been stored into it, so we don't have to do anything special. >>> >>>>> */ >>> >>>>> - if (TREE_CODE (retval_rhs) == RESULT_DECL) >>> >>>>> + if (TREE_CODE (retval_rhs) == RESULT_DECL >>> >>>>> + || get_scalar_rtx_for_aggregate_expr (retval_rhs)) >>> >>>>> expand_value_return (result_rtl); >>> >>>>> >>> >>>>> /* If the result is an aggregate that is being returned in one (or >>> >>>>> more) >>> >>>>> @@ -4422,6 +4981,9 @@ expand_debug_expr (tree exp) >>> >>>>> int unsignedp = TYPE_UNSIGNED (TREE_TYPE (exp)); >>> >>>>> addr_space_t as; >>> >>>>> scalar_int_mode op0_mode, op1_mode, addr_mode; >>> >>>>> + rtx x = get_scalar_rtx_for_aggregate_expr (exp); >>> >>>>> + if (x) >>> >>>>> + return NULL_RTX;/* optimized out. 
*/ >>> >>>>> >>> >>>>> switch (TREE_CODE_CLASS (TREE_CODE (exp))) >>> >>>>> { >>> >>>>> @@ -6630,6 +7192,8 @@ pass_expand::execute (function *fun) >>> >>>>> avoid_deep_ter_for_debug (gsi_stmt (gsi), 0); >>> >>>>> } >>> >>>>> >>> >>>>> + prepare_expander_sra (); >>> >>>>> + >>> >>>>> /* Mark arrays indexed with non-constant indices with >>> >>>>> TREE_ADDRESSABLE. */ >>> >>>>> auto_bitmap forced_stack_vars; >>> >>>>> discover_nonconstant_array_refs (forced_stack_vars); >>> >>>>> @@ -7062,6 +7626,7 @@ pass_expand::execute (function *fun) >>> >>>>> loop_optimizer_finalize (); >>> >>>>> } >>> >>>>> >>> >>>>> + free_expander_sra (); >>> >>>>> timevar_pop (TV_POST_EXPAND); >>> >>>>> >>> >>>>> return 0; >>> >>>>> diff --git a/gcc/expr.cc b/gcc/expr.cc >>> >>>>> index 56b51876f80..b970f98e689 100644 >>> >>>>> --- a/gcc/expr.cc >>> >>>>> +++ b/gcc/expr.cc >>> >>>>> @@ -100,6 +100,7 @@ static void do_tablejump (rtx, machine_mode, rtx, >>> >>>>> rtx, rtx, >>> >>>>> static rtx const_vector_from_tree (tree); >>> >>>>> static tree tree_expr_size (const_tree); >>> >>>>> static void convert_mode_scalar (rtx, rtx, int); >>> >>>>> +rtx get_scalar_rtx_for_aggregate_expr (tree); >>> >>>>> >>> >>>>> >>> >>>>> /* This is run to set up which modes can be used >>> >>>>> @@ -5623,11 +5624,12 @@ expand_assignment (tree to, tree from, bool >>> >>>>> nontemporal) >>> >>>>> Assignment of an array element at a constant index, and >>> >>>>> assignment of >>> >>>>> an array element in an unaligned packed structure field, has >>> >>>>> the same >>> >>>>> problem. Same for (partially) storing into a non-memory >>> >>>>> object. 
*/ >>> >>>>> - if (handled_component_p (to) >>> >>>>> - || (TREE_CODE (to) == MEM_REF >>> >>>>> - && (REF_REVERSE_STORAGE_ORDER (to) >>> >>>>> - || mem_ref_refers_to_non_mem_p (to))) >>> >>>>> - || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE) >>> >>>>> + if (!get_scalar_rtx_for_aggregate_expr (to) >>> >>>>> + && (handled_component_p (to) >>> >>>>> + || (TREE_CODE (to) == MEM_REF >>> >>>>> + && (REF_REVERSE_STORAGE_ORDER (to) >>> >>>>> + || mem_ref_refers_to_non_mem_p (to))) >>> >>>>> + || TREE_CODE (TREE_TYPE (to)) == ARRAY_TYPE)) >>> >>>>> { >>> >>>>> machine_mode mode1; >>> >>>>> poly_int64 bitsize, bitpos; >>> >>>>> @@ -8995,6 +8997,9 @@ expand_expr_real (tree exp, rtx target, >>> >>>>> machine_mode tmode, >>> >>>>> ret = CONST0_RTX (tmode); >>> >>>>> return ret ? ret : const0_rtx; >>> >>>>> } >>> >>>>> + rtx x = get_scalar_rtx_for_aggregate_expr (exp); >>> >>>>> + if (x) >>> >>>>> + return x; >>> >>>>> >>> >>>>> ret = expand_expr_real_1 (exp, target, tmode, modifier, alt_rtl, >>> >>>>> inner_reference_p); >>> >>>>> diff --git a/gcc/function.cc b/gcc/function.cc >>> >>>>> index 82102ed78d7..262d3f17e72 100644 >>> >>>>> --- a/gcc/function.cc >>> >>>>> +++ b/gcc/function.cc >>> >>>>> @@ -2742,6 +2742,9 @@ assign_parm_find_stack_rtl (tree parm, struct >>> >>>>> assign_parm_data_one *data) >>> >>>>> data->stack_parm = stack_parm; >>> >>>>> } >>> >>>>> >>> >>>>> +extern void >>> >>>>> +set_scalar_rtx_for_aggregate_access (tree, rtx); >>> >>>>> + >>> >>>>> /* A subroutine of assign_parms. Adjust DATA->ENTRY_RTL such that >>> >>>>> it's >>> >>>>> always valid and contiguous. 
*/ >>> >>>>> >>> >>>>> @@ -3117,8 +3120,21 @@ assign_parm_setup_block (struct >>> >>>>> assign_parm_data_all *all, >>> >>>>> emit_move_insn (mem, entry_parm); >>> >>>>> } >>> >>>>> else >>> >>>>> - move_block_from_reg (REGNO (entry_parm), mem, >>> >>>>> - size_stored / UNITS_PER_WORD); >>> >>>>> + { >>> >>>>> + int regno = REGNO (entry_parm); >>> >>>>> + int nregs = size_stored / UNITS_PER_WORD; >>> >>>>> + move_block_from_reg (regno, mem, nregs); >>> >>>>> + >>> >>>>> + rtx *tmps = XALLOCAVEC (rtx, nregs); >>> >>>>> + machine_mode mode = word_mode; >>> >>>>> + for (int i = 0; i < nregs; i++) >>> >>>>> + tmps[i] = gen_rtx_EXPR_LIST ( >>> >>>>> + VOIDmode, gen_rtx_REG (mode, regno + i), >>> >>>>> + GEN_INT (GET_MODE_SIZE (mode).to_constant () * i)); >>> >>>>> + >>> >>>>> + rtx regs = gen_rtx_PARALLEL (BLKmode, gen_rtvec_v (nregs, >>> >>>>> tmps)); >>> >>>>> + set_scalar_rtx_for_aggregate_access (parm, regs); >>> >>>>> + } >>> >>>>> } >>> >>>>> else if (data->stack_parm == 0 && !TYPE_EMPTY_P (data->arg.type)) >>> >>>>> { >>> >>>>> @@ -3718,6 +3734,10 @@ assign_parms (tree fndecl) >>> >>>>> else >>> >>>>> set_decl_incoming_rtl (parm, data.entry_parm, false); >>> >>>>> >>> >>>>> + rtx incoming = DECL_INCOMING_RTL (parm); >>> >>>>> + if (GET_CODE (incoming) == PARALLEL) >>> >>>>> + set_scalar_rtx_for_aggregate_access (parm, incoming); >>> >>>>> + >>> >>>>> assign_parm_adjust_stack_rtl (&data); >>> >>>>> >>> >>>>> if (assign_parm_setup_block_p (&data)) >>> >>>>> @@ -5037,6 +5057,7 @@ stack_protect_epilogue (void) >>> >>>>> the function's parameters, which must be run at any return >>> >>>>> statement. 
*/ >>> >>>>> >>> >>>>> bool currently_expanding_function_start; >>> >>>>> +extern void set_scalar_rtx_for_returns (); >>> >>>>> void >>> >>>>> expand_function_start (tree subr) >>> >>>>> { >>> >>>>> @@ -5138,6 +5159,7 @@ expand_function_start (tree subr) >>> >>>>> { >>> >>>>> gcc_assert (GET_CODE (hard_reg) == PARALLEL); >>> >>>>> set_parm_rtl (res, gen_group_rtx (hard_reg)); >>> >>>>> + set_scalar_rtx_for_returns (); >>> >>>>> } >>> >>>>> } >>> >>>>> >>> >>>>> diff --git a/gcc/opts.cc b/gcc/opts.cc >>> >>>>> index 86b94d62b58..5e129a1cc49 100644 >>> >>>>> --- a/gcc/opts.cc >>> >>>>> +++ b/gcc/opts.cc >>> >>>>> @@ -1559,6 +1559,10 @@ public: >>> >>>>> vec<const char *> m_values; >>> >>>>> }; >>> >>>>> >>> >>>>> +#ifdef __GNUC__ >>> >>>>> +#pragma GCC diagnostic push >>> >>>>> +#pragma GCC diagnostic ignored "-Wformat-truncation" >>> >>>>> +#endif >>> >>>>> /* Print help for a specific front-end, etc. */ >>> >>>>> static void >>> >>>>> print_filtered_help (unsigned int include_flags, >>> >>>>> @@ -1913,7 +1917,9 @@ print_filtered_help (unsigned int include_flags, >>> >>>>> printf ("\n\n"); >>> >>>>> } >>> >>>>> } >>> >>>>> - >>> >>>>> +#ifdef __GNUC__ >>> >>>>> +#pragma GCC diagnostic pop >>> >>>>> +#endif >>> >>>>> /* Display help for a specified type of option. >>> >>>>> The options must have ALL of the INCLUDE_FLAGS set >>> >>>>> ANY of the flags in the ANY_FLAGS set >>> >>>>> diff --git a/gcc/testsuite/g++.target/powerpc/pr102024.C >>> >>>>> b/gcc/testsuite/g++.target/powerpc/pr102024.C >>> >>>>> index 769585052b5..c8995cae707 100644 >>> >>>>> --- a/gcc/testsuite/g++.target/powerpc/pr102024.C >>> >>>>> +++ b/gcc/testsuite/g++.target/powerpc/pr102024.C >>> >>>>> @@ -5,7 +5,7 @@ >>> >>>>> // Test that a zero-width bit field in an otherwise homogeneous >>> >>>>> aggregate >>> >>>>> // generates a psabi warning and passes arguments in GPRs. 
>>> >>>>> >>> >>>>> -// { dg-final { scan-assembler-times {\mstd\M} 4 } } >>> >>>>> +// { dg-final { scan-assembler-times {\mmtvsrd\M} 4 } } >>> >>>>> >>> >>>>> struct a_thing >>> >>>>> { >>> >>>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr108073.c >>> >>>>> b/gcc/testsuite/gcc.target/powerpc/pr108073.c >>> >>>>> new file mode 100644 >>> >>>>> index 00000000000..7dd1a4a326a >>> >>>>> --- /dev/null >>> >>>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr108073.c >>> >>>>> @@ -0,0 +1,29 @@ >>> >>>>> +/* { dg-do run } */ >>> >>>>> +/* { dg-options "-O2 -save-temps" } */ >>> >>>>> + >>> >>>>> +typedef struct DF {double a[4]; short s1; short s2; short s3; short >>> >>>>> s4; } DF; >>> >>>>> +typedef struct SF {float a[4]; int i1; int i2; } SF; >>> >>>>> + >>> >>>>> +/* { dg-final { scan-assembler-times {\mmtvsrd\M} 3 {target { >>> >>>>> has_arch_ppc64 && has_arch_pwr8 } } } } */ >>> >>>>> +/* { dg-final { scan-assembler-not {\mlwz\M} {target { >>> >>>>> has_arch_ppc64 && has_arch_pwr8 } } } } */ >>> >>>>> +/* { dg-final { scan-assembler-not {\mlhz\M} {target { >>> >>>>> has_arch_ppc64 && has_arch_pwr8 } } } } */ >>> >>>>> +short __attribute__ ((noipa)) foo_hi (DF a, int flag){if (flag == >>> >>>>> 2)return a.s2+a.s3;return 0;} >>> >>>>> +int __attribute__ ((noipa)) foo_si (SF a, int flag){if (flag == >>> >>>>> 2)return a.i2+a.i1;return 0;} >>> >>>>> +double __attribute__ ((noipa)) foo_df (DF arg, int flag){if (flag == >>> >>>>> 2)return arg.a[3];else return 0.0;} >>> >>>>> +float __attribute__ ((noipa)) foo_sf (SF arg, int flag){if (flag == >>> >>>>> 2)return arg.a[2]; return 0;} >>> >>>>> +float __attribute__ ((noipa)) foo_sf1 (SF arg, int flag){if (flag >>> >>>>> == 2)return arg.a[1];return 0;} >>> >>>>> + >>> >>>>> +DF gdf = {{1.0,2.0,3.0,4.0}, 1, 2, 3, 4}; >>> >>>>> +SF gsf = {{1.0f,2.0f,3.0f,4.0f}, 1, 2}; >>> >>>>> + >>> >>>>> +int main() >>> >>>>> +{ >>> >>>>> + if (!(foo_hi (gdf, 2) == 5 && foo_si (gsf, 2) == 3 && foo_df (gdf, >>> >>>>> 2) == 4.0 >>> >>>>> + && 
foo_sf (gsf, 2) == 3.0 && foo_sf1 (gsf, 2) == 2.0)) >>> >>>>> + __builtin_abort (); >>> >>>>> + if (!(foo_hi (gdf, 1) == 0 && foo_si (gsf, 1) == 0 && foo_df (gdf, >>> >>>>> 1) == 0 >>> >>>>> + && foo_sf (gsf, 1) == 0 && foo_sf1 (gsf, 1) == 0)) >>> >>>>> + __builtin_abort (); >>> >>>>> + return 0; >>> >>>>> +} >>> >>>>> + >>> >>>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-1.c >>> >>>>> b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c >>> >>>>> new file mode 100644 >>> >>>>> index 00000000000..4e1f87f7939 >>> >>>>> --- /dev/null >>> >>>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr65421-1.c >>> >>>>> @@ -0,0 +1,6 @@ >>> >>>>> +/* PR target/65421 */ >>> >>>>> +/* { dg-options "-O2" } */ >>> >>>>> + >>> >>>>> +typedef struct LARGE {double a[4]; int arr[32];} LARGE; >>> >>>>> +LARGE foo (LARGE a){return a;} >>> >>>>> +/* { dg-final { scan-assembler-times {\mmemcpy\M} 1 } } */ >>> >>>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr65421-2.c >>> >>>>> b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c >>> >>>>> new file mode 100644 >>> >>>>> index 00000000000..8a8e1a0e996 >>> >>>>> --- /dev/null >>> >>>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr65421-2.c >>> >>>>> @@ -0,0 +1,32 @@ >>> >>>>> +/* PR target/65421 */ >>> >>>>> +/* { dg-options "-O2" } */ >>> >>>>> +/* { dg-require-effective-target powerpc_elfv2 } */ >>> >>>>> +/* { dg-require-effective-target has_arch_ppc64 } */ >>> >>>>> + >>> >>>>> +typedef struct FLOATS >>> >>>>> +{ >>> >>>>> + double a[3]; >>> >>>>> +} FLOATS; >>> >>>>> + >>> >>>>> +/* 3 lfd after returns also optimized */ >>> >>>>> +/* FLOATS ret_arg_pt (FLOATS *a){return *a;} */ >>> >>>>> + >>> >>>>> +/* 3 stfd */ >>> >>>>> +void st_arg (FLOATS a, FLOATS *p) {*p = a;} >>> >>>>> +/* { dg-final { scan-assembler-times {\mstfd\M} 3 } } */ >>> >>>>> + >>> >>>>> +/* blr */ >>> >>>>> +FLOATS ret_arg (FLOATS a) {return a;} >>> >>>>> + >>> >>>>> +typedef struct MIX >>> >>>>> +{ >>> >>>>> + double a[2]; >>> >>>>> + long l; >>> >>>>> +} MIX; >>> >>>>> + 
>>> >>>>> +/* std 3 param regs to return slot */ >>> >>>>> +MIX ret_arg1 (MIX a) {return a;} >>> >>>>> +/* { dg-final { scan-assembler-times {\mstd\M} 3 } } */ >>> >>>>> + >>> >>>>> +/* count insns */ >>> >>>>> +/* { dg-final { scan-assembler-times {(?n)^\s+[a-z]} 9 } } */ >>> >>>>> >>> >>>> >>> >>>> -- >>> >>>> Richard Biener <rguent...@suse.de> >>> >>>> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 >>> >>>> Nuernberg, >>> >>>> Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman; >>> >>>> HRB 36809 (AG Nuernberg) >>>