On Tue, Dec 17, 2024 at 11:53:24AM +0000, Andrew Carlotti wrote:
> This pass is used to optimise assignments to the FPMR register in
> aarch64. I chose to implement this as a middle-end pass because it
> mostly reuses the existing RTL PRE code within gcse.cc.
>
> Compared to RTL PRE, the key difference in this new pass is that we
> insert new writes directly to the destination hardreg, instead of
> writing to a new pseudo-register and copying the result later. This
> requires changes to the analysis portion of the pass, because sets
> cannot be moved before existing instructions that set, use or clobber
> the hardreg, and the value becomes unavailable after any uses or
> clobbers of the hardreg.
>
> Any uses of the hardreg in debug insns will be deleted. We could do
> better than this, but for the aarch64 fpmr I don't think we emit useful
> debuginfo for deleted fp8 instructions anyway (and I don't even know if
> it's possible to have a debug fpmr use when entering hardreg PRE).
>
>
> Compared to the first version, I've now fixed the broken debug uses, and
> simplified a lot of the analysis (it turns out DF analysis already provides
> cleaner versions of the checks I need). I also fixed a couple of other minor
> bugs (including one that broke the build on every target except aarch64).
>
> The new tests pass; I haven't rerun a bootstrap or full regression test yet,
> but this should be NFC except for aarch64 code that uses the fpmr register.
>
> Is this ok for master?
I believe all the outstanding questions and gaps on the v1 patch thread have
been addressed, so is this ok for master?
> gcc/ChangeLog:
>
> * config/aarch64/aarch64.h (HARDREG_PRE_REGNOS): New macro.
> * gcse.cc (doing_hardreg_pre_p): New global variable.
> (do_load_motion): New boolean check.
> (current_hardreg_regno): New global variable.
> (compute_local_properties): Unset transp for hardreg clobbers.
> (prune_hardreg_uses): New function.
> (want_to_gcse_p): Use different checks for hardreg PRE.
> (oprs_unchanged_p): Disable load motion for hardreg PRE pass.
> (hash_scan_set): For hardreg PRE, skip non-hardreg sets and
> check for hardreg clobbers.
> (record_last_mem_set_info): Skip for hardreg PRE.
> (compute_pre_data): Prune hardreg uses from transp bitmap.
> (pre_expr_reaches_here_p_work): Add sentence to comment.
> (insert_insn_start_basic_block): New functions.
> (pre_edge_insert): Don't add hardreg sets to predecessor block.
> (pre_delete): Use hardreg for the reaching reg.
> (reset_hardreg_debug_uses): New function.
> (pre_gcse): For hardreg PRE, reset debug uses and don't insert
> copies.
> (one_pre_gcse_pass): Disable load motion for hardreg PRE.
> (execute_hardreg_pre): New.
> (class pass_hardreg_pre): New.
> (pass_hardreg_pre::gate): New.
> (make_pass_hardreg_pre): New.
> * passes.def (pass_hardreg_pre): New pass.
> * tree-pass.h (make_pass_hardreg_pre): New.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/acle/fpmr-1.c: New test.
> * gcc.target/aarch64/acle/fpmr-2.c: New test.
> * gcc.target/aarch64/acle/fpmr-3.c: New test.
> * gcc.target/aarch64/acle/fpmr-4.c: New test.
>
>
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index
> f1251f67c74e8da8420bad2d07a11a98a7de37ff..61837a4a98744225b9d15cfbc37cc914ac48421b
> 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -1652,6 +1652,10 @@ enum class aarch64_tristate_mode : int { NO, YES,
> MAYBE };
> { int (aarch64_tristate_mode::MAYBE), \
> int (aarch64_local_sme_state::ANY) }
>
> +/* Zero terminated list of regnos for which hardreg PRE should be
> + applied. */
> +#define HARDREG_PRE_REGNOS { FPM_REGNUM, 0 }
> +
> #endif
>
> #endif /* GCC_AARCH64_H */
> diff --git a/gcc/gcse.cc b/gcc/gcse.cc
> index
> 31b92f30fa1ba6c519429d4b7bc55547b2d71c01..f33de3747b896950568154acbfac1817519fe748
> 100644
> --- a/gcc/gcse.cc
> +++ b/gcc/gcse.cc
> @@ -415,6 +415,17 @@ static int gcse_create_count;
>
> /* Doing code hoisting. */
> static bool doing_code_hoisting_p = false;
> +
> +/* Doing hardreg_pre. */
> +static bool doing_hardreg_pre_p = false;
> +
> +inline bool
> +do_load_motion ()
> +{
> + return flag_gcse_lm && !doing_hardreg_pre_p;
> +}
> +
> +static unsigned int current_hardreg_regno;
>
> /* For available exprs */
> static sbitmap *ae_kill;
> @@ -689,14 +700,32 @@ compute_local_properties (sbitmap *transp, sbitmap
> *comp, sbitmap *antloc,
> int indx = expr->bitmap_index;
> struct gcse_occr *occr;
>
> - /* The expression is transparent in this block if it is not killed.
> - We start by assuming all are transparent [none are killed], and
> - then reset the bits for those that are. */
> + /* In most cases, the expression is transparent in the block if it is
> + not killed. The exception to this is during hardreg PRE, in which
> + uses of the hardreg prevent transparency but do not kill the
> + expression.
> +
> + We start by assuming all expressions are transparent [none are
> + killed], and then reset the bits for those that are. */
> if (transp)
> - compute_transp (expr->expr, indx, transp,
> - blocks_with_calls,
> - modify_mem_list_set,
> - canon_modify_mem_list);
> + {
> + compute_transp (expr->expr, indx, transp,
> + blocks_with_calls,
> + modify_mem_list_set,
> + canon_modify_mem_list);
> +
> + if (doing_hardreg_pre_p)
> + {
> + /* We also need to check whether the destination hardreg is
> + set or call-clobbered in each BB. We'll check for hardreg
> + uses later. */
> + df_ref def;
> + for (def = DF_REG_DEF_CHAIN (current_hardreg_regno);
> + def;
> + def = DF_REF_NEXT_REG (def))
> + bitmap_clear_bit (transp[DF_REF_BB (def)->index], indx);
> + }
> + }
>
> /* The occurrences recorded in antic_occr are exactly those that
> we want to set to nonzero in ANTLOC. */
> @@ -728,6 +757,37 @@ compute_local_properties (sbitmap *transp, sbitmap
> *comp, sbitmap *antloc,
> }
> }
> }
> +
> +/* A hardreg set is not transparent in a block if there are any uses of that
> + hardreg. This filters the results of compute_local_properties, after the
> + result of that function has been used to define the kills bitmap.
> +
> + TRANSP is the destination sbitmap to be updated.
> +
> + TABLE controls which hash table to look at. */
> +
> +static void
> +prune_hardreg_uses (sbitmap *transp, struct gcse_hash_table_d *table)
> +{
> + unsigned int i;
> + gcc_assert (doing_hardreg_pre_p);
> +
> + for (i = 0; i < table->size; i++)
> + {
> + struct gcse_expr *expr;
> +
> + for (expr = table->table[i]; expr != NULL; expr = expr->next_same_hash)
> + {
> + int indx = expr->bitmap_index;
> + df_ref def;
> +
> + for (def = DF_REG_USE_CHAIN (current_hardreg_regno);
> + def;
> + def = DF_REF_NEXT_REG (def))
> + bitmap_clear_bit (transp[DF_REF_BB (def)->index], indx);
> + }
> + }
> +}
>
> /* Hash table support. */
>
> @@ -771,17 +831,24 @@ want_to_gcse_p (rtx x, machine_mode mode, HOST_WIDE_INT
> *max_distance_ptr)
> pressure, i.e., a pseudo register with REG_EQUAL to constant
> is set only once. Failing to do so will result in IRA/reload
> spilling such constants under high register pressure instead of
> - rematerializing them. */
> + rematerializing them.
> +
> + For hardreg PRE, register pressure is not a concern, and we also want to
> + apply GCSE to simple moves. */
>
> switch (GET_CODE (x))
> {
> case REG:
> case SUBREG:
> + return doing_hardreg_pre_p;
> +
> case CALL:
> return false;
>
> CASE_CONST_ANY:
> - if (!doing_code_hoisting_p)
> + if (doing_hardreg_pre_p)
> + return true;
> + else if (!doing_code_hoisting_p)
> /* Do not PRE constants. */
> return false;
>
> @@ -911,7 +978,7 @@ oprs_unchanged_p (const_rtx x, const rtx_insn *insn, bool
> avail_p)
> }
>
> case MEM:
> - if (! flag_gcse_lm
> + if (! do_load_motion ()
> || load_killed_in_block_p (current_bb, DF_INSN_LUID (insn),
> x, avail_p))
> return false;
> @@ -1258,8 +1325,10 @@ hash_scan_set (rtx set, rtx_insn *insn, struct
> gcse_hash_table_d *table)
> && want_to_gcse_p (XEXP (note, 0), GET_MODE (dest), NULL))
> src = XEXP (note, 0), set = gen_rtx_SET (dest, src);
>
> - /* Only record sets of pseudo-regs in the hash table. */
> - if (regno >= FIRST_PSEUDO_REGISTER
> + /* Only record sets of pseudo-regs in the hash table, unless we're
> + currently doing hardreg switching. */
> + if ((doing_hardreg_pre_p ? regno == current_hardreg_regno
> + : regno >= FIRST_PSEUDO_REGISTER)
> /* Don't GCSE something if we can't do a reg/reg copy. */
> && can_copy_p (GET_MODE (dest))
> /* GCSE commonly inserts instruction after the insn. We can't
> @@ -1286,12 +1355,33 @@ hash_scan_set (rtx set, rtx_insn *insn, struct
> gcse_hash_table_d *table)
> able to handle code motion of insns with multiple sets. */
> bool antic_p = (oprs_anticipatable_p (src, insn)
> && !multiple_sets (insn));
> + if (doing_hardreg_pre_p)
> + {
> + /* A hardreg assignment is anticipatable only if the hardreg is
> > + neither set nor used prior to this assignment. */
> + auto info = reg_avail_info[current_hardreg_regno];
> + if ((info.last_bb == current_bb
> + && info.first_set < DF_INSN_LUID (insn))
> + || bitmap_bit_p (DF_LR_IN (current_bb),
> + current_hardreg_regno))
> + antic_p = false;
> + }
> +
> /* An expression is not available if its operands are
> subsequently modified, including this insn. It's also not
> available if this is a branch, because we can't insert
> a set after the branch. */
> bool avail_p = (oprs_available_p (src, insn)
> && ! JUMP_P (insn));
> + if (doing_hardreg_pre_p)
> + {
> + /* A hardreg assignment is only available if the hardreg is
> > + not set later in the BB. Uses of the hardreg are allowed. */
> + auto info = reg_avail_info[current_hardreg_regno];
> + if (info.last_bb == current_bb
> + && info.last_set > DF_INSN_LUID (insn))
> + avail_p = false;
> + }
>
> insert_expr_in_table (src, GET_MODE (dest), insn, antic_p, avail_p,
> max_distance, table);
> @@ -1300,7 +1390,10 @@ hash_scan_set (rtx set, rtx_insn *insn, struct
> gcse_hash_table_d *table)
> /* In case of store we want to consider the memory value as available in
> the REG stored in that memory. This makes it possible to remove
> redundant loads from due to stores to the same location. */
> - else if (flag_gcse_las && REG_P (src) && MEM_P (dest))
> + else if (flag_gcse_las
> + && !doing_hardreg_pre_p
> + && REG_P (src)
> + && MEM_P (dest))
> {
> unsigned int regno = REGNO (src);
> HOST_WIDE_INT max_distance = 0;
> @@ -1460,7 +1553,7 @@ record_last_reg_set_info (rtx_insn *insn, int regno)
> static void
> record_last_mem_set_info (rtx_insn *insn)
> {
> - if (! flag_gcse_lm)
> + if (! do_load_motion ())
> return;
>
> record_last_mem_set_info_common (insn, modify_mem_list,
> @@ -1884,6 +1977,9 @@ compute_pre_data (void)
> bitmap_not (ae_kill[bb->index], ae_kill[bb->index]);
> }
>
> + if (doing_hardreg_pre_p)
> + prune_hardreg_uses (transp, &expr_hash_table);
> +
> edge_list = pre_edge_lcm (expr_hash_table.n_elems, transp, comp, antloc,
> ae_kill, &pre_insert_map, &pre_delete_map);
> sbitmap_vector_free (antloc);
> @@ -1938,7 +2034,10 @@ pre_expr_reaches_here_p_work (basic_block occr_bb,
> struct gcse_expr *expr,
>
> visited[pred_bb->index] = 1;
> }
> - /* Ignore this predecessor if it kills the expression. */
> + /* Ignore this predecessor if it kills the expression.
> +
> + If this were used for hardreg pre, then it would need to use the kills
> + bitmap. */
> else if (! bitmap_bit_p (transp[pred_bb->index], expr->bitmap_index))
> visited[pred_bb->index] = 1;
>
> @@ -2109,6 +2208,59 @@ insert_insn_end_basic_block (struct gcse_expr *expr,
> basic_block bb)
> }
> }
>
> +/* Return the INSN which is added at the start of the block BB with
> + same instruction pattern with PAT. */
> +
> +rtx_insn *
> +insert_insn_start_basic_block (rtx_insn *pat, basic_block bb)
> +{
> + rtx_insn *insn = BB_HEAD (bb);
> + rtx_insn *next_insn;
> +
> + gcc_assert (pat && INSN_P (pat));
> +
> + /* Insert after the last initial CODE_LABEL or NOTE_INSN_BASIC_BLOCK,
> before
> + any other instructions. */
> + while ((next_insn = NEXT_INSN (insn))
> > + && (LABEL_P (next_insn) || NOTE_INSN_BASIC_BLOCK_P (next_insn)))
> + insn = next_insn;
> +
> + rtx_insn *new_insn = emit_insn_after_noloc (pat, insn, bb);
> +
> + while (pat != NULL_RTX)
> + {
> + if (INSN_P (pat))
> + add_label_notes (PATTERN (pat), new_insn);
> + pat = NEXT_INSN (pat);
> + }
> +
> + return new_insn;
> +}
> +
> +/* Add EXPR to the start of basic block BB.
> +
> + This is used by hardreg PRE. */
> +
> +static void
> +insert_insn_start_basic_block (struct gcse_expr *expr, basic_block bb)
> +{
> + rtx reg = expr->reaching_reg;
> + int regno = REGNO (reg);
> +
> + rtx_insn *insn = process_insert_insn (expr);
> + rtx_insn *new_insn = insert_insn_start_basic_block (insn, bb);
> +
> + gcse_create_count++;
> +
> + if (dump_file)
> + {
> + fprintf (dump_file, "hardreg PRE: start of bb %d, insn %d, ",
> + bb->index, INSN_UID (new_insn));
> + fprintf (dump_file, "copying expression %d to reg %d\n",
> + expr->bitmap_index, regno);
> + }
> +}
> +
> /* Insert partially redundant expressions on edges in the CFG to make
> the expressions fully redundant. */
>
> @@ -2130,7 +2282,8 @@ pre_edge_insert (struct edge_list *edge_list, struct
> gcse_expr **index_map)
> for (e = 0; e < num_edges; e++)
> {
> int indx;
> - basic_block bb = INDEX_EDGE_PRED_BB (edge_list, e);
> + basic_block pred_bb = INDEX_EDGE_PRED_BB (edge_list, e);
> + basic_block succ_bb = INDEX_EDGE_SUCC_BB (edge_list, e);
>
> for (i = indx = 0; i < set_size; i++, indx += SBITMAP_ELT_BITS)
> {
> @@ -2159,13 +2312,24 @@ pre_edge_insert (struct edge_list *edge_list, struct
> gcse_expr **index_map)
>
> /* We can't insert anything on an abnormal and
> critical edge, so we insert the insn at the end of
> - the previous block. There are several alternatives
> + the previous block. There are several alternatives
> detailed in Morgans book P277 (sec 10.5) for
> handling this situation. This one is easiest for
> - now. */
> + now.
>
> + For hardreg PRE this would add an unwanted clobber
> + of the hardreg, so we instead insert in the
> + successor block. This may be partially redundant,
> + but it is at least correct. */
> if (eg->flags & EDGE_ABNORMAL)
> - insert_insn_end_basic_block (index_map[j], bb);
> + {
> + if (doing_hardreg_pre_p)
> + insert_insn_start_basic_block (index_map[j],
> + succ_bb);
> + else
> + insert_insn_end_basic_block (index_map[j],
> + pred_bb);
> + }
> else
> {
> insn = process_insert_insn (index_map[j]);
> @@ -2175,8 +2339,8 @@ pre_edge_insert (struct edge_list *edge_list, struct
> gcse_expr **index_map)
> if (dump_file)
> {
> fprintf (dump_file, "PRE: edge (%d,%d), ",
> - bb->index,
> - INDEX_EDGE_SUCC_BB (edge_list, e)->index);
> + pred_bb->index,
> + succ_bb->index);
> fprintf (dump_file, "copy expression %d\n",
> expr->bitmap_index);
> }
> @@ -2491,13 +2655,25 @@ pre_delete (void)
> && (set = single_set (insn)) != 0
> && dbg_cnt (pre_insn))
> {
> - /* Create a pseudo-reg to store the result of reaching
> - expressions into. Get the mode for the new pseudo from
> - the mode of the original destination pseudo. */
> + rtx dest = SET_DEST (set);
> if (expr->reaching_reg == NULL)
> - expr->reaching_reg = gen_reg_rtx_and_attrs (SET_DEST (set));
> + {
> + if (doing_hardreg_pre_p)
> + /* Use the hardreg as the reaching register. The
> + deleted sets will be replaced with noop moves.
> +
> + This may change the value of the hardreg in some debug
> + instructions, so we will need to reset any debug uses
> + of the hardreg. */
> + expr->reaching_reg = dest;
> + else
> + /* Create a pseudo-reg to store the result of reaching
> + expressions into. Get the mode for the new pseudo from
> + the mode of the original destination pseudo. */
> + expr->reaching_reg = gen_reg_rtx_and_attrs (SET_DEST
> (set));
> + }
>
> - gcse_emit_move_after (SET_DEST (set), expr->reaching_reg, insn);
> + gcse_emit_move_after (dest, expr->reaching_reg, insn);
> delete_insn (insn);
> occr->deleted_p = 1;
> changed = true;
> @@ -2518,6 +2694,25 @@ pre_delete (void)
> return changed;
> }
>
> +/* Since hardreg PRE reuses the hardreg as the reaching register, we need to
> + eliminate any existing uses in debug insns. This is overly conservative,
> + but there's currently no benefit to preserving the debug insns, so there's
> + no point doing the work to retain them. */
> +
> +static void
> +reset_hardreg_debug_uses ()
> +{
> + df_ref def;
> + for (def = DF_REG_USE_CHAIN (current_hardreg_regno);
> + def;
> + def = DF_REF_NEXT_REG (def))
> + {
> + rtx_insn *insn = DF_REF_INSN (def);
> + if (DEBUG_INSN_P (insn))
> + delete_insn (insn);
> + }
> +}
> +
> /* Perform GCSE optimizations using PRE.
> This is called by one_pre_gcse_pass after all the dataflow analysis
> has been done.
> @@ -2561,12 +2756,16 @@ pre_gcse (struct edge_list *edge_list)
>
> changed = pre_delete ();
> did_insert = pre_edge_insert (edge_list, index_map);
> -
> /* In other places with reaching expressions, copy the expression to the
> - specially allocated pseudo-reg that reaches the redundant expr. */
> - pre_insert_copies ();
> + specially allocated pseudo-reg that reaches the redundant expr. This
> + isn't needed for hardreg PRE. */
> + if (!doing_hardreg_pre_p)
> + pre_insert_copies ();
> +
> if (did_insert)
> {
> + if (doing_hardreg_pre_p)
> + reset_hardreg_debug_uses ();
> commit_edge_insertions ();
> changed = true;
> }
> @@ -2601,11 +2800,11 @@ one_pre_gcse_pass (void)
>
> alloc_hash_table (&expr_hash_table);
> add_noreturn_fake_exit_edges ();
> - if (flag_gcse_lm)
> + if (do_load_motion ())
> compute_ld_motion_mems ();
>
> compute_hash_table (&expr_hash_table);
> - if (flag_gcse_lm)
> + if (do_load_motion ())
> trim_ld_motion_mems ();
> if (dump_file)
> dump_hash_table (dump_file, "Expression", &expr_hash_table);
> @@ -2621,7 +2820,7 @@ one_pre_gcse_pass (void)
> free_pre_mem ();
> }
>
> - if (flag_gcse_lm)
> + if (do_load_motion ())
> free_ld_motion_mems ();
> remove_fake_exit_edges ();
> free_hash_table (&expr_hash_table);
> @@ -4028,6 +4227,32 @@ execute_rtl_pre (void)
> return 0;
> }
>
> +static unsigned int
> +execute_hardreg_pre (void)
> +{
> +#ifdef HARDREG_PRE_REGNOS
> + doing_hardreg_pre_p = true;
> + unsigned int regnos[] = HARDREG_PRE_REGNOS;
> + /* It's possible to avoid this loop, but it isn't worth doing so until
> + hardreg PRE is used for multiple hardregs. */
> + for (int i = 0; regnos[i] != 0; i++)
> + {
> + int changed;
> + current_hardreg_regno = regnos[i];
> + if (dump_file)
> + fprintf(dump_file, "Entering hardreg PRE for regno %d\n",
> + current_hardreg_regno);
> + delete_unreachable_blocks ();
> + df_analyze ();
> + changed = one_pre_gcse_pass ();
> + if (changed)
> + cleanup_cfg (0);
> + }
> + doing_hardreg_pre_p = false;
> +#endif
> + return 0;
> +}
> +
> static unsigned int
> execute_rtl_hoist (void)
> {
> @@ -4096,6 +4321,56 @@ make_pass_rtl_pre (gcc::context *ctxt)
>
> namespace {
>
> +const pass_data pass_data_hardreg_pre =
> +{
> + RTL_PASS, /* type */
> + "hardreg_pre", /* name */
> + OPTGROUP_NONE, /* optinfo_flags */
> + TV_PRE, /* tv_id */
> + PROP_cfglayout, /* properties_required */
> + 0, /* properties_provided */
> + 0, /* properties_destroyed */
> + 0, /* todo_flags_start */
> + TODO_df_finish, /* todo_flags_finish */
> +};
> +
> +class pass_hardreg_pre : public rtl_opt_pass
> +{
> +public:
> + pass_hardreg_pre (gcc::context *ctxt)
> + : rtl_opt_pass (pass_data_hardreg_pre, ctxt)
> + {}
> +
> + /* opt_pass methods: */
> + bool gate (function *) final override;
> + unsigned int execute (function *) final override
> + {
> + return execute_hardreg_pre ();
> + }
> +
> +}; // class pass_hardreg_pre
> +
> +bool
> +pass_hardreg_pre::gate (function *fun)
> +{
> +#ifdef HARDREG_PRE_REGNOS
> + return optimize > 0
> + && !fun->calls_setjmp;
> +#else
> + return false;
> +#endif
> +}
> +
> +} // anon namespace
> +
> +rtl_opt_pass *
> +make_pass_hardreg_pre (gcc::context *ctxt)
> +{
> + return new pass_hardreg_pre (ctxt);
> +}
> +
> +namespace {
> +
> const pass_data pass_data_rtl_hoist =
> {
> RTL_PASS, /* type */
> diff --git a/gcc/passes.def b/gcc/passes.def
> index
> ae85ae72dff734a8698f606254970437e2bf93a5..95d72b22761eec3668a4d5bbcaa8e41fcc4d830a
> 100644
> --- a/gcc/passes.def
> +++ b/gcc/passes.def
> @@ -463,6 +463,7 @@ along with GCC; see the file COPYING3. If not see
> NEXT_PASS (pass_rtl_cprop);
> NEXT_PASS (pass_rtl_pre);
> NEXT_PASS (pass_rtl_hoist);
> + NEXT_PASS (pass_hardreg_pre);
> NEXT_PASS (pass_rtl_cprop);
> NEXT_PASS (pass_rtl_store_motion);
> NEXT_PASS (pass_cse_after_global_opts);
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-1.c
> b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-1.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..f7a47f81c5ea4639827d4c902f316932120f44af
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-1.c
> @@ -0,0 +1,58 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -march=armv8-a+fp8fma" } */
> +
> +#include <arm_neon.h>
> +
> +void foo(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c, int br)
> +{
> + float16x8_t a;
> + a = vld1q_f16(ap);
> + a = vmlalbq_f16_mf8_fpm(a, b, c, 13);
> + vst1q_f16(ap, a);
> + if (br)
> + {
> + a = vld1q_f16(ap + 8);
> + a = vmlalbq_f16_mf8_fpm(a, b, c, 13);
> + vst1q_f16(ap + 8, a);
> + a = vld1q_f16(ap + 16);
> + a = vmlalbq_f16_mf8_fpm(a, b, c, 13);
> + vst1q_f16(ap + 16, a);
> + }
> + else
> + {
> + a = vld1q_f16(ap + 24);
> + a = vmlalbq_f16_mf8_fpm(a, b, c, 13);
> + vst1q_f16(ap + 24, a);
> + }
> + a = vld1q_f16(ap + 32);
> + a = vmlalbq_f16_mf8_fpm(a, b, c, 13);
> + vst1q_f16(ap + 32, a);
> +}
> +
> +void bar(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c, fpm_t mode,
> int br)
> +{
> + float16x8_t a;
> + a = vld1q_f16(ap);
> + a = vmlalbq_f16_mf8_fpm(a, b, c, mode);
> + vst1q_f16(ap, a);
> + if (br)
> + {
> + a = vld1q_f16(ap + 8);
> + a = vmlalbq_f16_mf8_fpm(a, b, c, mode);
> + vst1q_f16(ap + 8, a);
> + a = vld1q_f16(ap + 16);
> + a = vmlalbq_f16_mf8_fpm(a, b, c, mode);
> + vst1q_f16(ap + 16, a);
> + }
> + else
> + {
> + a = vld1q_f16(ap + 24);
> + a = vmlalbq_f16_mf8_fpm(a, b, c, mode);
> + vst1q_f16(ap + 24, a);
> + }
> + a = vld1q_f16(ap + 32);
> + a = vmlalbq_f16_mf8_fpm(a, b, c, mode);
> + vst1q_f16(ap + 32, a);
> +}
> +
> +/* { dg-final { scan-assembler-times "msr\tfpmr" 2 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c
> b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..c5b255b0a9a8ea9161217b22f19adaf58c899dbb
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-2.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -march=armv8-a+fp8fma" } */
> +
> +#include <arm_neon.h>
> +
> +void foo(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c)
> +{
> + for (int i = 0; i < 103; i++)
> + {
> + float16x8_t a = vld1q_f16(ap + 8*i);
> + a = vmlalbq_f16_mf8_fpm(a, b, c, 13);
> + vst1q_f16(ap + 8*i, a);
> + }
> +}
> +/* { dg-final { scan-assembler "msr\tfpmr.*\n\.L2" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-3.c
> b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-3.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..73a79ad4b44e2b950cf7ea3e914254b5fdc05b69
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-3.c
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -march=armv8-a+fp8fma" } */
> +
> +#include <arm_neon.h>
> +
> +void foo(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c, fpm_t mode)
> +{
> + float16x8_t x = vld1q_f16(ap + 1);
> + x = vmlalbq_f16_mf8_fpm(x, b, c, mode);
> + vst1q_f16(ap + 1, x);
> + for (int i = 0; i < 103; i++)
> + {
> + float16x8_t a = vld1q_f16(ap + 8*i);
> + a = vmlalbq_f16_mf8_fpm(a, b, c, mode);
> + vst1q_f16(ap + 8*i, a);
> + }
> +}
> +/* { dg-final { scan-assembler-times "msr\tfpmr" 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/acle/fpmr-4.c
> b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-4.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..18c1def752f557e98868250cd73442fb9f556e18
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/acle/fpmr-4.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O1 -march=armv8-a+fp8fma" } */
> +
> +#include <arm_neon.h>
> +
> +void baz(float16_t ap[20000], mfloat8x16_t b, mfloat8x16_t c)
> +{
> + float16x8_t x = vld1q_f16(ap + 1);
> + x = vmlalbq_f16_mf8_fpm(x, b, c, 13);
> + vst1q_f16(ap + 1, x);
> + for (int i = 0; i < 10; i++)
> + {
> + float16x8_t a = vld1q_f16(ap + 16*i);
> + a = vmlalbq_f16_mf8_fpm(a, b, c, 13);
> + vst1q_f16(ap + 16*i, a);
> + a = vld1q_f16(ap + 16*i + 8);
> + a = vmlalbq_f16_mf8_fpm(a, b, c, 865);
> + vst1q_f16(ap + 16*i+8, a);
> + }
> +}
> +
> +/* { dg-final { scan-assembler-times "msr\tfpmr" 3 } } */
> +/* { dg-final { scan-assembler "msr\tfpmr.*\n\tb\t" } } */
> diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
> index
> ce463629194a7298b70da6463706caea0b28dabd..797d719b2c45ffa2d71c7e94687bf1d5ac19c69f
> 100644
> --- a/gcc/tree-pass.h
> +++ b/gcc/tree-pass.h
> @@ -573,6 +573,7 @@ extern rtl_opt_pass *make_pass_rtl_dse3 (gcc::context
> *ctxt);
> extern rtl_opt_pass *make_pass_rtl_cprop (gcc::context *ctxt);
> extern rtl_opt_pass *make_pass_rtl_pre (gcc::context *ctxt);
> extern rtl_opt_pass *make_pass_rtl_hoist (gcc::context *ctxt);
> +extern rtl_opt_pass *make_pass_hardreg_pre (gcc::context *ctxt);
> extern rtl_opt_pass *make_pass_rtl_avoid_store_forwarding (gcc::context
> *ctxt);
> extern rtl_opt_pass *make_pass_rtl_store_motion (gcc::context *ctxt);
> extern rtl_opt_pass *make_pass_cse_after_global_opts (gcc::context *ctxt);