[SUBREG V4 3/4] IRA: Apply DF_LIVE_SUBREG data
--- gcc/ira-build.cc | 7 --- gcc/ira-color.cc | 8 gcc/ira-emit.cc | 12 ++-- gcc/ira-lives.cc | 7 --- gcc/ira.cc | 19 --- 5 files changed, 30 insertions(+), 23 deletions(-) diff --git a/gcc/ira-build.cc b/gcc/ira-build.cc index ea593d5a087..283ff36d3dd 100644 --- a/gcc/ira-build.cc +++ b/gcc/ira-build.cc @@ -1921,7 +1921,8 @@ create_bb_allocnos (ira_loop_tree_node_t bb_node) create_insn_allocnos (PATTERN (insn), NULL, false); /* It might be a allocno living through from one subloop to another. */ - EXECUTE_IF_SET_IN_REG_SET (df_get_live_in (bb), FIRST_PSEUDO_REGISTER, i, bi) + EXECUTE_IF_SET_IN_REG_SET (df_get_subreg_live_in (bb), FIRST_PSEUDO_REGISTER, +i, bi) if (ira_curr_regno_allocno_map[i] == NULL) ira_create_allocno (i, false, ira_curr_loop_tree_node); } @@ -1937,9 +1938,9 @@ create_loop_allocnos (edge e) bitmap_iterator bi; ira_loop_tree_node_t parent; - live_in_regs = df_get_live_in (e->dest); + live_in_regs = df_get_subreg_live_in (e->dest); border_allocnos = ira_curr_loop_tree_node->border_allocnos; - EXECUTE_IF_SET_IN_REG_SET (df_get_live_out (e->src), + EXECUTE_IF_SET_IN_REG_SET (df_get_subreg_live_out (e->src), FIRST_PSEUDO_REGISTER, i, bi) if (bitmap_bit_p (live_in_regs, i)) { diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc index b9ae32d1b4d..bfebc48ef83 100644 --- a/gcc/ira-color.cc +++ b/gcc/ira-color.cc @@ -2786,8 +2786,8 @@ ira_loop_edge_freq (ira_loop_tree_node_t loop_node, int regno, bool exit_p) FOR_EACH_EDGE (e, ei, loop_node->loop->header->preds) if (e->src != loop_node->loop->latch && (regno < 0 - || (bitmap_bit_p (df_get_live_out (e->src), regno) - && bitmap_bit_p (df_get_live_in (e->dest), regno + || (bitmap_bit_p (df_get_subreg_live_out (e->src), regno) + && bitmap_bit_p (df_get_subreg_live_in (e->dest), regno freq += EDGE_FREQUENCY (e); } else @@ -2795,8 +2795,8 @@ ira_loop_edge_freq (ira_loop_tree_node_t loop_node, int regno, bool exit_p) auto_vec edges = get_loop_exit_edges (loop_node->loop); FOR_EACH_VEC_ELT (edges, i, e) if (regno < 
0 - || (bitmap_bit_p (df_get_live_out (e->src), regno) - && bitmap_bit_p (df_get_live_in (e->dest), regno))) + || (bitmap_bit_p (df_get_subreg_live_out (e->src), regno) + && bitmap_bit_p (df_get_subreg_live_in (e->dest), regno))) freq += EDGE_FREQUENCY (e); } diff --git a/gcc/ira-emit.cc b/gcc/ira-emit.cc index d347f11fa02..8075b082e36 100644 --- a/gcc/ira-emit.cc +++ b/gcc/ira-emit.cc @@ -510,8 +510,8 @@ generate_edge_moves (edge e) return; src_map = src_loop_node->regno_allocno_map; dest_map = dest_loop_node->regno_allocno_map; - regs_live_in_dest = df_get_live_in (e->dest); - regs_live_out_src = df_get_live_out (e->src); + regs_live_in_dest = df_get_subreg_live_in (e->dest); + regs_live_out_src = df_get_subreg_live_out (e->src); EXECUTE_IF_SET_IN_REG_SET (regs_live_in_dest, FIRST_PSEUDO_REGISTER, regno, bi) if (bitmap_bit_p (regs_live_out_src, regno)) @@ -1229,16 +1229,16 @@ add_ranges_and_copies (void) destination block) to use for searching allocnos by their regnos because of subsequent IR flattening. 
*/ node = IRA_BB_NODE (bb)->parent; - bitmap_copy (live_through, df_get_live_in (bb)); + bitmap_copy (live_through, df_get_subreg_live_in (bb)); add_range_and_copies_from_move_list (at_bb_start[bb->index], node, live_through, REG_FREQ_FROM_BB (bb)); - bitmap_copy (live_through, df_get_live_out (bb)); + bitmap_copy (live_through, df_get_subreg_live_out (bb)); add_range_and_copies_from_move_list (at_bb_end[bb->index], node, live_through, REG_FREQ_FROM_BB (bb)); FOR_EACH_EDGE (e, ei, bb->succs) { - bitmap_and (live_through, - df_get_live_in (e->dest), df_get_live_out (bb)); + bitmap_and (live_through, df_get_subreg_live_in (e->dest), + df_get_subreg_live_out (bb)); add_range_and_copies_from_move_list ((move_t) e->aux, node, live_through, REG_FREQ_FROM_EDGE_FREQ (EDGE_FREQUENCY (e))); diff --git a/gcc/ira-lives.cc b/gcc/ira-lives.cc index e07d3dc3e89..7641184069d 100644 --- a/gcc/ira-lives.cc +++ b/gcc/ira-lives.cc @@ -1254,7 +1254,8 @@ process_out_of_region_eh_regs (basic_block bb) if (! eh_p) return; - EXECUTE_IF_SET_IN_BITMAP (df_get_live_out (bb), FIRST_PSEUDO_REGISTER, i, bi) + EXECUTE_IF_SET_IN_BITMAP (df_get_subreg_live_out (bb), FIRST_PSEUDO_REGISTER, + i, bi) { ira_allocno_t a = ira_curr_regno_allocno_map[i]; for (int n = ALLOCNO_NUM_OBJECTS (a) - 1; n >= 0; n--) @@ -1288,7 +1289,7
[SUBREG V4 2/4] DF: Add DF_LIVE_SUBREG problem
--- gcc/Makefile.in | 1 + gcc/df-problems.cc | 886 ++- gcc/df.h | 159 +++ gcc/regs.h | 5 + gcc/sbitmap.cc | 98 + gcc/sbitmap.h| 2 + gcc/subreg-live-range.cc | 233 ++ gcc/subreg-live-range.h | 60 +++ gcc/timevar.def | 1 + 9 files changed, 1444 insertions(+), 1 deletion(-) create mode 100644 gcc/subreg-live-range.cc create mode 100644 gcc/subreg-live-range.h diff --git a/gcc/Makefile.in b/gcc/Makefile.in index a7f15694c34..67d2e3ca1bc 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1684,6 +1684,7 @@ OBJS = \ store-motion.o \ streamer-hooks.o \ stringpool.o \ + subreg-live-range.o \ substring-locations.o \ target-globals.o \ targhooks.o \ diff --git a/gcc/df-problems.cc b/gcc/df-problems.cc index 88ee0dd67fc..01f1f850925 100644 --- a/gcc/df-problems.cc +++ b/gcc/df-problems.cc @@ -28,6 +28,7 @@ along with GCC; see the file COPYING3. If not see #include "target.h" #include "rtl.h" #include "df.h" +#include "subreg-live-range.h" #include "memmodel.h" #include "tm_p.h" #include "insn-config.h" @@ -1344,8 +1345,891 @@ df_lr_verify_transfer_functions (void) bitmap_clear (&all_blocks); } +/* + REGISTER AND SUBREGS LIVES + Like DF_LR, but include tracking subreg liveness. Currently used to provide + subreg liveness related information to the register allocator. The subreg + information is currently tracked for registers that satisfy the following + conditions: + 1. REG is a pseudo register + 2. MODE_SIZE > UNIT_SIZE + 3. MODE_SIZE is a multiple of UNIT_SIZE + 4. REG is used via subreg pattern + Assuming: MODE = the machine mode of the REG +MODE_SIZE = GET_MODE_SIZE (MODE) +UNIT_SIZE = REGMODE_NATURAL_SIZE (MODE) + Condition 3 is currently strict, maybe it can be removed in the future, but + for now it is sufficient. +*/ + +/* These two empty data are used as default data in case the user does not turn + * on the track-subreg-liveness feature. */ +bitmap_head df_subreg_empty_bitmap; +subregs_live df_subreg_empty_live; + +/* Private data for live_subreg problem. 
*/ +struct df_live_subreg_problem_data +{ + /* Record registers that need to track subreg liveness. */ + bitmap_head tracked_regs; + /* An obstack for the bitmaps we need for this problem. */ + bitmap_obstack live_subreg_bitmaps; +}; + +/* Helper functions. */ + +static df_live_subreg_bb_info * +df_live_subreg_get_bb_info (unsigned int index) +{ + if (index < df_live_subreg->block_info_size) +return &static_cast ( + df_live_subreg->block_info)[index]; + else +return nullptr; +} + +static df_live_subreg_local_bb_info * +get_live_subreg_local_bb_info (unsigned int bb_index) +{ + return df_live_subreg_get_bb_info (bb_index); +} + +/* Return true if regno is a multireg. */ +bool +multireg_p (int regno) +{ + if (regno < FIRST_PSEUDO_REGISTER) +return false; + rtx regno_rtx = regno_reg_rtx[regno]; + machine_mode reg_mode = GET_MODE (regno_rtx); + poly_int64 total_size = GET_MODE_SIZE (reg_mode); + poly_int64 natural_size = REGMODE_NATURAL_SIZE (reg_mode); + return maybe_gt (total_size, natural_size) +&& multiple_p (total_size, natural_size); +} + +/* Return true if the REGNO need be track with subreg liveness. */ + +static bool +need_track_subreg_p (unsigned regno) +{ + auto problem_data += (struct df_live_subreg_problem_data *) df_live_subreg->problem_data; + return bitmap_bit_p (&problem_data->tracked_regs, regno); +} + +/* Fill RANGE with the subreg range for OP in REGMODE_NATURAL_SIZE granularity. + */ +void +init_range (rtx op, sbitmap range) +{ + rtx reg = SUBREG_P (op) ? 
SUBREG_REG (op) : op; + machine_mode reg_mode = GET_MODE (reg); + + if (!read_modify_subreg_p (op)) +{ + bitmap_set_range (range, 0, get_nblocks (reg_mode)); + return; +} + + rtx subreg = op; + machine_mode subreg_mode = GET_MODE (subreg); + poly_int64 offset = SUBREG_BYTE (subreg); + int nblocks = get_nblocks (reg_mode); + poly_int64 unit_size = REGMODE_NATURAL_SIZE (reg_mode); + poly_int64 subreg_size = GET_MODE_SIZE (subreg_mode); + poly_int64 left = offset + subreg_size; + + int subreg_start = -1; + int subreg_nblocks = -1; + for (int i = 0; i < nblocks; i += 1) +{ + poly_int64 right = unit_size * (i + 1); + if (subreg_start < 0 && maybe_lt (offset, right)) + subreg_start = i; + if (subreg_nblocks < 0 && maybe_le (left, right)) + { + subreg_nblocks = i + 1 - subreg_start; + break; + } +} + gcc_assert (subreg_start >= 0 && subreg_nblocks > 0); + + bitmap_set_range (range, subreg_start, subreg_nblocks); +} + +/* Remove R
[SUBREG V4 1/4] DF: Add -ftrack-subreg-liveness option
--- gcc/common.opt | 4 gcc/common.opt.urls | 3 +++ gcc/doc/invoke.texi | 8 gcc/opts.cc | 1 + 4 files changed, 16 insertions(+) diff --git a/gcc/common.opt b/gcc/common.opt index 40cab3cb36a..5710e817abe 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -2163,6 +2163,10 @@ fira-share-spill-slots Common Var(flag_ira_share_spill_slots) Init(1) Optimization Share stack slots for spilled pseudo-registers. +ftrack-subreg-liveness +Common Var(flag_track_subreg_liveness) Init(0) Optimization +Track subreg liveness information. + fira-verbose= Common RejectNegative Joined UInteger Var(flag_ira_verbose) Init(5) -fira-verbose= Control IRA's level of diagnostic messages. diff --git a/gcc/common.opt.urls b/gcc/common.opt.urls index f71ed80a34b..59f27a6f7c6 100644 --- a/gcc/common.opt.urls +++ b/gcc/common.opt.urls @@ -880,6 +880,9 @@ UrlSuffix(gcc/Optimize-Options.html#index-fira-share-save-slots) fira-share-spill-slots UrlSuffix(gcc/Optimize-Options.html#index-fira-share-spill-slots) +ftrack-subreg-liveness +UrlSuffix(gcc/Optimize-Options.html#index-ftrack-subreg-liveness) + fira-verbose= UrlSuffix(gcc/Developer-Options.html#index-fira-verbose) diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index ddcd5213f06..fbcde8aa745 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -13188,6 +13188,14 @@ Disable sharing of stack slots allocated for pseudo-registers. Each pseudo-register that does not get a hard register gets a separate stack slot, and as a result function stack frames are larger. +@opindex ftrack-subreg-liveness +@item -ftrack-subreg-liveness +Enable tracking subreg liveness information. This infomation allows IRA +and LRA to support subreg coalesce feature which can improve the quality +of register allocation. + +This option is enabled at level @option{-O3} for all targets. + @opindex flra-remat @item -flra-remat Enable CFG-sensitive rematerialization in LRA. 
Instead of loading diff --git a/gcc/opts.cc b/gcc/opts.cc index 14d1767e48f..8fe3a213807 100644 --- a/gcc/opts.cc +++ b/gcc/opts.cc @@ -698,6 +698,7 @@ static const struct default_options default_options_table[] = { OPT_LEVELS_3_PLUS, OPT_funswitch_loops, NULL, 1 }, { OPT_LEVELS_3_PLUS, OPT_fvect_cost_model_, NULL, VECT_COST_MODEL_DYNAMIC }, { OPT_LEVELS_3_PLUS, OPT_fversion_loops_for_strides, NULL, 1 }, +{ OPT_LEVELS_3_PLUS, OPT_ftrack_subreg_liveness, NULL, 1 }, /* -O3 parameters. */ { OPT_LEVELS_3_PLUS, OPT__param_max_inline_insns_auto_, NULL, 30 }, -- 2.36.3
[SUBREG V4 4/4] LRA: Apply DF_LIVE_SUBREG data
--- gcc/lra-coalesce.cc| 27 +++- gcc/lra-constraints.cc | 109 ++--- gcc/lra-int.h | 4 + gcc/lra-lives.cc | 357 - gcc/lra-remat.cc | 8 +- gcc/lra-spills.cc | 27 +++- gcc/lra.cc | 10 +- 7 files changed, 430 insertions(+), 112 deletions(-) diff --git a/gcc/lra-coalesce.cc b/gcc/lra-coalesce.cc index a9b5b51cb3f..9416775a009 100644 --- a/gcc/lra-coalesce.cc +++ b/gcc/lra-coalesce.cc @@ -186,19 +186,28 @@ static bitmap_head used_pseudos_bitmap; /* Set up USED_PSEUDOS_BITMAP, and update LR_BITMAP (a BB live info bitmap). */ static void -update_live_info (bitmap lr_bitmap) +update_live_info (bitmap all, bitmap full, bitmap partial) { unsigned int j; bitmap_iterator bi; bitmap_clear (&used_pseudos_bitmap); - EXECUTE_IF_AND_IN_BITMAP (&coalesced_pseudos_bitmap, lr_bitmap, + EXECUTE_IF_AND_IN_BITMAP (&coalesced_pseudos_bitmap, all, FIRST_PSEUDO_REGISTER, j, bi) bitmap_set_bit (&used_pseudos_bitmap, first_coalesced_pseudo[j]); - if (! bitmap_empty_p (&used_pseudos_bitmap)) + if (!bitmap_empty_p (&used_pseudos_bitmap)) { - bitmap_and_compl_into (lr_bitmap, &coalesced_pseudos_bitmap); - bitmap_ior_into (lr_bitmap, &used_pseudos_bitmap); + bitmap_and_compl_into (all, &coalesced_pseudos_bitmap); + bitmap_ior_into (all, &used_pseudos_bitmap); + + if (flag_track_subreg_liveness) + { + bitmap_and_compl_into (full, &coalesced_pseudos_bitmap); + bitmap_ior_and_compl_into (full, &used_pseudos_bitmap, partial); + + bitmap_and_compl_into (partial, &coalesced_pseudos_bitmap); + bitmap_ior_and_compl_into (partial, &used_pseudos_bitmap, full); + } } } @@ -301,8 +310,12 @@ lra_coalesce (void) bitmap_initialize (&used_pseudos_bitmap, ®_obstack); FOR_EACH_BB_FN (bb, cfun) { - update_live_info (df_get_live_in (bb)); - update_live_info (df_get_live_out (bb)); + update_live_info (df_get_subreg_live_in (bb), + df_get_subreg_live_full_in (bb), + df_get_subreg_live_partial_in (bb)); + update_live_info (df_get_subreg_live_out (bb), + df_get_subreg_live_full_out (bb), + df_get_subreg_live_partial_out 
(bb)); FOR_BB_INSNS_SAFE (bb, insn, next) if (INSN_P (insn) && bitmap_bit_p (&involved_insns_bitmap, INSN_UID (insn))) diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc index e945a4da451..c9246e6be58 100644 --- a/gcc/lra-constraints.cc +++ b/gcc/lra-constraints.cc @@ -6565,34 +6565,86 @@ update_ebb_live_info (rtx_insn *head, rtx_insn *tail) { if (prev_bb != NULL) { - /* Update df_get_live_in (prev_bb): */ + /* Update subreg live (prev_bb): */ + bitmap subreg_all_in = df_get_subreg_live_in (prev_bb); + bitmap subreg_full_in = df_get_subreg_live_full_in (prev_bb); + bitmap subreg_partial_in = df_get_subreg_live_partial_in (prev_bb); + subregs_live *range_in = df_get_subreg_live_range_in (prev_bb); EXECUTE_IF_SET_IN_BITMAP (&check_only_regs, 0, j, bi) if (bitmap_bit_p (&live_regs, j)) - bitmap_set_bit (df_get_live_in (prev_bb), j); - else - bitmap_clear_bit (df_get_live_in (prev_bb), j); + { + bitmap_set_bit (subreg_all_in, j); + if (flag_track_subreg_liveness) + { + bitmap_set_bit (subreg_full_in, j); + if (bitmap_bit_p (subreg_partial_in, j)) + { + bitmap_clear_bit (subreg_partial_in, j); + range_in->remove_range (j); + } + } + } + else if (bitmap_bit_p (subreg_all_in, j)) + { + bitmap_clear_bit (subreg_all_in, j); + if (flag_track_subreg_liveness) + { + bitmap_clear_bit (subreg_full_in, j); + if (bitmap_bit_p (subreg_partial_in, j)) + { + bitmap_clear_bit (subreg_partial_in, j); + range_in->remove_range (j); + } + } + } } + bitmap subreg_all_out = df_get_subreg_live_out (curr_bb); if (curr_bb != last_bb) { - /* Update df_get_live_out (curr_bb): */ + /* Update subreg live (curr_bb): */ + bitmap subreg_full_out = df_get_subreg_live_full_out (curr_bb); + bitmap subreg_partial_out = df_get_subreg_live_partial_out (curr_bb); + subregs_live *range_out = df_get_subreg_liv
[SUBREG V4 0/4] Add DF_LIVE_SUBREG data and apply to IRA and LRA
V3: Address comments from Dimitar Dimitrov V4: Move detailed functions from subreg-live-range.h to subreg-live-range.cc. These patches add a new data flow problem, DF_LIVE_SUBREG, which tracks subreg liveness and then applies it to the IRA and LRA passes (enabled via -O3 or -ftrack-subreg-liveness). These patches are for GCC 15. This code is also pushed to the devel/subreg-coalesce branch. In addition, my colleague Shuo Chen will also be involved in some of the remaining work; thank you for your support. These patches are separated from the subreg-coalesce patches submitted a few months ago. I refactored the code according to comments. The next patches will support subreg coalescing based on them. Here are some data about build time of SPEC INT 2017 (x86-64 target): baseline baseline(+track-subreg-liveness) specint2017 build time : 1892s 1883s Regarding build times, I've run it a few times, but they all seem to take slightly less time. Since the difference is small, it's possible that it's just a change in environment. But it's theoretically possible, since supporting subreg liveness could have reduced the number of live regs. For memory usage, I tried PR 69609 under valgrind; peak memory size grew from 2003910656 to 2003947520, a very small increase. Note that these patches don't enable register coalescing with subreg liveness in IRA/LRA, so no performance change is expected. We will enable register coalescing with subreg liveness tracking in the follow-up patches. Bootstrapped and regtested on x86-64 with no regression. 
Co-authored-by: Lehua Ding Juzhe-Zhong (4): DF: Add -ftrack-subreg-liveness option DF: Add DF_LIVE_SUBREG problem IRA: Apply DF_LIVE_SUBREG data LRA: Apply DF_LIVE_SUBREG data gcc/Makefile.in | 1 + gcc/common.opt | 4 + gcc/common.opt.urls | 3 + gcc/df-problems.cc | 886 ++- gcc/df.h | 159 +++ gcc/doc/invoke.texi | 8 + gcc/ira-build.cc | 7 +- gcc/ira-color.cc | 8 +- gcc/ira-emit.cc | 12 +- gcc/ira-lives.cc | 7 +- gcc/ira.cc | 19 +- gcc/lra-coalesce.cc | 27 +- gcc/lra-constraints.cc | 109 - gcc/lra-int.h| 4 + gcc/lra-lives.cc | 357 gcc/lra-remat.cc | 8 +- gcc/lra-spills.cc| 27 +- gcc/lra.cc | 10 +- gcc/opts.cc | 1 + gcc/regs.h | 5 + gcc/sbitmap.cc | 98 + gcc/sbitmap.h| 2 + gcc/subreg-live-range.cc | 233 ++ gcc/subreg-live-range.h | 60 +++ gcc/timevar.def | 1 + 25 files changed, 1920 insertions(+), 136 deletions(-) create mode 100644 gcc/subreg-live-range.cc create mode 100644 gcc/subreg-live-range.h -- 2.36.3
[SUBREG V3 2/4] DF: Add DF_LIVE_SUBREG problem
This patch add a new DF problem, named DF_LIVE_SUBREG. This problem is extended from the DF_LR problem and support track the subreg liveness of multireg pseudo if these pseudo satisfy the following conditions: 1. the mode size greater than it's REGMODE_NATURAL_SIZE. 2. the reg is used in insns via subreg pattern. The main methods are as follows: 1. split bitmap in/out/def/use fileds to full_in/out/def/use and partial_in/out/def/use. If a pseudo need to be tracked it's subreg liveness, then it is recorded in partial_in/out/def/use fileds. Meantimes, there are range_in/out/def/use fileds which records the live range of the tracked pseudo. 2. in the df_live_subreg_finalize function, we move the tracked pseudo from the partial_in/out/def/use to full_in/out/def/use if the pseudo's live range is full. Co-authored-by: Lehua Ding gcc/ChangeLog: * Makefile.in: Add subreg-live-range object file. * df-problems.cc (struct df_live_subreg_problem_data): New struct. (df_live_subreg_get_bb_info): New function. (get_live_subreg_local_bb_info): Ditto. (multireg_p): Ditto. (need_track_subreg_p): Ditto. (init_range): Ditto. (remove_subreg_range): Ditto. (add_subreg_range_to_def): Ditto. (add_subreg_range_to_use): Ditto. (df_live_subreg_free_bb_info): Ditto. (df_live_subreg_alloc): Ditto. (df_live_subreg_reset): Ditto. (df_live_subreg_bb_local_compute): Ditto. (df_live_subreg_local_compute): Ditto. (df_live_subreg_init): Ditto. (df_live_subreg_check_result): Ditto. (df_live_subreg_confluence_0): Ditto. (df_live_subreg_confluence_n): Ditto. (df_live_subreg_transfer_function): Ditto. (df_live_subreg_finalize): Ditto. (df_live_subreg_free): Ditto. (df_live_subreg_top_dump): Ditto. (df_live_subreg_bottom_dump): Ditto. (df_live_subreg_add_problem): Ditto. * df.h (enum df_problem_id): New enum. (class subregs_live): New class. (class df_live_subreg_local_bb_info): Ditto. (class df_live_subreg_bb_info): Ditto. (df_live_subreg): New function. (df_live_subreg_add_problem): Ditto. 
(df_live_subreg_finalize): Ditto. (df_live_subreg_check_result): Ditto. (multireg_p): Ditto. (init_range): Ditto. (add_subreg_range_to_def): Ditto. (add_subreg_range_to_use): Ditto. (remove_subreg_range): Ditto. (df_get_subreg_live_in): Ditto. (df_get_subreg_live_out): Ditto. (df_get_subreg_live_full_in): Ditto. (df_get_subreg_live_full_out): Ditto. (df_get_subreg_live_partial_in): Ditto. (df_get_subreg_live_partial_out): Ditto. (df_get_subreg_live_range_in): Ditto. (df_get_subreg_live_range_out): Ditto. * regs.h (get_nblocks): New macro. * sbitmap.cc (bitmap_full_p): New function. (bitmap_same_p): Ditto. (test_full): Ditto. (test_same): Ditto. (sbitmap_cc_tests): Ditto. * sbitmap.h (bitmap_full_p): Ditto. (bitmap_same_p): Ditto. * timevar.def (TV_DF_LIVE_SUBREG): New timer stat. * subreg-live-range.cc: New file. * subreg-live-range.h: New file. --- gcc/Makefile.in | 1 + gcc/df-problems.cc | 886 ++- gcc/df.h | 159 +++ gcc/regs.h | 5 + gcc/sbitmap.cc | 98 + gcc/sbitmap.h| 2 + gcc/subreg-live-range.cc | 53 +++ gcc/subreg-live-range.h | 206 + gcc/timevar.def | 1 + 9 files changed, 1410 insertions(+), 1 deletion(-) create mode 100644 gcc/subreg-live-range.cc create mode 100644 gcc/subreg-live-range.h diff --git a/gcc/Makefile.in b/gcc/Makefile.in index ecd51146357..11722506018 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1683,6 +1683,7 @@ OBJS = \ store-motion.o \ streamer-hooks.o \ stringpool.o \ + subreg-live-range.o \ substring-locations.o \ target-globals.o \ targhooks.o \ diff --git a/gcc/df-problems.cc b/gcc/df-problems.cc index 88ee0dd67fc..01f1f850925 100644 --- a/gcc/df-problems.cc +++ b/gcc/df-problems.cc @@ -28,6 +28,7 @@ along with GCC; see the file COPYING3. 
If not see #include "target.h" #include "rtl.h" #include "df.h" +#include "subreg-live-range.h" #include "memmodel.h" #include "tm_p.h" #include "insn-config.h" @@ -1344,8 +1345,891 @@ df_lr_verify_transfer_functions (void) bitmap_clear (&all_blocks); } +/* + REGISTER AND SUBREGS LIVES + Like DF_LR, but include tracking subreg liveness. Currently used to provide + subreg liveness related information to the register allocator. The subreg + information is currently tracked for registers that satisf
[SUBREG V3 0/4] Add DF_LIVE_SUBREG data and apply to IRA and LRA
V3: Address comments from Dimitar Dimitrov These patches add a new data flow problem, DF_LIVE_SUBREG, which tracks subreg liveness and then applies it to the IRA and LRA passes (enabled via -O3 or -ftrack-subreg-liveness). These patches are for GCC 15. This code is also pushed to the devel/subreg-coalesce branch. In addition, my colleague Shuo Chen will also be involved in some of the remaining work; thank you for your support. These patches are separated from the subreg-coalesce patches submitted a few months ago. I refactored the code according to comments. The next patches will support subreg coalescing based on them. Here are some data about build time of SPEC INT 2017 (x86-64 target): baseline baseline(+track-subreg-liveness) specint2017 build time : 1892s 1883s Regarding build times, I've run it a few times, but they all seem to take slightly less time. Since the difference is small, it's possible that it's just a change in environment. But it's theoretically possible, since supporting subreg liveness could have reduced the number of live regs. For memory usage, I tried PR 69609 under valgrind; peak memory size grew from 2003910656 to 2003947520, a very small increase. Note that these patches don't enable register coalescing with subreg liveness in IRA/LRA, so no performance change is expected. We will enable register coalescing with subreg liveness tracking in the follow-up patches. Bootstrapped and regtested on x86-64 with no regression. 
Co-authored-by: Lehua Ding Juzhe-Zhong (4): DF: Add -ftrack-subreg-liveness option DF: Add DF_LIVE_SUBREG problem IRA: Add DF_LIVE_SUBREG problem LRA: Apply DF_LIVE_SUBREG data gcc/Makefile.in | 1 + gcc/common.opt | 4 + gcc/common.opt.urls | 3 + gcc/df-problems.cc | 886 ++- gcc/df.h | 159 +++ gcc/doc/invoke.texi | 8 + gcc/ira-build.cc | 7 +- gcc/ira-color.cc | 8 +- gcc/ira-emit.cc | 12 +- gcc/ira-lives.cc | 7 +- gcc/ira.cc | 19 +- gcc/lra-coalesce.cc | 27 +- gcc/lra-constraints.cc | 109 - gcc/lra-int.h| 4 + gcc/lra-lives.cc | 357 gcc/lra-remat.cc | 8 +- gcc/lra-spills.cc| 27 +- gcc/lra.cc | 10 +- gcc/opts.cc | 1 + gcc/regs.h | 5 + gcc/sbitmap.cc | 98 + gcc/sbitmap.h| 2 + gcc/subreg-live-range.cc | 53 +++ gcc/subreg-live-range.h | 206 + gcc/timevar.def | 1 + 25 files changed, 1886 insertions(+), 136 deletions(-) create mode 100644 gcc/subreg-live-range.cc create mode 100644 gcc/subreg-live-range.h -- 2.36.3
[SUBREG V3 4/4] LRA: Apply DF_LIVE_SUBREG data
This patch apply the DF_LIVE_SUBREG to LRA pass. More changes were made to the LRA than the IRA since the LRA will modify the DF data directly. The main big changes are centered on the lra-lives.cc file. Co-authored-by: Lehua Ding gcc/ChangeLog: * lra-coalesce.cc (update_live_info): Apply DF_LIVE_SUBREG data. (lra_coalesce): Ditto. * lra-constraints.cc (update_ebb_live_info): Ditto. (get_live_on_other_edges): Ditto. (inherit_in_ebb): Ditto. (lra_inheritance): Ditto. (fix_bb_live_info): Ditto. (remove_inheritance_pseudos): Ditto. * lra-int.h (GCC_LRA_INT_H): Ditto. (struct lra_insn_reg): Ditto. * lra-lives.cc (class bb_data_pseudos): Ditto. (need_track_subreg_p): New function. (make_hard_regno_live): Ditto (make_hard_regno_dead): Ditto. (mark_regno_live): Apply DF_LIVE_SUBREG data. (mark_regno_dead): Ditto. (live_trans_fun): Ditto. (live_con_fun_0): Ditto. (live_con_fun_n): Ditto. (initiate_live_solver): Ditto. (finish_live_solver): Ditto. (process_bb_lives): Ditto. (lra_create_live_ranges_1): Ditto. * lra-remat.cc (dump_candidates_and_remat_bb_data): Ditto. (calculate_livein_cands): Ditto. (do_remat): Ditto. * lra-spills.cc (spill_pseudos): Ditto. * lra.cc (new_insn_reg): Ditto. (add_regs_to_insn_regno_info): Ditto. --- gcc/lra-coalesce.cc| 27 +++- gcc/lra-constraints.cc | 109 ++--- gcc/lra-int.h | 4 + gcc/lra-lives.cc | 357 - gcc/lra-remat.cc | 8 +- gcc/lra-spills.cc | 27 +++- gcc/lra.cc | 10 +- 7 files changed, 430 insertions(+), 112 deletions(-) diff --git a/gcc/lra-coalesce.cc b/gcc/lra-coalesce.cc index a9b5b51cb3f..9416775a009 100644 --- a/gcc/lra-coalesce.cc +++ b/gcc/lra-coalesce.cc @@ -186,19 +186,28 @@ static bitmap_head used_pseudos_bitmap; /* Set up USED_PSEUDOS_BITMAP, and update LR_BITMAP (a BB live info bitmap). 
*/ static void -update_live_info (bitmap lr_bitmap) +update_live_info (bitmap all, bitmap full, bitmap partial) { unsigned int j; bitmap_iterator bi; bitmap_clear (&used_pseudos_bitmap); - EXECUTE_IF_AND_IN_BITMAP (&coalesced_pseudos_bitmap, lr_bitmap, + EXECUTE_IF_AND_IN_BITMAP (&coalesced_pseudos_bitmap, all, FIRST_PSEUDO_REGISTER, j, bi) bitmap_set_bit (&used_pseudos_bitmap, first_coalesced_pseudo[j]); - if (! bitmap_empty_p (&used_pseudos_bitmap)) + if (!bitmap_empty_p (&used_pseudos_bitmap)) { - bitmap_and_compl_into (lr_bitmap, &coalesced_pseudos_bitmap); - bitmap_ior_into (lr_bitmap, &used_pseudos_bitmap); + bitmap_and_compl_into (all, &coalesced_pseudos_bitmap); + bitmap_ior_into (all, &used_pseudos_bitmap); + + if (flag_track_subreg_liveness) + { + bitmap_and_compl_into (full, &coalesced_pseudos_bitmap); + bitmap_ior_and_compl_into (full, &used_pseudos_bitmap, partial); + + bitmap_and_compl_into (partial, &coalesced_pseudos_bitmap); + bitmap_ior_and_compl_into (partial, &used_pseudos_bitmap, full); + } } } @@ -301,8 +310,12 @@ lra_coalesce (void) bitmap_initialize (&used_pseudos_bitmap, ®_obstack); FOR_EACH_BB_FN (bb, cfun) { - update_live_info (df_get_live_in (bb)); - update_live_info (df_get_live_out (bb)); + update_live_info (df_get_subreg_live_in (bb), + df_get_subreg_live_full_in (bb), + df_get_subreg_live_partial_in (bb)); + update_live_info (df_get_subreg_live_out (bb), + df_get_subreg_live_full_out (bb), + df_get_subreg_live_partial_out (bb)); FOR_BB_INSNS_SAFE (bb, insn, next) if (INSN_P (insn) && bitmap_bit_p (&involved_insns_bitmap, INSN_UID (insn))) diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc index 5b78fd0b7e5..effb5d8484c 100644 --- a/gcc/lra-constraints.cc +++ b/gcc/lra-constraints.cc @@ -6554,34 +6554,86 @@ update_ebb_live_info (rtx_insn *head, rtx_insn *tail) { if (prev_bb != NULL) { - /* Update df_get_live_in (prev_bb): */ + /* Update subreg live (prev_bb): */ + bitmap subreg_all_in = df_get_subreg_live_in (prev_bb); + 
bitmap subreg_full_in = df_get_subreg_live_full_in (prev_bb); + bitmap subreg_partial_in = df_get_subreg_live_partial_in (prev_bb); + subregs_live *range_in = df_get_subreg_live_range_in (prev_bb); EXECUTE_IF_SET_IN_BITMAP (&check_only_regs, 0, j, bi) if (bitmap_bit_p (&live_regs, j)) - bitmap_set_bit (df_get_live_in (prev_bb), j); - else - bitmap_clear_bit (df_get_live_in (prev_bb), j); + { + bitmap_
[SUBREG V3 3/4] IRA: Add DF_LIVE_SUBREG problem
This patch simple replace df_get_live_in to df_get_subreg_live_in and replace df_get_live_out to df_get_subreg_live_out. Co-authored-by: Lehua Ding gcc/ChangeLog: * ira-build.cc (create_bb_allocnos): Apply DF_LIVE_SUBREG data. (create_loop_allocnos): Diito. * ira-color.cc (ira_loop_edge_freq): Diito. * ira-emit.cc (generate_edge_moves): Diito. (add_ranges_and_copies): Diito. * ira-lives.cc (process_out_of_region_eh_regs): Diito. (add_conflict_from_region_landing_pads): Diito. (process_bb_node_lives): Diito. * ira.cc (find_moveable_pseudos): Diito. (interesting_dest_for_shprep_1): Diito. (allocate_initial_values): Diito. (ira): Diito. --- gcc/ira-build.cc | 7 --- gcc/ira-color.cc | 8 gcc/ira-emit.cc | 12 ++-- gcc/ira-lives.cc | 7 --- gcc/ira.cc | 19 --- 5 files changed, 30 insertions(+), 23 deletions(-) diff --git a/gcc/ira-build.cc b/gcc/ira-build.cc index ea593d5a087..283ff36d3dd 100644 --- a/gcc/ira-build.cc +++ b/gcc/ira-build.cc @@ -1921,7 +1921,8 @@ create_bb_allocnos (ira_loop_tree_node_t bb_node) create_insn_allocnos (PATTERN (insn), NULL, false); /* It might be a allocno living through from one subloop to another. 
*/ - EXECUTE_IF_SET_IN_REG_SET (df_get_live_in (bb), FIRST_PSEUDO_REGISTER, i, bi) + EXECUTE_IF_SET_IN_REG_SET (df_get_subreg_live_in (bb), FIRST_PSEUDO_REGISTER, +i, bi) if (ira_curr_regno_allocno_map[i] == NULL) ira_create_allocno (i, false, ira_curr_loop_tree_node); } @@ -1937,9 +1938,9 @@ create_loop_allocnos (edge e) bitmap_iterator bi; ira_loop_tree_node_t parent; - live_in_regs = df_get_live_in (e->dest); + live_in_regs = df_get_subreg_live_in (e->dest); border_allocnos = ira_curr_loop_tree_node->border_allocnos; - EXECUTE_IF_SET_IN_REG_SET (df_get_live_out (e->src), + EXECUTE_IF_SET_IN_REG_SET (df_get_subreg_live_out (e->src), FIRST_PSEUDO_REGISTER, i, bi) if (bitmap_bit_p (live_in_regs, i)) { diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc index b9ae32d1b4d..bfebc48ef83 100644 --- a/gcc/ira-color.cc +++ b/gcc/ira-color.cc @@ -2786,8 +2786,8 @@ ira_loop_edge_freq (ira_loop_tree_node_t loop_node, int regno, bool exit_p) FOR_EACH_EDGE (e, ei, loop_node->loop->header->preds) if (e->src != loop_node->loop->latch && (regno < 0 - || (bitmap_bit_p (df_get_live_out (e->src), regno) - && bitmap_bit_p (df_get_live_in (e->dest), regno + || (bitmap_bit_p (df_get_subreg_live_out (e->src), regno) + && bitmap_bit_p (df_get_subreg_live_in (e->dest), regno freq += EDGE_FREQUENCY (e); } else @@ -2795,8 +2795,8 @@ ira_loop_edge_freq (ira_loop_tree_node_t loop_node, int regno, bool exit_p) auto_vec edges = get_loop_exit_edges (loop_node->loop); FOR_EACH_VEC_ELT (edges, i, e) if (regno < 0 - || (bitmap_bit_p (df_get_live_out (e->src), regno) - && bitmap_bit_p (df_get_live_in (e->dest), regno))) + || (bitmap_bit_p (df_get_subreg_live_out (e->src), regno) + && bitmap_bit_p (df_get_subreg_live_in (e->dest), regno))) freq += EDGE_FREQUENCY (e); } diff --git a/gcc/ira-emit.cc b/gcc/ira-emit.cc index d347f11fa02..8075b082e36 100644 --- a/gcc/ira-emit.cc +++ b/gcc/ira-emit.cc @@ -510,8 +510,8 @@ generate_edge_moves (edge e) return; src_map = src_loop_node->regno_allocno_map; dest_map 
= dest_loop_node->regno_allocno_map; - regs_live_in_dest = df_get_live_in (e->dest); - regs_live_out_src = df_get_live_out (e->src); + regs_live_in_dest = df_get_subreg_live_in (e->dest); + regs_live_out_src = df_get_subreg_live_out (e->src); EXECUTE_IF_SET_IN_REG_SET (regs_live_in_dest, FIRST_PSEUDO_REGISTER, regno, bi) if (bitmap_bit_p (regs_live_out_src, regno)) @@ -1229,16 +1229,16 @@ add_ranges_and_copies (void) destination block) to use for searching allocnos by their regnos because of subsequent IR flattening. */ node = IRA_BB_NODE (bb)->parent; - bitmap_copy (live_through, df_get_live_in (bb)); + bitmap_copy (live_through, df_get_subreg_live_in (bb)); add_range_and_copies_from_move_list (at_bb_start[bb->index], node, live_through, REG_FREQ_FROM_BB (bb)); - bitmap_copy (live_through, df_get_live_out (bb)); + bitmap_copy (live_through, df_get_subreg_live_out (bb)); add_range_and_copies_from_move_list (at_bb_end[bb->index], node, live_through, REG_FREQ_FROM_BB (bb)); FOR_EACH_EDGE (e, ei, bb->succs) { - bitmap_and (live_through, - df_get_live_in (e->dest), df_get_live_out (bb)); + bitmap_and (live_through, df_get_subreg_live_in (e->dest), + df_get_subreg_l
[SUBREG V3 1/4] DF: Add -ftrack-subreg-liveness option
Add new flag -ftrack-subreg-liveness to enable track-subreg-liveness. This flag is enabled at -O3/fast. Co-authored-by: Lehua Ding gcc/ChangeLog: * common.opt: Add -ftrack-subreg-liveness option. * common.opt.urls: Ditto. * doc/invoke.texi: Ditto. * opts.cc: Ditto. --- gcc/common.opt | 4 gcc/common.opt.urls | 3 +++ gcc/doc/invoke.texi | 8 gcc/opts.cc | 1 + 4 files changed, 16 insertions(+) diff --git a/gcc/common.opt b/gcc/common.opt index 40cab3cb36a..5710e817abe 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -2163,6 +2163,10 @@ fira-share-spill-slots Common Var(flag_ira_share_spill_slots) Init(1) Optimization Share stack slots for spilled pseudo-registers. +ftrack-subreg-liveness +Common Var(flag_track_subreg_liveness) Init(0) Optimization +Track subreg liveness information. + fira-verbose= Common RejectNegative Joined UInteger Var(flag_ira_verbose) Init(5) -fira-verbose= Control IRA's level of diagnostic messages. diff --git a/gcc/common.opt.urls b/gcc/common.opt.urls index f71ed80a34b..59f27a6f7c6 100644 --- a/gcc/common.opt.urls +++ b/gcc/common.opt.urls @@ -880,6 +880,9 @@ UrlSuffix(gcc/Optimize-Options.html#index-fira-share-save-slots) fira-share-spill-slots UrlSuffix(gcc/Optimize-Options.html#index-fira-share-spill-slots) +ftrack-subreg-liveness +UrlSuffix(gcc/Optimize-Options.html#index-ftrack-subreg-liveness) + fira-verbose= UrlSuffix(gcc/Developer-Options.html#index-fira-verbose) diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index ddcd5213f06..fbcde8aa745 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -13188,6 +13188,14 @@ Disable sharing of stack slots allocated for pseudo-registers. Each pseudo-register that does not get a hard register gets a separate stack slot, and as a result function stack frames are larger. +@opindex ftrack-subreg-liveness +@item -ftrack-subreg-liveness +Enable tracking subreg liveness information. 
This information allows IRA +and LRA to support subreg coalesce feature which can improve the quality +of register allocation.
[PATCH] RISC-V: Fix infinite compilation of VSETVL PASS
This patch fixes issue reported by Jeff. Testing is running. Ok for trunk if I passed the testing with no regression ? gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pre_vsetvl::emit_vsetvl): Fix inifinite compilation. (pre_vsetvl::remove_vsetvl_pre_insns): Ditto. --- gcc/config/riscv/riscv-vsetvl.cc | 9 - 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 2c0dcdf18c5..32f262de199 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -2281,9 +2281,8 @@ private: } } - void remove_vsetvl_insn (const vsetvl_info &info) + void remove_vsetvl_insn (rtx_insn *rinsn) { -rtx_insn *rinsn = info.get_insn ()->rtl (); if (dump_file) { fprintf (dump_file, " Eliminate insn %d:\n", INSN_UID (rinsn)); @@ -3231,7 +3230,7 @@ pre_vsetvl::emit_vsetvl () if (curr_info.delete_p ()) { if (vsetvl_insn_p (insn->rtl ())) - remove_vsetvl_insn (curr_info); + remove_vsetvl_insn (curr_info.get_insn ()->rtl ()); continue; } else if (curr_info.valid_p ()) @@ -3269,7 +3268,7 @@ pre_vsetvl::emit_vsetvl () for (const vsetvl_info &item : m_delete_list) { gcc_assert (vsetvl_insn_p (item.get_insn ()->rtl ())); - remove_vsetvl_insn (item); + remove_vsetvl_insn (item.get_insn ()->rtl ()); } /* Insert vsetvl info that was not deleted after lift up. */ @@ -3434,7 +3433,7 @@ pre_vsetvl::remove_vsetvl_pre_insns () INSN_UID (rinsn)); print_rtl_single (dump_file, rinsn); } - remove_insn (rinsn); + remove_vsetvl_insn (rinsn); } } -- 2.36.3
[PATCH] RISC-V: Expand VLMAX scalar move in reduction
This patch fixes the following: vsetvli a5,a1,e32,m1,tu,ma sllia4,a5,2 sub a1,a1,a5 vle32.v v2,0(a0) add a0,a0,a4 vadd.vv v1,v2,v1 bne a1,zero,.L3 vsetivlizero,1,e32,m1,ta,ma vmv.s.x v2,zero vsetvli a5,zero,e32,m1,ta,ma ---> Redundant vsetvl. vredsum.vs v1,v1,v2 vmv.x.s a0,v1 ret VSETVL PASS is able to fuse avl = 1 of scalar move and VLMAX avl of reduction. However, this following RTL blocks the fusion in dependence analysis in VSETVL PASS: (insn 49 24 50 5 (set (reg:RVVM1SI 98 v2 [148]) (if_then_else:RVVM1SI (unspec:RVVMF32BI [ (const_vector:RVVMF32BI [ (const_int 1 [0x1]) repeat [ (const_int 0 [0]) ] ]) (const_int 1 [0x1]) (const_int 2 [0x2]) repeated x2 (const_int 0 [0]) (reg:SI 66 vl) (reg:SI 67 vtype) ] UNSPEC_VPREDICATE) (const_vector:RVVM1SI repeat [ (const_int 0 [0]) ]) (unspec:RVVM1SI [ (reg:DI 0 zero) ] UNSPEC_VUNDEF))) 3813 {*pred_broadcastrvvm1si_zero} (nil)) (insn 50 49 51 5 (set (reg:DI 15 a5 [151]) > It set a5, blocks the following VLMAX into the scalar move above. (unspec:DI [ (const_int 32 [0x20]) ] UNSPEC_VLMAX)) 2566 {vlmax_avldi} (expr_list:REG_EQUIV (unspec:DI [ (const_int 32 [0x20]) ] UNSPEC_VLMAX) (nil))) (insn 51 50 52 5 (set (reg:RVVM1SI 97 v1 [150]) (unspec:RVVM1SI [ (unspec:RVVMF32BI [ (const_vector:RVVMF32BI repeat [ (const_int 1 [0x1]) ]) (reg:DI 15 a5 [151]) (const_int 2 [0x2]) (const_int 1 [0x1]) (reg:SI 66 vl) (reg:SI 67 vtype) ] UNSPEC_VPREDICATE) (unspec:RVVM1SI [ (reg:RVVM1SI 97 v1 [orig:134 vect_result_14.6 ] [134]) (reg:RVVM1SI 98 v2 [148]) ] UNSPEC_REDUC_SUM) (unspec:RVVM1SI [ (reg:DI 0 zero) ] UNSPEC_VUNDEF) ] UNSPEC_REDUC)) 17541 {pred_redsumrvvm1si} (expr_list:REG_DEAD (reg:RVVM1SI 98 v2 [148]) (expr_list:REG_DEAD (reg:SI 66 vl) (expr_list:REG_DEAD (reg:DI 15 a5 [151]) (expr_list:REG_DEAD (reg:DI 0 zero) (nil)) Such situation can only happen on auto-vectorization, never happen on intrinsic codes. 
Since the reduction is passed VLMAX AVL, it should be more natural to pass VLMAX to the scalar move which initial the value of the reduction. After this patch: vsetvli a5,a1,e32,m1,tu,ma sllia4,a5,2 sub a1,a1,a5 vle32.v v2,0(a0) add a0,a0,a4 vadd.vv v1,v2,v1 bne a1,zero,.L3 vsetvli a5,zero,e32,m1,ta,ma vmv.s.x v2,zero vredsum.vs v1,v1,v2 vmv.x.s a0,v1 ret Tested on both RV32/RV64 no regression. PR target/113697 gcc/ChangeLog: * config/riscv/riscv-v.cc (expand_reduction): Pass VLMAX avl to scalar move. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/pr113697.c: New test. --- gcc/config/riscv/riscv-v.cc| 12 +++- .../gcc.target/riscv/rvv/autovec/pr113697.c| 14 ++ 2 files changed, 21 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113697.c diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index 4bacb7fea45..0cfbd21ce6f 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -4151,13 +4151,15 @@ expand_reduction (unsigned unspec, unsigned insn_flags, rtx *ops, rtx init) rtx m1_tmp = gen_reg_rtx (m1_mode); rtx scalar_move_ops[] = {m1_tmp, init}; - emit_nonvlmax_insn (code_for_pred_broadcast (m1_mode), SCALAR_MOVE_OP, - scalar_move_ops, - need_mask_operand_p (insn_flags) ? ops[3] - : CONST1_RTX (Pmode)); + insn_code icode = code_for_pred_broadcast (m1_mode); + if (need_mask_operand_p (insn_flags)) +emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, ops[3]); + else +emit_vlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops); + rtx m1_tmp2 = gen_reg_rtx (m1_mode); rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp}; - insn_code icode = code_for_pred (unspec, vmode); +
[PATCH] RISC-V: Allow LICM hoist POLY_INT configuration code sequence
Realize in recent benchmark evaluation (coremark-pro zip-test): vid.v v2 vmv.v.i v5,0 .L9: vle16.v v3,0(a4) vrsub.vxv4,v2,a6 ---> LICM failed to hoist it outside the loop. The root cause is: (insn 56 47 57 4 (set (subreg:DI (reg:HI 220) 0) (reg:DI 223)) "rvv.c":11:9 208 {*movdi_64bit} -> Its result used by the following vrsub.vx then supress the hoist of the vrsub.vx (nil)) (insn 57 56 59 4 (set (reg:RVVMF2HI 216) (if_then_else:RVVMF2HI (unspec:RVVMF32BI [ (const_vector:RVVMF32BI repeat [ (const_int 1 [0x1]) ]) (reg:DI 350) (const_int 2 [0x2]) repeated x2 (const_int 1 [0x1]) (reg:SI 66 vl) (reg:SI 67 vtype) ] UNSPEC_VPREDICATE) (minus:RVVMF2HI (vec_duplicate:RVVMF2HI (reg:HI 220)) (reg:RVVMF2HI 217)) (unspec:RVVMF2HI [ (reg:DI 0 zero) ] UNSPEC_VUNDEF))) "rvv.c":11:9 6938 {pred_subrvvmf2hi_reverse_scalar} (expr_list:REG_DEAD (reg:HI 220) (nil))) This patch fixes it generate (set (reg:HI) (subreg:HI (reg:DI))) instead of (set (subreg:DI (reg:DI)) (reg:DI)). After this patch: vid.v v2 vrsub.vxv2,v2,a7 vmv.v.i v4,0 .L3: vle16.v v3,0(a4) Tested on both RV32 and RV64 no regression. gcc/ChangeLog: * config/riscv/riscv.cc (riscv_legitimize_move): Fix poly_int dest generation. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/poly_licm-1.c: New test. * gcc.target/riscv/rvv/autovec/poly_licm-2.c: New test. --- gcc/config/riscv/riscv.cc | 9 --- .../riscv/rvv/autovec/poly_licm-1.c | 18 + .../riscv/rvv/autovec/poly_licm-2.c | 27 +++ 3 files changed, 50 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-1.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-2.c diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 529ef5e84b7..6e22b43e618 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -2711,16 +2711,17 @@ riscv_legitimize_move (machine_mode mode, rtx dest, rtx src) (const_poly_int:HI [m, n]) (const_poly_int:SI [m, n]). 
*/ rtx tmp = gen_reg_rtx (Pmode); - riscv_legitimize_poly_move (Pmode, gen_lowpart (Pmode, dest), tmp, - src); + rtx tmp2 = gen_reg_rtx (Pmode); + riscv_legitimize_poly_move (Pmode, tmp2, tmp, src); + emit_move_insn (dest, gen_lowpart (mode, tmp2)); } else { /* In RV32 system, handle (const_poly_int:SI [m, n]) (const_poly_int:DI [m, n]). In RV64 system, handle (const_poly_int:DI [m, n]). - FIXME: Maybe we could gen SImode in RV32 and then sign-extend to DImode, - the offset should not exceed 4GiB in general. */ +FIXME: Maybe we could gen SImode in RV32 and then sign-extend to +DImode, the offset should not exceed 4GiB in general. */ rtx tmp = gen_reg_rtx (mode); riscv_legitimize_poly_move (mode, dest, tmp, src); } diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-1.c new file mode 100644 index 000..b7da65f0996 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -fno-schedule-insns -fno-schedule-insns2" } */ + +extern int wsize; + +typedef unsigned short Posf; +#define NIL 0 + +void foo (Posf *p) +{ + register unsigned n, m; + do { + m = *--p; + *p = (Posf)(m >= wsize ? 
m-wsize : NIL); + } while (--n); +} + +/* { dg-final { scan-assembler-times {vid\.v\s+v[0-9]+\s+addi\s+\s*[a-x0-9]+,\s*[a-x0-9]+,\s*-1\s+vrsub\.vx\s+} 1 } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-2.c new file mode 100644 index 000..ffb3c63149f --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-2.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -fno-schedule-insns -fno-schedule-insns2" } */ + +typedef unsigned short uint16_t; + +void AAA (uint16_t *x, uint16_t *y, unsigned wsize, unsigned count) +{ + unsigned m = 0, n = count; + register uint16_t *p; + + p = x; + + do { +m = *--p; +*p = (uint16_t)(m >= wsize ? m-wsize : 0); + } while (--n); + + n = wsize; + p = y; + + do { + m = *--p; +
[PATCH] RISC-V: Remove vsetvl_pre bogus instructions in VSETVL PASS
I realize there is a RTL regression between GCC-14 and GCC-13. https://godbolt.org/z/Ga7K6MqaT GCC-14: (insn 9 13 31 2 (set (reg:DI 15 a5 [138]) (unspec:DI [ (const_int 64 [0x40]) ] UNSPEC_VLMAX)) "/app/example.c":5:15 2566 {vlmax_avldi} (expr_list:REG_EQUIV (unspec:DI [ (const_int 64 [0x40]) ] UNSPEC_VLMAX) (nil))) (insn 31 9 10 2 (parallel [ (set (reg:DI 15 a5 [138]) (unspec:DI [ (reg:DI 0 zero) (const_int 32 [0x20]) (const_int 7 [0x7]) (const_int 1 [0x1]) repeated x2 ] UNSPEC_VSETVL)) (set (reg:SI 66 vl) (unspec:SI [ (reg:DI 0 zero) (const_int 32 [0x20]) (const_int 7 [0x7]) ] UNSPEC_VSETVL)) (set (reg:SI 67 vtype) (unspec:SI [ (const_int 32 [0x20]) (const_int 7 [0x7]) (const_int 1 [0x1]) repeated x2 ] UNSPEC_VSETVL)) ]) "/app/example.c":5:15 3281 {vsetvldi} (nil)) GCC-13: (insn 10 7 26 2 (set (reg/f:DI 11 a1 [139]) (plus:DI (reg:DI 11 a1 [142]) (const_int 800 [0x320]))) "/app/example.c":6:32 5 {adddi3} (nil)) (insn 26 10 9 2 (parallel [ (set (reg:DI 15 a5) (unspec:DI [ (reg:DI 0 zero) (const_int 32 [0x20]) (const_int 7 [0x7]) (const_int 1 [0x1]) repeated x2 ] UNSPEC_VSETVL)) (set (reg:SI 66 vl) (unspec:SI [ (reg:DI 0 zero) (const_int 32 [0x20]) (const_int 7 [0x7]) ] UNSPEC_VSETVL)) (set (reg:SI 67 vtype) (unspec:SI [ (const_int 32 [0x20]) (const_int 7 [0x7]) (const_int 1 [0x1]) repeated x2 ] UNSPEC_VSETVL)) ]) "/app/example.c":5:15 792 {vsetvldi} (nil)) GCC-13 doesn't have: (insn 9 13 31 2 (set (reg:DI 15 a5 [138]) (unspec:DI [ (const_int 64 [0x40]) ] UNSPEC_VLMAX)) "/app/example.c":5:15 2566 {vlmax_avldi} (expr_list:REG_EQUIV (unspec:DI [ (const_int 64 [0x40]) ] UNSPEC_VLMAX) (nil))) vsetvl_pre doesn't emit any assembler which is just used for occupying scalar register. It should be removed in VSETVL PASS. Tested on both RV32 and RV64 no regression. gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (vsetvl_pre_insn_p): New function. (pre_vsetvl::cleaup): Remove vsetvl_pre. (pre_vsetvl::remove_vsetvl_pre_insns): New function. 
gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/vsetvl_pre-1.c: New test. --- gcc/config/riscv/riscv-vsetvl.cc | 64 +++ .../riscv/rvv/vsetvl/vsetvl_pre-1.c | 12 2 files changed, 76 insertions(+) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/vsetvl_pre-1.c diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 28b7534d970..4732d4fc77f 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -315,6 +315,48 @@ vsetvl_insn_p (rtx_insn *rinsn) || INSN_CODE (rinsn) == CODE_FOR_vsetvlsi); } +/* Return true if it is the bogus vsetvl_pre instruction: + + (define_insn "@vlmax_avl" + [(set (match_operand:P 0 "register_operand" "=r") + (unspec:P [(match_operand:P 1 "const_int_operand" "i")] UNSPEC_VLMAX))] + "TARGET_VECTOR" + "" + [(set_attr "type" "vsetvl_pre")]) + + As described above, it's the bogus instruction which doesn't any assembler + and should be removed eventually. It's used for occupying a scalar register + for VLMAX avl RVV instruction before register allocation. + + Before RA: + + ... + vsetvl_pre (set r136) + vadd.vv (use r136 with VLMAX avl) + ... + + After RA: + + ... + vsetvl_pre (set a5) + vadd.vv (use r136 with VLMAX avl) + ... + + VSETVL PASS: + + ... + vsetvl_pre (set a5) ---> removed. + vsetvl a5,zero,... ---> Inserted. + vadd.vv + ... +*/ +static bool +vsetvl_pre_insn_p (rtx_insn *rinsn) +{ + return recog_memoized (rinsn) >= 0 +&& get_attr_type (rinsn) == TYPE_VSETVL_PRE; +} + /* Return true if it is vsetvl zero, rs1. */ static bool vsetvl_discard_result_insn_p (rtx_insn *rinsn) @@ -2376,6 +2418,7 @@ public: void cleaup (); void remove_avl_operand (); void remove_unused_dest_operand (); + void remove_vsetvl_pre_insns (); void dump (FILE *file, const char *title) const { @@ -3332,6 +3375,7 @@ pre_vsetvl::cleaup () { remove_avl_operand (
[PATCH v2] RISC-V: Suppress the vsetvl fusion for conflict successors
Update in v2: Add dump information. This patch fixes the following ineffective vsetvl insertion: #include "riscv_vector.h" void f (int32_t * restrict in, int32_t * restrict out, size_t n, size_t cond, size_t cond2) { for (size_t i = 0; i < n; i++) { if (i == cond) { vint8mf8_t v = *(vint8mf8_t*)(in + i + 100); *(vint8mf8_t*)(out + i + 100) = v; } else if (i == cond2) { vfloat32mf2_t v = *(vfloat32mf2_t*)(in + i + 200); *(vfloat32mf2_t*)(out + i + 200) = v; } else if (i == (cond2 - 1)) { vuint16mf2_t v = *(vuint16mf2_t*)(in + i + 300); *(vuint16mf2_t*)(out + i + 300) = v; } else { vint8mf4_t v = *(vint8mf4_t*)(in + i + 400); *(vint8mf4_t*)(out + i + 400) = v; } } } Before this patch: f: .LFB0: .cfi_startproc beq a2,zero,.L12 addia7,a0,400 addia6,a1,400 addia0,a0,1600 addia1,a1,1600 li a5,0 addit6,a4,-1 vsetvli t3,zero,e8,mf8,ta,ma ---> ineffective uplift .L7: beq a3,a5,.L15 beq a4,a5,.L16 beq t6,a5,.L17 vsetvli t1,zero,e8,mf4,ta,ma vle8.v v1,0(a0) vse8.v v1,0(a1) vsetvli t3,zero,e8,mf8,ta,ma .L4: addia5,a5,1 addia7,a7,4 addia6,a6,4 addia0,a0,4 addia1,a1,4 bne a2,a5,.L7 .L12: ret .L15: vle8.v v1,0(a7) vse8.v v1,0(a6) j .L4 .L17: vsetvli t1,zero,e8,mf4,ta,ma addit5,a0,-400 addit4,a1,-400 vle16.v v1,0(t5) vse16.v v1,0(t4) vsetvli t3,zero,e8,mf8,ta,ma j .L4 .L16: addit5,a0,-800 addit4,a1,-800 vle32.v v1,0(t5) vse32.v v1,0(t4) j .L4 It's obvious that we are hoisting the e8mf8 vsetvl to the top. It's ineffective since e8mf8 comes from low probability block which is if (i == cond). For this case, we disable such fusion. 
After this patch: f: beq a2,zero,.L12 addia7,a0,400 addia6,a1,400 addia0,a0,1600 addia1,a1,1600 li a5,0 addit6,a4,-1 .L7: beq a3,a5,.L15 beq a4,a5,.L16 beq t6,a5,.L17 vsetvli t1,zero,e8,mf4,ta,ma vle8.v v1,0(a0) vse8.v v1,0(a1) .L4: addia5,a5,1 addia7,a7,4 addia6,a6,4 addia0,a0,4 addia1,a1,4 bne a2,a5,.L7 .L12: ret .L15: vsetvli t3,zero,e8,mf8,ta,ma vle8.v v1,0(a7) vse8.v v1,0(a6) j .L4 .L17: addit5,a0,-400 addit4,a1,-400 vsetvli t1,zero,e8,mf4,ta,ma vle16.v v1,0(t5) vse16.v v1,0(t4) j .L4 .L16: addit5,a0,-800 addit4,a1,-800 vsetvli t3,zero,e32,mf2,ta,ma vle32.v v1,0(t5) vse32.v v1,0(t4) j .L4 Tested on both RV32/RV64 no regression. Ok for trunk ? PR target/113696 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pre_vsetvl::earliest_fuse_vsetvl_info): Suppress vsetvl fusion. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/pr113696.c: New test. --- gcc/config/riscv/riscv-vsetvl.cc | 25 ++ .../gcc.target/riscv/rvv/vsetvl/pr113696.c| 26 +++ 2 files changed, 51 insertions(+) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113696.c diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index cec862329c5..28b7534d970 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -2959,6 +2959,31 @@ pre_vsetvl::earliest_fuse_vsetvl_info (int iter) src_block_info.set_empty_info (); src_block_info.probability = profile_probability::uninitialized (); + /* See PR113696, we should reset immediate dominator to +empty since we may uplift ineffective vsetvl which +locate at low probability block. */ + basic_block dom + = get_immediate_dominator (CDI_DOMINATORS, eg->src); + auto &dom_block_info = get_block_info (dom); + if (dom_block_info.has_info () + && !m_dem.compatible_p ( + dom_block_info.get_exit_info (), curr_info)) + { + dom_block_info.set_empty_info (); + dom_block_info.probability + = profile_probability::uninitialized (); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, +
[PATCH] RISC-V: Disable the vsetvl fusion for conflict successors
This patch fixes the following ineffective vsetvl insertion: #include "riscv_vector.h" void f (int32_t * restrict in, int32_t * restrict out, size_t n, size_t cond, size_t cond2) { for (size_t i = 0; i < n; i++) { if (i == cond) { vint8mf8_t v = *(vint8mf8_t*)(in + i + 100); *(vint8mf8_t*)(out + i + 100) = v; } else if (i == cond2) { vfloat32mf2_t v = *(vfloat32mf2_t*)(in + i + 200); *(vfloat32mf2_t*)(out + i + 200) = v; } else if (i == (cond2 - 1)) { vuint16mf2_t v = *(vuint16mf2_t*)(in + i + 300); *(vuint16mf2_t*)(out + i + 300) = v; } else { vint8mf4_t v = *(vint8mf4_t*)(in + i + 400); *(vint8mf4_t*)(out + i + 400) = v; } } } Before this patch: f: .LFB0: .cfi_startproc beq a2,zero,.L12 addia7,a0,400 addia6,a1,400 addia0,a0,1600 addia1,a1,1600 li a5,0 addit6,a4,-1 vsetvli t3,zero,e8,mf8,ta,ma ---> ineffective uplift .L7: beq a3,a5,.L15 beq a4,a5,.L16 beq t6,a5,.L17 vsetvli t1,zero,e8,mf4,ta,ma vle8.v v1,0(a0) vse8.v v1,0(a1) vsetvli t3,zero,e8,mf8,ta,ma .L4: addia5,a5,1 addia7,a7,4 addia6,a6,4 addia0,a0,4 addia1,a1,4 bne a2,a5,.L7 .L12: ret .L15: vle8.v v1,0(a7) vse8.v v1,0(a6) j .L4 .L17: vsetvli t1,zero,e8,mf4,ta,ma addit5,a0,-400 addit4,a1,-400 vle16.v v1,0(t5) vse16.v v1,0(t4) vsetvli t3,zero,e8,mf8,ta,ma j .L4 .L16: addit5,a0,-800 addit4,a1,-800 vle32.v v1,0(t5) vse32.v v1,0(t4) j .L4 It's obvious that we are hoisting the e8mf8 vsetvl to the top. It's ineffective since e8mf8 comes from low probability block which is if (i == cond). For this case, we disable such fusion. 
After this patch: f: beq a2,zero,.L12 addia7,a0,400 addia6,a1,400 addia0,a0,1600 addia1,a1,1600 li a5,0 addit6,a4,-1 .L7: beq a3,a5,.L15 beq a4,a5,.L16 beq t6,a5,.L17 vsetvli t1,zero,e8,mf4,ta,ma vle8.v v1,0(a0) vse8.v v1,0(a1) .L4: addia5,a5,1 addia7,a7,4 addia6,a6,4 addia0,a0,4 addia1,a1,4 bne a2,a5,.L7 .L12: ret .L15: vsetvli t3,zero,e8,mf8,ta,ma vle8.v v1,0(a7) vse8.v v1,0(a6) j .L4 .L17: addit5,a0,-400 addit4,a1,-400 vsetvli t1,zero,e8,mf4,ta,ma vle16.v v1,0(t5) vse16.v v1,0(t4) j .L4 .L16: addit5,a0,-800 addit4,a1,-800 vsetvli t3,zero,e32,mf2,ta,ma vle32.v v1,0(t5) vse32.v v1,0(t4) j .L4 Tested on both RV32/RV64 no regression. Ok for trunk ? PR target/113696 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pre_vsetvl::earliest_fuse_vsetvl_info): Suppress vsetvl fusion. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/pr113696.c: New test. --- gcc/config/riscv/riscv-vsetvl.cc | 14 ++ .../gcc.target/riscv/rvv/vsetvl/pr113696.c| 26 +++ 2 files changed, 40 insertions(+) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113696.c diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index cec862329c5..79fc2ec2401 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -2959,6 +2959,20 @@ pre_vsetvl::earliest_fuse_vsetvl_info (int iter) src_block_info.set_empty_info (); src_block_info.probability = profile_probability::uninitialized (); + /* See PR113696, we should reset immediate dominator to +empty since we may uplift ineffective vsetvl which +locate at low probability block. */ + basic_block dom + = get_immediate_dominator (CDI_DOMINATORS, eg->src); + auto &dom_block_info = get_block_info (dom); + if (dom_block_info.has_info () + && !m_dem.compatible_p ( + dom_block_info.get_exit_info (), curr_info)) + { + dom_block_info.set_empty_info (); + dom_block_info.probability + = profile_probability::uninitialized (); + } changed = true; } /* Choose the one with higher probability. 
*/ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113696.c b
[PATCH] middle-end: Enhance conditional reduction vectorization by re-association in ifcvt [PR109088]
This patch targets GCC-15. Consider this following case: unsigned int single_loop_with_if_condition (unsigned int *restrict a, unsigned int *restrict b, unsigned int *restrict c, unsigned int loop_size) { unsigned int result = 0; for (unsigned int i = 0; i < loop_size; i++) { if (a[i] > b[i]) { result += a[i] + 0xa - c[i]; } } return result; } After investigation of LLVM, I find LLVM re-associate such case to make it easier to be vectorized. Take RISC-V ASM as example. Before this patch: beq a3,zero,.L5 sllia5,a3,32 srlia3,a5,30 mv a4,a0 add a7,a0,a3 li a0,0 .L4: lw a3,0(a4) addiw a5,a0,10 lw a6,0(a1) addia4,a4,4 addwa5,a5,a3 bgeua6,a3,.L3 lw a0,0(a2) subwa0,a5,a0 .L3: addia1,a1,4 addia2,a2,4 bne a7,a4,.L4 ret .L5: li a0,0 ret After this patch: beq a3,zero,.L4 sllia3,a3,32 srlia3,a3,32 vsetvli a5,zero,e32,m1,ta,ma vmv.v.i v2,0 .L3: vsetvli a5,a3,e32,m1,tu,mu sllia4,a5,2 sub a3,a3,a5 vle32.v v3,0(a0) vle32.v v0,0(a1) add a0,a0,a4 vmsgtu.vv v0,v3,v0 add a1,a1,a4 vle32.v v1,0(a2),v0.t add a2,a2,a4 vadd.vi v1,v1,-10 vsub.vv v1,v1,v3 vadd.vv v2,v2,v1,v0.t bne a3,zero,.L3 li a5,0 vsetivlizero,1,e32,m1,ta,ma vmv.s.x v1,a5 vsetvli a5,zero,e32,m1,ta,ma vredsum.vs v2,v2,v1 vmv.x.s a0,v2 ret PR middle-end/109088 gcc/ChangeLog: * tree-if-conv.cc (is_cond_scalar_reduction): Enhance conditional reduction. (convert_scalar_cond_reduction): Ditto. gcc/testsuite/ChangeLog: * gcc.dg/vect/pr109088-1.c: New test. * gcc.dg/vect/pr109088-2.c: New test. * gcc.dg/vect/pr109088-3.c: New test. * gcc.dg/vect/pr109088-4.c: New test. * gcc.dg/vect/pr109088-5.c: New test. 
--- gcc/testsuite/gcc.dg/vect/pr109088-1.c | 201 gcc/testsuite/gcc.dg/vect/pr109088-2.c | 202 gcc/testsuite/gcc.dg/vect/pr109088-3.c | 314 + gcc/testsuite/gcc.dg/vect/pr109088-4.c | 84 +++ gcc/testsuite/gcc.dg/vect/pr109088-5.c | 96 gcc/tree-if-conv.cc| 150 +++- 6 files changed, 1042 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/pr109088-1.c create mode 100644 gcc/testsuite/gcc.dg/vect/pr109088-2.c create mode 100644 gcc/testsuite/gcc.dg/vect/pr109088-3.c create mode 100644 gcc/testsuite/gcc.dg/vect/pr109088-4.c create mode 100644 gcc/testsuite/gcc.dg/vect/pr109088-5.c diff --git a/gcc/testsuite/gcc.dg/vect/pr109088-1.c b/gcc/testsuite/gcc.dg/vect/pr109088-1.c new file mode 100644 index 000..6772e908535 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr109088-1.c @@ -0,0 +1,201 @@ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target vect_condition } */ + +#include "tree-vect.h" + +#define N 27 + +#define COND_REDUC(NAME, TYPE, OP) \ + TYPE __attribute__ ((noipa)) \ + cond_##NAME (TYPE *__restrict a, int *__restrict cond1, \ + int *__restrict cond2, TYPE init, int n)\ + { \ +TYPE result = init; \ +for (int i = 0; i < n; i++) \ + if (cond1[i] > cond2[i]) \ + result OP a[i];\ +return result; \ + } + +COND_REDUC (reduc_sum_char, char, +=) +COND_REDUC (reduc_sum_short, short, +=) +COND_REDUC (reduc_sum_int, int, +=) +COND_REDUC (reduc_sum_long, long, +=) +COND_REDUC (reduc_and_char, char, &=) +COND_REDUC (reduc_and_short, short, &=) +COND_REDUC (reduc_and_int, int, &=) +COND_REDUC (reduc_and_long, long, &=) +COND_REDUC (reduc_ior_char, char, |=) +COND_REDUC (reduc_ior_short, short, |=) +COND_REDUC (reduc_ior_int, int, |=) +COND_REDUC (reduc_ior_long, long, |=) +COND_REDUC (reduc_xor_char, char, ^=) +COND_REDUC (reduc_xor_short, short, ^=) +COND_REDUC (reduc_xor_int, int, ^=) +COND_REDUC (reduc_xor_long, long, ^=) + +int +main (void) +{ + check_vect (); + int cond1[N] = {11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 
3, 4, + 5, 6, 7, 8, 9, 10, 21, 22, 23, 24, 25, 26, 27}; + int cond2[N] = {15, 5, 6, 7, 8,
[Committed] RISC-V: Fix regression
Due to recent middle-end loop vectorizer changes, these tests regressed, and the changes are reasonable.
scan-assembler-times {\tvsra\.vv} 4 } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mod-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mod-1.c index 57bbf8fbc68..17d2784b90d 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mod-1.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mod-1.c @@ -53,5 +53,5 @@ DEF_OP_VV (mod, 128, int64_t, %) DEF_OP_VV (mod, 256, int64_t, %) DEF_OP_VV (mod, 512, int64_t, %) -/* { dg-final { scan-assembler-times {vremu?\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 42 } } */ +/* { dg-final { scan-assembler-times {vremu?\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 47 } } */ /* { dg-final { scan-assembler-not {csrr} } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-1.c index cb5a1dbc9ff..ee8da2573c7 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-1.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-1.c @@ -53,5 +53,5 @@ DEF_OP_VV (shift, 128, int64_t, >>) DEF_OP_VV (shift, 256, int64_t, >>) DEF_OP_VV (shift, 512, int64_t, >>) -/* { dg-final { scan-assembler-times {vsra\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 42 } } */ +/* { dg-final { scan-assembler-times {vsra\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 35 } } */ /* { dg-final { scan-assembler-not {csrr} } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-2.c index e626a52c2d8..ebd5575f267 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-2.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-2.c @@ -53,5 +53,5 @@ DEF_OP_VV (shift, 128, uint64_t, >>) DEF_OP_VV (shift, 256, uint64_t, >>) DEF_OP_VV (shift, 512, uint64_t, >>) -/* { dg-final { scan-assembler-times {vsrl\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 42 } } */ +/* { dg-final { scan-assembler-times {vsrl\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */ /* { dg-final { scan-assembler-not {csrr} } } */ -- 2.36.1
[PATCH] RISC-V: Fix VSETLV PASS compile-time issue
The compile time issue was discovered in SPEC 2017 wrf: Use time and -ftime-report to analyze the profile data of SPEC 2017 wrf compilation . Before this patch (Lazy vsetvl): scheduling : 121.89 ( 15%) 0.53 ( 11%) 122.72 ( 15%) 13M ( 1%) machine dep reorg : 424.61 ( 53%) 1.84 ( 37%) 427.44 ( 53%) 5290k ( 0%) real13m27.074s user13m19.539s sys 0m5.180s Simple vsetvl: machine dep reorg : 0.10 ( 0%) 0.00 ( 0%) 0.11 ( 0%) 4138k ( 0%) real6m5.780s user6m2.396s sys 0m2.373s The machine dep reorg is the compile time of VSETVL PASS (424 seconds) which counts 53% of the compilation time, spends much more time than scheduling. After investigation, the critical patch of VSETVL pass is compute_lcm_local_properties which is called every iteration of phase 2 (earliest fusion) and phase 3 (global lcm). This patch optimized the codes of compute_lcm_local_properties to reduce the compilation time. After this patch: scheduling : 117.51 ( 27%) 0.21 ( 6%) 118.04 ( 27%) 13M ( 1%) machine dep reorg : 80.13 ( 18%) 0.91 ( 26%) 81.26 ( 18%) 5290k ( 0%) real7m25.374s user7m20.116s sys 0m3.795s The optimization of this patch is very obvious, lazy VSETVL PASS: 424s (53%) -> 80s (18%) which spend less time than scheduling. Tested on both RV32 and RV64 no regression. Ok for trunk ? PR target/113495 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (extract_single_source): Remove. (pre_vsetvl::compute_vsetvl_def_data): Fix compile time issue. (pre_vsetvl::compute_transparent): New function. (pre_vsetvl::compute_lcm_local_properties): Fix compile time time issue. 
--- gcc/config/riscv/riscv-vsetvl.cc | 184 ++- 1 file changed, 60 insertions(+), 124 deletions(-) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index d7b40a5c813..cec862329c5 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -599,14 +599,6 @@ extract_single_source (set_info *set) return first_insn; } -static insn_info * -extract_single_source (def_info *def) -{ - if (!def) -return nullptr; - return extract_single_source (dyn_cast (def)); -} - static bool same_equiv_note_p (set_info *set1, set_info *set2) { @@ -2374,6 +2366,7 @@ public: } void compute_vsetvl_def_data (); + void compute_transparent (const bb_info *); void compute_lcm_local_properties (); void fuse_local_vsetvl_info (); @@ -2452,20 +2445,16 @@ pre_vsetvl::compute_vsetvl_def_data () { for (unsigned i = 0; i < m_vsetvl_def_exprs.length (); i += 1) { - const vsetvl_info &info = *m_vsetvl_def_exprs[i]; - if (!info.has_nonvlmax_reg_avl ()) - continue; - unsigned int regno; - sbitmap_iterator sbi; - EXECUTE_IF_SET_IN_BITMAP (m_reg_def_loc[bb->index ()], 0, regno, - sbi) - if (regno == REGNO (info.get_avl ())) - { - bitmap_set_bit (m_kill[bb->index ()], i); - bitmap_set_bit (def_loc[bb->index ()], - get_expr_index (m_vsetvl_def_exprs, - m_unknow_info)); - } + auto *info = m_vsetvl_def_exprs[i]; + if (info->has_nonvlmax_reg_avl () + && bitmap_bit_p (m_reg_def_loc[bb->index ()], + REGNO (info->get_avl ( + { + bitmap_set_bit (m_kill[bb->index ()], i); + bitmap_set_bit (def_loc[bb->index ()], + get_expr_index (m_vsetvl_def_exprs, + m_unknow_info)); + } } continue; } @@ -2516,6 +2505,36 @@ pre_vsetvl::compute_vsetvl_def_data () sbitmap_vector_free (m_kill); } +/* Subroutine of compute_lcm_local_properties which Compute local transparent + BB. 
Note that the compile time is very sensitive to compute_transparent and + compute_lcm_local_properties, any change of these 2 functions should be + aware of the compile time changing of the program which has a large number of + blocks, e.g SPEC 2017 wrf. + + Current compile time profile of SPEC 2017 wrf: + + 1. scheduling - 27% + 2. machine dep reorg (VSETVL PASS) - 18% + + VSETVL pass should not spend more time than scheduling in compilation. */ +void +pre_vsetvl::compute_transparent (const bb_info *bb) +{ + int num_exprs = m_exprs.length (); + unsigned bb_index = bb->index (); + for (int i = 0; i < num_exprs; i++) +{ + auto *info = m_exprs[i]; + if (info->has_nonvlmax_reg_avl () + && bitmap_bit_p (m_reg_def_loc[bb_index], REGNO (
[Committed] RISC-V: Refine some code of VSETVL PASS [NFC]
gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pre_vsetvl::earliest_fuse_vsetvl_info): Refine some codes. (pre_vsetvl::emit_vsetvl): Ditto. --- gcc/config/riscv/riscv-vsetvl.cc | 69 +--- 1 file changed, 27 insertions(+), 42 deletions(-) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 1a398f02596..d7b40a5c813 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -2930,28 +2930,19 @@ pre_vsetvl::earliest_fuse_vsetvl_info (int iter) EXECUTE_IF_SET_IN_BITMAP (e, 0, expr_index, sbi) { vsetvl_info &curr_info = *m_exprs[expr_index]; - if (!curr_info.valid_p ()) - continue; - edge eg = INDEX_EDGE (m_edges, ed); - if (eg->probability == profile_probability::never ()) - continue; - if (eg->src == ENTRY_BLOCK_PTR_FOR_FN (cfun) - || eg->dest == EXIT_BLOCK_PTR_FOR_FN (cfun)) - continue; - - /* When multiple set bits in earliest edge, such edge may -have infinite loop in preds or succs or multiple conflict -vsetvl expression which make such edge is unrelated. We -don't perform fusion for such situation. */ - if (bitmap_count_bits (e) != 1) - continue; - vsetvl_block_info &src_block_info = get_block_info (eg->src); vsetvl_block_info &dest_block_info = get_block_info (eg->dest); - if (src_block_info.probability - == profile_probability::uninitialized ()) + if (!curr_info.valid_p () + || eg->probability == profile_probability::never () + || src_block_info.probability + == profile_probability::uninitialized () + /* When multiple set bits in earliest edge, such edge may +have infinite loop in preds or succs or multiple conflict +vsetvl expression which make such edge is unrelated. We +don't perform fusion for such situation. 
*/ + || bitmap_count_bits (e) != 1) continue; if (src_block_info.empty_p ()) @@ -3058,29 +3049,27 @@ pre_vsetvl::earliest_fuse_vsetvl_info (int iter) { vsetvl_info &prev_info = src_block_info.get_exit_info (); if (!prev_info.valid_p () - || m_dem.available_p (prev_info, curr_info)) + || m_dem.available_p (prev_info, curr_info) + || !m_dem.compatible_p (prev_info, curr_info)) continue; - if (m_dem.compatible_p (prev_info, curr_info)) + if (dump_file && (dump_flags & TDF_DETAILS)) { - if (dump_file && (dump_flags & TDF_DETAILS)) - { - fprintf (dump_file, "Fuse curr info since prev info " - "compatible with it:\n"); - fprintf (dump_file, " prev_info: "); - prev_info.dump (dump_file, ""); - fprintf (dump_file, " curr_info: "); - curr_info.dump (dump_file, ""); - } - m_dem.merge (prev_info, curr_info); - if (dump_file && (dump_flags & TDF_DETAILS)) - { - fprintf (dump_file, " prev_info after fused: "); - prev_info.dump (dump_file, ""); - fprintf (dump_file, "\n"); - } - changed = true; + fprintf (dump_file, "Fuse curr info since prev info " + "compatible with it:\n"); + fprintf (dump_file, " prev_info: "); + prev_info.dump (dump_file, ""); + fprintf (dump_file, " curr_info: "); + curr_info.dump (dump_file, ""); + } + m_dem.merge (prev_info, curr_info); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, " prev_info after fused: "); + prev_info.dump (dump_file, ""); + fprintf (dump_file, "\n"); } + changed = true; } } } @@ -3344,15 +,11 @@ pre_vsetvl::emit_vsetvl () { edge eg = INDEX_EDGE (m_edges, ed); sbitmap i = m_insert[ed]; - if (bitmap_count_bits (i) < 1) - continue; - - if (bitmap_count_bits (i) > 1) + if (bitmap_count_bits (i) != 1) /* For code with infinite loop (e.g. pr61634.c), The data flow is completely wrong. */ continue; - gcc_assert (bitmap_count_bits (i) == 1); unsigned expr_index = bitmap_first_set_bit (i); const vsetvl_info &info = *m_exprs[expr_index]; gcc_assert (info.valid_p ()); -
[Committed V2] RISC-V: Fix incorrect LCM delete bug [VSETVL PASS]
This patch fixes the recent noticed bug in RV32 glibc. We incorrectly deleted a vsetvl: ... and a4,a4,a3 vmv.v.i v1,0 ---> Missed vsetvl cause illegal instruction report. vse8.v v1,0(a5) The root cause the laterin in LCM is incorrect. BB 358: avloc: n_bits = 2, set = {} kill: n_bits = 2, set = {} antloc: n_bits = 2, set = {} transp: n_bits = 2, set = {} avin: n_bits = 2, set = {} avout: n_bits = 2, set = {} del: n_bits = 2, set = {} cause LCM let BB 360 delete the vsetvl: BB 360: avloc: n_bits = 2, set = {} kill: n_bits = 2, set = {} antloc: n_bits = 2, set = {} transp: n_bits = 2, set = {0 1 } avin: n_bits = 2, set = {} avout: n_bits = 2, set = {} del: n_bits = 2, set = {1} Also, remove unknown vsetvl info into local computation since it is unnecessary. Tested on both RV32/RV64 no regression. PR target/113469 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pre_vsetvl::compute_lcm_local_properties): Fix bug. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/pr113469.c: New test. --- gcc/config/riscv/riscv-vsetvl.cc | 19 +++ .../gcc.target/riscv/rvv/autovec/pr113469.c | 54 +++ 2 files changed, 64 insertions(+), 9 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index da258b964fc..1a398f02596 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -2543,8 +2543,10 @@ pre_vsetvl::compute_lcm_local_properties () vsetvl_info &header_info = block_info.get_entry_info (); vsetvl_info &footer_info = block_info.get_exit_info (); gcc_assert (footer_info.valid_p () || footer_info.unknown_p ()); - add_expr (m_exprs, header_info); - add_expr (m_exprs, footer_info); + if (header_info.valid_p ()) + add_expr (m_exprs, header_info); + if (footer_info.valid_p ()) + add_expr (m_exprs, footer_info); } int num_exprs = m_exprs.length (); @@ -2699,13 +2701,6 @@ pre_vsetvl::compute_lcm_local_properties () } } - for (const bb_info *bb : 
crtl->ssa->bbs ()) -{ - unsigned bb_index = bb->index (); - bitmap_ior (m_kill[bb_index], m_transp[bb_index], m_avloc[bb_index]); - bitmap_not (m_kill[bb_index], m_kill[bb_index]); -} - for (const bb_info *bb : crtl->ssa->bbs ()) { unsigned bb_index = bb->index (); @@ -2714,6 +2709,12 @@ pre_vsetvl::compute_lcm_local_properties () bitmap_clear (m_antloc[bb_index]); bitmap_clear (m_transp[bb_index]); } + /* Compute ae_kill for each basic block using: + +~(TRANSP | COMP) + */ + bitmap_ior (m_kill[bb_index], m_transp[bb_index], m_avloc[bb_index]); + bitmap_not (m_kill[bb_index], m_kill[bb_index]); } } diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c new file mode 100644 index 000..d1c118c02d6 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c @@ -0,0 +1,54 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3 -fno-vect-cost-model" } */ + +struct a { + int b; + int c : 1; + int : 1; +} d(); +typedef struct +{ + int e; + struct { + int f; + }; +} g; +int i; +char k, l, n; +void *m; +char *o; +void h(); +char *j(); +void p(int buf, __builtin_va_list ab, int q) { + do { + void *r[] = {&&s, &&t, &&u, &&v, &&w}; + int c; + goto *m; + s: + c = 1; + while (1) { + t: + u: + ae: + void *af = __builtin_va_arg(ab, void *); + h(p); + o = j(i); + if (o == 0) + goto ae; + l = 'S'; + break; + v: + g ah; + __builtin_memset(&ah, '\0', sizeof(g)); + h(n, __builtin_va_arg(ab, int), &ah); + break; + w: + if (__builtin_expect(q, 0)) + c = 0; + struct a ai = {'S', c}; + d(buf, ai, af); + } + } while (k); +} + +/* { dg-final { scan-assembler-times {vsetivli\tzero,\s*4,\s*e8,\s*mf4,\s*t[au],\s*m[au]} 2 } } */ -- 2.36.3
[PATCH] RISC-V: Fix incorrect LCM delete bug [VSETVL PASS]
This patch fixes the recent noticed bug in RV32 glibc. We incorrectly deleted a vsetvl: ... and a4,a4,a3 vmv.v.i v1,0 ---> Missed vsetvl cause illegal instruction report. vse8.v v1,0(a5) The root cause the laterin in LCM is incorrect. BB 358: avloc: n_bits = 2, set = {} kill: n_bits = 2, set = {} antloc: n_bits = 2, set = {} transp: n_bits = 2, set = {} avin: n_bits = 2, set = {} avout: n_bits = 2, set = {} del: n_bits = 2, set = {} cause LCM let BB 360 delete the vsetvl: BB 360: avloc: n_bits = 2, set = {} kill: n_bits = 2, set = {} antloc: n_bits = 2, set = {} transp: n_bits = 2, set = {0 1 } avin: n_bits = 2, set = {} avout: n_bits = 2, set = {} del: n_bits = 2, set = {1} Also, remove unknown vsetvl info into local computation since it is unnecessary. Tested on both RV32/RV64 no regression. PR target/113469 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pre_vsetvl::compute_lcm_local_properties): Fix bug. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/pr113469.c: New test. --- gcc/config/riscv/riscv-vsetvl.cc | 21 +- .../gcc.target/riscv/rvv/autovec/pr113469.c | 1841 + 2 files changed, 1853 insertions(+), 9 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index da258b964fc..f300f00e62a 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -2543,8 +2543,10 @@ pre_vsetvl::compute_lcm_local_properties () vsetvl_info &header_info = block_info.get_entry_info (); vsetvl_info &footer_info = block_info.get_exit_info (); gcc_assert (footer_info.valid_p () || footer_info.unknown_p ()); - add_expr (m_exprs, header_info); - add_expr (m_exprs, footer_info); + if (header_info.valid_p ()) + add_expr (m_exprs, header_info); + if (footer_info.valid_p ()) + add_expr (m_exprs, footer_info); } int num_exprs = m_exprs.length (); @@ -2699,13 +2701,6 @@ pre_vsetvl::compute_lcm_local_properties () } } - for (const bb_info *bb : 
crtl->ssa->bbs ()) -{ - unsigned bb_index = bb->index (); - bitmap_ior (m_kill[bb_index], m_transp[bb_index], m_avloc[bb_index]); - bitmap_not (m_kill[bb_index], m_kill[bb_index]); -} - for (const bb_info *bb : crtl->ssa->bbs ()) { unsigned bb_index = bb->index (); @@ -2713,8 +2708,16 @@ pre_vsetvl::compute_lcm_local_properties () { bitmap_clear (m_antloc[bb_index]); bitmap_clear (m_transp[bb_index]); + bitmap_clear (m_avloc[bb_index]); } } + + for (const bb_info *bb : crtl->ssa->bbs ()) +{ + unsigned bb_index = bb->index (); + bitmap_ior (m_kill[bb_index], m_transp[bb_index], m_avloc[bb_index]); + bitmap_not (m_kill[bb_index], m_kill[bb_index]); +} } void diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c new file mode 100644 index 000..2502040772b --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c @@ -0,0 +1,1841 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3" } */ + +#include +#include +#include +#include +#include + +static int read_int (const unsigned char * *pstr) {}; +static const char null[] = "(null)"; +extern size_t __strnlen (const char *__string, size_t __maxlen) __attribute__ ((__pure__)); + +struct printf_info +{ + int prec; + int width; + wchar_t spec; + unsigned int is_long_double:1; + unsigned int is_short:1; + unsigned int is_long:1; + unsigned int alt:1; + unsigned int space:1; + unsigned int left:1; + unsigned int showsign:1; + unsigned int group:1; + unsigned int extra:1; + unsigned int is_char:1; + unsigned int wide:1; + unsigned int i18n:1; + unsigned int is_binary128:1; + + unsigned int __pad:3; + unsigned short int user; + wchar_t pad; +}; + +enum { + ABDAY_1 = (((2) << 16) | (0)), + ABDAY_2, + ABDAY_3, + ABDAY_4, + ABDAY_5, + ABDAY_6, + ABDAY_7, + DAY_1, + DAY_2, + DAY_3, + DAY_4, + DAY_5, + DAY_6, + DAY_7, + ABMON_1, + ABMON_2, + ABMON_3, + ABMON_4, + ABMON_5, + ABMON_6, + ABMON_7, + ABMON_8, + ABMON_9, + ABMON_10, 
+ ABMON_11, + ABMON_12, + MON_1, + MON_2, + MON_3, + MON_4, + MON_5, + MON_6, + MON_7, + MON_8, + MON_9, + MON_10, + MON_11, + MON_12, + AM_STR, + PM_STR, + D_T_FMT, + D_FMT, + T_FMT, + T_FMT_AMPM, + ERA, + __ERA_YEAR, + ERA_D_FMT, + + ALT_DIGITS, + + ERA_D_T_FMT, + + ERA_T_FMT, + _NL_TIME_ERA_NUM_ENTRIES, + _NL_TIME_ERA_ENTRIES, + + _NL_WABDAY_1, + _NL_WABDAY_2, + _NL_WABDAY_3, + _NL_WABDAY_4, + _NL_WABDAY_5, + _NL_WABDAY_6, + _NL_WABDAY_7, + _NL_WDA
[PATCH] RISC-V: Add LCM delete block predecessors dump information
While looking into PR113469, I notice the LCM delete a vsetvl incorrectly. This patch add dump information of all predecessors for LCM delete vsetvl block for better debugging. Tested no regression. gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (get_all_predecessors): New function. (pre_vsetvl::pre_global_vsetvl_info): Add LCM delete block all predecessors dump information. --- gcc/config/riscv/riscv-vsetvl.cc | 42 1 file changed, 42 insertions(+) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 72c5a127d9e..da258b964fc 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -654,6 +654,31 @@ invalid_opt_bb_p (basic_block cfg_bb) return false; } +/* Get all predecessors of BB. */ +static hash_set +get_all_predecessors (basic_block bb) +{ + hash_set blocks; + auto_vec work_list; + hash_set visited_list; + work_list.safe_push (bb); + + while (!work_list.is_empty ()) +{ + basic_block new_bb = work_list.pop (); + visited_list.add (new_bb); + edge e; + edge_iterator ei; + FOR_EACH_EDGE (e, ei, new_bb->preds) + { + if (!visited_list.contains (e->src)) + work_list.safe_push (e->src); + blocks.add (e->src); + } +} + return blocks; +} + /* This flags indicates the minimum demand of the vl and vtype values by the RVV instruction. 
For example, DEMAND_RATIO_P indicates that this RVV instruction only needs the SEW/LMUL ratio to remain the same, and does not @@ -3142,6 +3167,23 @@ pre_vsetvl::pre_global_vsetvl_info () const vsetvl_block_info &block_info = get_block_info (info.get_bb ()); gcc_assert (block_info.get_entry_info () == info); info.set_delete (); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, + "\nLCM deleting vsetvl of block %d, it has predecessors: \n", + bb->index ()); + hash_set all_preds + = get_all_predecessors (bb->cfg_bb ()); + int i = 0; + for (const auto pred : all_preds) + { + fprintf (dump_file, "%d ", pred->index); + i++; + if (i % 32 == 0) + fprintf (dump_file, "\n"); + } + fprintf (dump_file, "\n"); + } } /* Remove vsetvl infos if all precessors are available to the block. */ -- 2.36.3
[Committed] RISC-V: Remove redundant full available computation [NFC]
Notice full available is computed evey round of earliest fusion which is redundant. Actually we only need to compute it once in phase 3. It's NFC patch and tested no regression. Committed. gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pre_vsetvl::compute_vsetvl_def_data): Remove redundant full available computation. (pre_vsetvl::pre_global_vsetvl_info): Ditto. --- gcc/config/riscv/riscv-vsetvl.cc | 57 +--- 1 file changed, 23 insertions(+), 34 deletions(-) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 53d954e1dff..72c5a127d9e 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -1256,9 +1256,7 @@ public: vsetvl_info global_info; bb_info *bb; - bool full_available; - - vsetvl_block_info () : bb (nullptr), full_available (false) + vsetvl_block_info () : bb (nullptr) { local_infos.safe_grow_cleared (0); global_info.set_empty (); @@ -2489,34 +2487,6 @@ pre_vsetvl::compute_vsetvl_def_data () } } - for (const bb_info *bb : crtl->ssa->bbs ()) -{ - vsetvl_block_info &block_info = get_block_info (bb); - if (block_info.empty_p ()) - continue; - vsetvl_info &curr_info = block_info.get_entry_info (); - if (!curr_info.valid_p ()) - continue; - - unsigned int expr_index; - sbitmap_iterator sbi; - gcc_assert ( - !bitmap_empty_p (m_vsetvl_def_in[curr_info.get_bb ()->index ()])); - bool full_available = true; - EXECUTE_IF_SET_IN_BITMAP (m_vsetvl_def_in[bb->index ()], 0, expr_index, - sbi) - { - vsetvl_info &prev_info = *m_vsetvl_def_exprs[expr_index]; - if (!prev_info.valid_p () - || !m_dem.available_p (prev_info, curr_info)) - { - full_available = false; - break; - } - } - block_info.full_available = full_available; -} - sbitmap_vector_free (def_loc); sbitmap_vector_free (m_kill); } @@ -3178,11 +3148,30 @@ pre_vsetvl::pre_global_vsetvl_info () for (const bb_info *bb : crtl->ssa->bbs ()) { vsetvl_block_info &block_info = get_block_info (bb); - if (block_info.empty_p () || !block_info.full_available) + if 
(block_info.empty_p ()) + continue; + vsetvl_info &curr_info = block_info.get_entry_info (); + if (!curr_info.valid_p ()) continue; - vsetvl_info &info = block_info.get_entry_info (); - info.set_delete (); + unsigned int expr_index; + sbitmap_iterator sbi; + gcc_assert ( + !bitmap_empty_p (m_vsetvl_def_in[curr_info.get_bb ()->index ()])); + bool full_available = true; + EXECUTE_IF_SET_IN_BITMAP (m_vsetvl_def_in[bb->index ()], 0, expr_index, + sbi) + { + vsetvl_info &prev_info = *m_vsetvl_def_exprs[expr_index]; + if (!prev_info.valid_p () + || !m_dem.available_p (prev_info, curr_info)) + { + full_available = false; + break; + } + } + if (full_available) + curr_info.set_delete (); } for (const bb_info *bb : crtl->ssa->bbs ()) -- 2.36.3
[Committed] RISC-V: Add optim-no-fusion compile option [VSETVL PASS]
This patch adds no fusion compile option to disable phase 2 global fusion. It can help us to analyze the compile-time and debugging. Committed. gcc/ChangeLog: * config/riscv/riscv-opts.h (enum vsetvl_strategy_enum): Add optim-no-fusion option. * config/riscv/riscv-vsetvl.cc (pass_vsetvl::lazy_vsetvl): Ditto. (pass_vsetvl::execute): Ditto. * config/riscv/riscv.opt: Ditto. --- gcc/config/riscv/riscv-opts.h| 8 +--- gcc/config/riscv/riscv-vsetvl.cc | 22 -- gcc/config/riscv/riscv.opt | 5 - 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/gcc/config/riscv/riscv-opts.h b/gcc/config/riscv/riscv-opts.h index ca57dddf1d9..1500f8811ef 100644 --- a/gcc/config/riscv/riscv-opts.h +++ b/gcc/config/riscv/riscv-opts.h @@ -118,11 +118,13 @@ enum stringop_strategy_enum { /* Behavior of VSETVL Pass. */ enum vsetvl_strategy_enum { - /* Simple: Insert a vsetvl* instruction for each Vector instruction. */ - VSETVL_SIMPLE = 1, /* Optimized: Run LCM dataflow analysis to reduce vsetvl* insns and delete any redundant ones generated in the process. */ - VSETVL_OPT = 2 + VSETVL_OPT, + /* Simple: Insert a vsetvl* instruction for each Vector instruction. */ + VSETVL_SIMPLE, + /* No fusion: Disable Phase 2 earliest global fusion. */ + VSETVL_OPT_NO_FUSION, }; #define TARGET_ZICOND_LIKE (TARGET_ZICOND || (TARGET_XVENTANACONDOPS && TARGET_64BIT)) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 170fc7f003d..53d954e1dff 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -3495,16 +3495,18 @@ pass_vsetvl::lazy_vsetvl () /* Phase 2: Fuse header and footer vsetvl infos between basic blocks. 
*/ if (dump_file) fprintf (dump_file, "\nPhase 2: Lift up vsetvl info.\n\n"); - bool changed; - int fused_count = 0; - do + if (vsetvl_strategy != VSETVL_OPT_NO_FUSION) { - if (dump_file) - fprintf (dump_file, " Try lift up %d.\n\n", fused_count); - changed = pre.earliest_fuse_vsetvl_info (fused_count); - fused_count += 1; - } while (changed); - + bool changed = true; + int fused_count = 0; + do + { + if (dump_file) + fprintf (dump_file, " Try lift up %d.\n\n", fused_count); + changed = pre.earliest_fuse_vsetvl_info (fused_count); + fused_count += 1; + } while (changed); +} if (dump_file && (dump_flags & TDF_DETAILS)) pre.dump (dump_file, "phase 2"); @@ -3545,7 +3547,7 @@ pass_vsetvl::execute (function *) if (!has_vector_insn (cfun)) return 0; - if (!optimize || vsetvl_strategy & VSETVL_SIMPLE) + if (!optimize || vsetvl_strategy == VSETVL_SIMPLE) simple_vsetvl (); else lazy_vsetvl (); diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt index 65c656204ca..7c2292d8f91 100644 --- a/gcc/config/riscv/riscv.opt +++ b/gcc/config/riscv/riscv.opt @@ -552,11 +552,14 @@ Enum Name(vsetvl_strategy) Type(enum vsetvl_strategy_enum) Valid arguments to -param=vsetvl-strategy=: +EnumValue +Enum(vsetvl_strategy) String(optim) Value(VSETVL_OPT) + EnumValue Enum(vsetvl_strategy) String(simple) Value(VSETVL_SIMPLE) EnumValue -Enum(vsetvl_strategy) String(optim) Value(VSETVL_OPT) +Enum(vsetvl_strategy) String(optim-no-fusion) Value(VSETVL_OPT_NO_FUSION) -param=vsetvl-strategy= Target Undocumented RejectNegative Joined Enum(vsetvl_strategy) Var(vsetvl_strategy) Init(VSETVL_OPT) -- 2.36.3
[PATCH] RISC-V: Fix large memory usage of VSETVL PASS [PR113495]
SPEC 2017 wrf benchmark expose unreasonble memory usage of VSETVL PASS that is, VSETVL PASS consume over 33 GB memory which make use impossible to compile SPEC 2017 wrf in a laptop. The root cause is wasting-memory variables: unsigned num_exprs = num_bbs * num_regs; sbitmap *avl_def_loc = sbitmap_vector_alloc (num_bbs, num_exprs); sbitmap *m_kill = sbitmap_vector_alloc (num_bbs, num_exprs); m_avl_def_in = sbitmap_vector_alloc (num_bbs, num_exprs); m_avl_def_out = sbitmap_vector_alloc (num_bbs, num_exprs); I find that compute_avl_def_data can be achieved by RTL_SSA framework. Replace the code implementation base on RTL_SSA framework. After this patch, the memory-hog issue is fixed. simple vsetvl memory usage (valgrind --tool=massif --pages-as-heap=yes --massif-out-file=massif.out) is 1.673 GB. lazy vsetvl memory usage (valgrind --tool=massif --pages-as-heap=yes --massif-out-file=massif.out) is 2.441 GB. Tested on both RV32 and RV64, no regression. gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (get_expr_id): Remove. (get_regno): Ditto. (get_bb_index): Ditto. (pre_vsetvl::compute_avl_def_data): Ditto. (pre_vsetvl::earliest_fuse_vsetvl_info): Fix large memory usage. (pre_vsetvl::pre_global_vsetvl_info): Ditto. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/avl_single-107.c: Adapt test. 
--- gcc/config/riscv/riscv-vsetvl.cc | 233 -- .../riscv/rvv/vsetvl/avl_single-107.c | 2 +- 2 files changed, 52 insertions(+), 183 deletions(-) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 54c85ffb7d5..170fc7f003d 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -617,22 +617,6 @@ same_equiv_note_p (set_info *set1, set_info *set2) return source_equal_p (insn1, insn2); } -static unsigned -get_expr_id (unsigned bb_index, unsigned regno, unsigned num_bbs) -{ - return regno * num_bbs + bb_index; -} -static unsigned -get_regno (unsigned expr_id, unsigned num_bb) -{ - return expr_id / num_bb; -} -static unsigned -get_bb_index (unsigned expr_id, unsigned num_bb) -{ - return expr_id % num_bb; -} - /* Return true if the SET result is not used by any instructions. */ static bool has_no_uses (basic_block cfg_bb, rtx_insn *rinsn, int regno) @@ -1337,9 +1321,6 @@ public: class demand_system { private: - sbitmap *m_avl_def_in; - sbitmap *m_avl_def_out; - /* predictors. */ inline bool always_true (const vsetvl_info &prev ATTRIBUTE_UNUSED, @@ -1743,14 +1724,6 @@ private: } public: - demand_system () : m_avl_def_in (nullptr), m_avl_def_out (nullptr) {} - - void set_avl_in_out_data (sbitmap *m_avl_def_in, sbitmap *m_avl_def_out) - { -m_avl_def_in = m_avl_def_in; -m_avl_def_out = m_avl_def_out; - } - /* Can we move vsetvl info between prev_insn and next_insn safe? 
*/ bool avl_vl_unmodified_between_p (insn_info *prev_insn, insn_info *next_insn, const vsetvl_info &info, @@ -1778,32 +1751,66 @@ public: } else { + basic_block prev_cfg_bb = prev_insn->bb ()->cfg_bb (); if (!ignore_vl && info.has_vl ()) { - bitmap live_out = df_get_live_out (prev_insn->bb ()->cfg_bb ()); + bitmap live_out = df_get_live_out (prev_cfg_bb); if (bitmap_bit_p (live_out, REGNO (info.get_vl ( return false; } - if (info.has_nonvlmax_reg_avl () && m_avl_def_in && m_avl_def_out) + /* Find set_info at location of PREV_INSN and NEXT_INSN, Return + false if those 2 set_info are different. + +PREV_INSN --- multiple nested blocks --- NEXT_INSN. + + Return false if there is any modifications of AVL inside those + multiple nested blocks. */ + if (info.has_nonvlmax_reg_avl ()) { - bool has_avl_out = false; - unsigned regno = REGNO (info.get_avl ()); - unsigned expr_id; - sbitmap_iterator sbi; - EXECUTE_IF_SET_IN_BITMAP (m_avl_def_out[prev_insn->bb ()->index ()], - 0, expr_id, sbi) + resource_info resource = full_register (REGNO (info.get_avl ())); + def_lookup dl1 = crtl->ssa->find_def (resource, prev_insn); + def_lookup dl2 = crtl->ssa->find_def (resource, next_insn); + if (dl2.matching_set ()) + return false; + + auto is_phi_or_real + = [&] (insn_info *h) { return h->is_real () || h->is_phi (); }; + + def_info *def1 = dl1.matching_set_or_last_def_of_prev_group (); + def_info *def2 = dl2.prev_def (next_insn); + set_info *set1 = safe_dyn_cast (def1); + set_info *set2 = safe_dyn_cast (def2); + if (!set1 || !set2) + return false; + + auto is_same_ultimate_def = [&] (set_info *s1, set_info *s2) { +
[PATCH] RISC-V: Fix regressions due to 86de9b66480b710202a2898cf513db105d8c432f
This patch fixes the recent regression: FAIL: gcc.dg/torture/float32-tg-2.c -O1 (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg-2.c -O1 (test for excess errors) FAIL: gcc.dg/torture/float32-tg-2.c -O2 (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg-2.c -O2 (test for excess errors) FAIL: gcc.dg/torture/float32-tg-2.c -O2 -flto -fno-use-linker-plugin -flto-partition=none (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg-2.c -O2 -flto -fno-use-linker-plugin -flto-partition=none (test for excess errors) FAIL: gcc.dg/torture/float32-tg-2.c -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg-2.c -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects (test for excess errors) FAIL: gcc.dg/torture/float32-tg-2.c -O3 -g (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg-2.c -O3 -g (test for excess errors) FAIL: gcc.dg/torture/float32-tg-2.c -Os (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg-2.c -Os (test for excess errors) FAIL: gcc.dg/torture/float32-tg.c -O1 (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg.c -O1 (test for excess errors) FAIL: gcc.dg/torture/float32-tg.c -O2 (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg.c -O2 (test for excess errors) FAIL: gcc.dg/torture/float32-tg.c -O2 -flto -fno-use-linker-plugin -flto-partition=none (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg.c -O2 -flto -fno-use-linker-plugin -flto-partition=none (test for excess errors) FAIL: gcc.dg/torture/float32-tg.c -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: 
gcc.dg/torture/float32-tg.c -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects (test for excess errors) FAIL: gcc.dg/torture/float32-tg.c -O3 -g (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg.c -O3 -g (test for excess errors) FAIL: gcc.dg/torture/float32-tg.c -Os (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg.c -Os (test for excess errors) FAIL: gcc.dg/torture/pr48124-4.c -O1 (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/pr48124-4.c -O1 (test for excess errors) FAIL: gcc.dg/torture/pr48124-4.c -O2 (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/pr48124-4.c -O2 (test for excess errors) FAIL: gcc.dg/torture/pr48124-4.c -O2 -flto -fno-use-linker-plugin -flto-partition=none (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/pr48124-4.c -O2 -flto -fno-use-linker-plugin -flto-partition=none (test for excess errors) FAIL: gcc.dg/torture/pr48124-4.c -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/pr48124-4.c -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects (test for excess errors) FAIL: gcc.dg/torture/pr48124-4.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/pr48124-4.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (test for excess errors) FAIL: gcc.dg/torture/pr48124-4.c -O3 -g (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/pr48124-4.c -O3 -g (test for excess errors) FAIL: gcc.dg/torture/pr48124-4.c -Os (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/pr48124-4.c -Os (test for excess errors) due to commit 86de9b66480b710202a2898cf513db105d8c432f. 
The root cause is register_operand and reg_or_subregno are consistent so we reach the assertion fail. We shouldn't worry about subreg:...VL_REGNUM since it's impossible that we can have such situation, that is, we only have (set (reg) (reg:VL_REGNUM)) which generate "csrr vl" ASM for first fault load instructions (vleff). So, using REG_P and REGNO must be totally solid and robostic. Since we don't allow VL_RENUM involved into register allocation and we don't have such constraint, we always use this following pattern to generate "csrr vl" ASM: (define_insn "read_vlsi" [(set (match_operand:SI 0 "register_operand" "=r") (reg:SI VL_REGNUM))] "TARGET_VECTOR" "csrr\t%0,vl" [(set_attr "type" "rdvl") (set_attr "mode" "SI")]) So the check in riscv.md is to disallow such situation fall into move pattern in riscv.md Tested on both RV32/RV64 no
[PATCH] RISC-V: Lower vmv.v.x (avl = 1) into vmv.s.x
Notice there is an AI benchmark, GCC vs Clang has 3% performance drop. It's because Clang/LLVM has a simplification transform vmv.v.x (avl = 1) into vmv.s.x. Since vmv.s.x has more flexible vsetvl demand than vmv.v.x that can allow us to have better chances to fuse vsetvl. Consider this following case: void foo (uint32_t *outputMat, uint32_t *inputMat) { vuint32m1_t matRegIn0 = __riscv_vle32_v_u32m1 (inputMat, 4); vuint32m1_t matRegIn1 = __riscv_vle32_v_u32m1 (inputMat + 4, 4); vuint32m1_t matRegIn2 = __riscv_vle32_v_u32m1 (inputMat + 8, 4); vuint32m1_t matRegIn3 = __riscv_vle32_v_u32m1 (inputMat + 12, 4); vbool32_t oddMask = __riscv_vreinterpret_v_u32m1_b32 (__riscv_vmv_v_x_u32m1 (0xAAAA, 1)); vuint32m1_t smallTransposeMat0 = __riscv_vslideup_vx_u32m1_tumu (oddMask, matRegIn0, matRegIn1, 1, 4); vuint32m1_t smallTransposeMat2 = __riscv_vslideup_vx_u32m1_tumu (oddMask, matRegIn2, matRegIn3, 1, 4); vuint32m1_t outMat0 = __riscv_vslideup_vx_u32m1_tu (smallTransposeMat0, smallTransposeMat2, 2, 4); __riscv_vse32_v_u32m1 (outputMat, outMat0, 4); } Before this patch: vsetivlizero,4,e32,m1,ta,ma li a5,45056 addia2,a1,16 addia3,a1,32 addia4,a1,48 vle32.v v1,0(a1) vle32.v v4,0(a2) vle32.v v2,0(a3) vle32.v v3,0(a4) addiw a5,a5,-1366 vsetivlizero,1,e32,m1,ta,ma vmv.v.x v0,a5 ---> Since it avl = 1, we can transform it into vmv.s.x vsetivlizero,4,e32,m1,tu,mu vslideup.vi v1,v4,1,v0.t vslideup.vi v2,v3,1,v0.t vslideup.vi v1,v2,2 vse32.v v1,0(a0) ret After this patch: li a5,45056 addia2,a1,16 vsetivlizero,4,e32,m1,tu,mu addiw a5,a5,-1366 vle32.v v3,0(a2) addia3,a1,32 addia4,a1,48 vle32.v v1,0(a1) vmv.s.x v0,a5 vle32.v v2,0(a3) vslideup.vi v1,v3,1,v0.t vle32.v v3,0(a4) vslideup.vi v2,v3,1,v0.t vslideup.vi v1,v2,2 vse32.v v1,0(a0) ret Tested on both RV32 and RV64 no regression. gcc/ChangeLog: * config/riscv/riscv-protos.h (splat_to_scalar_move_p): New function. * config/riscv/riscv-v.cc (splat_to_scalar_move_p): Ditto. * config/riscv/vector.md: Simplify vmv.v.x. into vmv.s.x. 
gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/attribute-2.c: New test. * gcc.target/riscv/rvv/vsetvl/attribute-3.c: New test. --- gcc/config/riscv/riscv-protos.h | 1 + gcc/config/riscv/riscv-v.cc | 12 ++ gcc/config/riscv/vector.md| 9 - .../gcc.target/riscv/rvv/vsetvl/attribute-2.c | 37 +++ .../gcc.target/riscv/rvv/vsetvl/attribute-3.c | 36 ++ 5 files changed, 94 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/attribute-2.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/attribute-3.c diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 7fe26fcd939..b3f0bdb9924 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -708,6 +708,7 @@ bool can_be_broadcasted_p (rtx); bool gather_scatter_valid_offset_p (machine_mode); HOST_WIDE_INT estimated_poly_value (poly_int64, unsigned int); bool whole_reg_to_reg_move_p (rtx *, machine_mode, int); +bool splat_to_scalar_move_p (rtx *); } /* We classify builtin types into two classes: diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index 93a1238a5ab..4bacb7fea45 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -5151,4 +5151,16 @@ whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, int avl_type_index) return false; } +/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f. 
*/ +bool +splat_to_scalar_move_p (rtx *ops) +{ + return satisfies_constraint_Wc1 (ops[1]) +&& satisfies_constraint_vu (ops[2]) +&& !MEM_P (ops[3]) +&& satisfies_constraint_c01 (ops[4]) +&& INTVAL (ops[7]) == NONVLMAX +&& known_ge (GET_MODE_SIZE (Pmode), GET_MODE_SIZE (GET_MODE (ops[3]))); +} + } // namespace riscv_vector diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md index 307d9a8c952..ab6e099852d 100644 --- a/gcc/config/riscv/vector.md +++ b/gcc/config/riscv/vector.md @@ -1977,8 +1977,15 @@ (match_operand:V_VLS 2 "vector_merge_operand")))] "TARGET_VECTOR" { + /* Transform vmv.v.x/vfmv.v.f (avl = 1) into vmv.s.x since vmv.s.x/vfmv.s.f + has better chances to do vsetvl fusion in vsetvl pass. */ + if (riscv_vector::splat_to_scalar_move_p (operands)) +{ + operands[1] = riscv_vector::gen_scalar_move_mask (mode); + operands[3]
[PATCH] RISC-V: Fix vfirst/vmsbf/vmsif/vmsof ratio attributes
vfirst/vmsbf/vmsif/vmsof instructions are supposed to demand ratio instead of demanding sew_lmul. But my previous typo makes VSETVL PASS miss honor the risc-v v spec. Consider this following simple case: int foo4 (void * in, void * out) { vint32m1_t v = __riscv_vle32_v_i32m1 (in, 4); v = __riscv_vadd_vv_i32m1 (v, v, 4); vbool32_t mask = __riscv_vreinterpret_v_i32m1_b32(v); mask = __riscv_vmsof_m_b32(mask, 4); return __riscv_vfirst_m_b32(mask, 4); } Before this patch: foo4: vsetivlizero,4,e32,m1,ta,ma vle32.v v1,0(a0) vadd.vv v1,v1,v1 vsetvli zero,zero,e8,mf4,ta,ma> redundant. vmsof.m v2,v1 vfirst.ma0,v2 ret After this patch: foo4: vsetivlizero,4,e32,m1,ta,ma vle32.v v1,0(a0) vadd.vv v1,v1,v1 vmsof.m v2,v1 vfirst.ma0,v2 ret Confirm RVV spec and Clang, this patch makes VSETVL PASS match the correct behavior. Tested on both RV32/RV64, no regression. gcc/ChangeLog: * config/riscv/vector.md: Fix vfirst/vmsbf/vmsof ratio attributes. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/attribute-1.c: New test. 
--- gcc/config/riscv/vector.md| 2 +- .../gcc.target/riscv/rvv/vsetvl/attribute-1.c | 47 +++ 2 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/attribute-1.c diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md index cfc54ae5eac..307d9a8c952 100644 --- a/gcc/config/riscv/vector.md +++ b/gcc/config/riscv/vector.md @@ -433,7 +433,7 @@ vialu,vshift,vicmp,vimul,vidiv,vsalu,\ vext,viwalu,viwmul,vicalu,vnshift,\ vimuladd,vimerge,vaalu,vsmul,vsshift,\ - vnclip,viminmax,viwmuladd,vmffs,vmsfs,\ + vnclip,viminmax,viwmuladd,\ vmiota,vmidx,vfalu,vfmul,vfminmax,vfdiv,\ vfwalu,vfwmul,vfsqrt,vfrecp,vfsgnj,vfcmp,\ vfmerge,vfcvtitof,vfcvtftoi,vfwcvtitof,\ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/attribute-1.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/attribute-1.c new file mode 100644 index 000..28dcf986bac --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/attribute-1.c @@ -0,0 +1,47 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */ + +#include "riscv_vector.h" + +int +foo (void *in, void *out) +{ + vint32m1_t v = __riscv_vle32_v_i32m1 (in, 4); + v = __riscv_vadd_vv_i32m1 (v, v, 4); + vbool32_t mask = __riscv_vreinterpret_v_i32m1_b32 (v); + return __riscv_vfirst_m_b32 (mask, 4); +} + +int +foo2 (void *in, void *out) +{ + vint32m1_t v = __riscv_vle32_v_i32m1 (in, 4); + v = __riscv_vadd_vv_i32m1 (v, v, 4); + vbool32_t mask = __riscv_vreinterpret_v_i32m1_b32 (v); + mask = __riscv_vmsbf_m_b32 (mask, 4); + return __riscv_vfirst_m_b32 (mask, 4); +} + +int +foo3 (void *in, void *out) +{ + vint32m1_t v = __riscv_vle32_v_i32m1 (in, 4); + v = __riscv_vadd_vv_i32m1 (v, v, 4); + vbool32_t mask = __riscv_vreinterpret_v_i32m1_b32 (v); + mask = __riscv_vmsif_m_b32 (mask, 4); + return __riscv_vfirst_m_b32 (mask, 4); +} + +int +foo4 (void *in, void *out) +{ + vint32m1_t v = __riscv_vle32_v_i32m1 (in, 4); + v = __riscv_vadd_vv_i32m1 (v, v, 4); + vbool32_t mask = 
__riscv_vreinterpret_v_i32m1_b32 (v); + mask = __riscv_vmsof_m_b32 (mask, 4); + return __riscv_vfirst_m_b32 (mask, 4); +} + +/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*4,\s*e32,\s*m1,\s*t[au],\s*m[au]} 4 } } */ +/* { dg-final { scan-assembler-times {vsetivli} 4 } } */ +/* { dg-final { scan-assembler-not {vsetvli} } } */ -- 2.36.3
[Committed] RISC-V: Suppress warning
../../gcc/config/riscv/riscv.cc: In function 'void riscv_init_cumulative_args(CUMULATIVE_ARGS*, tree, rtx, tree, int)': ../../gcc/config/riscv/riscv.cc:4879:34: error: unused parameter 'fndecl' [-Werror=unused-parameter] 4879 | tree fndecl, | ~^~ ../../gcc/config/riscv/riscv.cc: In function 'bool riscv_vector_mode_supported_any_target_p(machine_mode)': ../../gcc/config/riscv/riscv.cc:10537:56: error: unused parameter 'mode' [-Werror=unused-parameter] 10537 | riscv_vector_mode_supported_any_target_p (machine_mode mode) | ~^~~~ cc1plus: all warnings being treated as errors make[3]: *** [Makefile:2559: riscv.o] Error 1 Suppress these warnings. gcc/ChangeLog: * config/riscv/riscv.cc (riscv_init_cumulative_args): Suppress warning. (riscv_vector_mode_supported_any_target_p): Ditto. --- gcc/config/riscv/riscv.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index dd6e68a08c2..1f9546f4d3e 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -4876,7 +4876,7 @@ void riscv_init_cumulative_args (CUMULATIVE_ARGS *cum, tree fntype ATTRIBUTE_UNUSED, rtx libname ATTRIBUTE_UNUSED, - tree fndecl, + tree fndecl ATTRIBUTE_UNUSED, int caller ATTRIBUTE_UNUSED) { memset (cum, 0, sizeof (*cum)); @@ -10534,7 +10534,7 @@ extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset) /* Implements target hook vector_mode_supported_any_target_p. */ static bool -riscv_vector_mode_supported_any_target_p (machine_mode mode) +riscv_vector_mode_supported_any_target_p (machine_mode) { if (TARGET_XTHEADVECTOR) return false; -- 2.36.3
[PATCH V2] RISC-V: Fix RVV_VLMAX
This patch fixes memory hog found in SPEC2017 wrf benchmark which caused by RVV_VLMAX since RVV_VLMAX generate brand new rtx by gen_rtx_REG (Pmode, X0_REGNUM) every time we call RVV_VLMAX, that is, we are always generating garbage and redundant (reg:DI 0 zero) rtx. After this patch fix, the memory hog is gone. Time variable usr sys wall GGC machine dep reorg : 1.99 ( 9%) 0.35 ( 56%) 2.33 ( 10%) 939M ( 80%) [Before this patch] machine dep reorg : 1.71 ( 6%) 0.16 ( 27%) 3.77 ( 6%) 659k ( 0%) [After this patch] Time variable usr sys wall GGC machine dep reorg : 75.93 ( 18%) 14.23 ( 88%) 90.15 ( 21%) 33383M ( 95%) [Before this patch] machine dep reorg : 56.00 ( 14%) 7.92 ( 77%) 63.93 ( 15%) 4361k ( 0%) [After this patch] Test is running. Ok for trunk if I passed the test with no regression ? PR target/113495 gcc/ChangeLog: * config/riscv/riscv-protos.h (RVV_VLMAX): Change to regno_reg_rtx[X0_REGNUM]. (RVV_VUNDEF): Ditto. * config/riscv/riscv-vsetvl.cc: Add timevar. --- gcc/config/riscv/riscv-protos.h | 5 ++--- gcc/config/riscv/riscv-vsetvl.cc | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 7853b488838..7fe26fcd939 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -299,10 +299,9 @@ void riscv_run_selftests (void); #endif namespace riscv_vector { -#define RVV_VLMAX gen_rtx_REG (Pmode, X0_REGNUM) +#define RVV_VLMAX regno_reg_rtx[X0_REGNUM] #define RVV_VUNDEF(MODE) \ - gen_rtx_UNSPEC (MODE, gen_rtvec (1, gen_rtx_REG (SImode, X0_REGNUM)), \ - UNSPEC_VUNDEF) + gen_rtx_UNSPEC (MODE, gen_rtvec (1, RVV_VLMAX), UNSPEC_VUNDEF) /* These flags describe how to pass the operands to a rvv insn pattern. 
e.g.: diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 2067073185f..54c85ffb7d5 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -3556,7 +3556,7 @@ const pass_data pass_data_vsetvl = { RTL_PASS, /* type */ "vsetvl", /* name */ OPTGROUP_NONE, /* optinfo_flags */ - TV_NONE, /* tv_id */ + TV_MACH_DEP, /* tv_id */ 0,/* properties_required */ 0,/* properties_provided */ 0,/* properties_destroyed */ -- 2.36.3
[PATCH] RISC-V: Fix RVV_VLMAX
This patch fixes memory hog found in SPEC2017 wrf benchmark which caused by RVV_VLMAX since RVV_VLMAX generate brand new rtx by gen_rtx_REG (Pmode, X0_REGNUM) every time we call RVV_VLMAX, that is, we are always generating garbage and redundant (reg:DI 0 zero) rtx. After this patch fix, the memory hog is gone. Time variable usr sys wall GGC machine dep reorg : 1.99 ( 9%) 0.35 ( 56%) 2.33 ( 10%) 939M ( 80%) [Before this patch] machine dep reorg : 1.71 ( 6%) 0.16 ( 27%) 3.77 ( 6%) 659k ( 0%) [After this patch] Time variable usr sys wall GGC machine dep reorg : 75.93 ( 18%) 14.23 ( 88%) 90.15 ( 21%) 33383M ( 95%) [Before this patch] machine dep reorg : 56.00 ( 14%) 7.92 ( 77%) 63.93 ( 15%) 4361k ( 0%) [After this patch] Test is running. Ok for trunk if I passed the test with no regression ? gcc/ChangeLog: * config/riscv/riscv-protos.h (RVV_VLMAX): Change to regno_reg_rtx[X0_REGNUM]. (RVV_VUNDEF): Ditto. * config/riscv/riscv-vsetvl.cc: Add timevar. --- gcc/config/riscv/riscv-protos.h | 5 ++--- gcc/config/riscv/riscv-vsetvl.cc | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 7853b488838..7fe26fcd939 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -299,10 +299,9 @@ void riscv_run_selftests (void); #endif namespace riscv_vector { -#define RVV_VLMAX gen_rtx_REG (Pmode, X0_REGNUM) +#define RVV_VLMAX regno_reg_rtx[X0_REGNUM] #define RVV_VUNDEF(MODE) \ - gen_rtx_UNSPEC (MODE, gen_rtvec (1, gen_rtx_REG (SImode, X0_REGNUM)), \ - UNSPEC_VUNDEF) + gen_rtx_UNSPEC (MODE, gen_rtvec (1, RVV_VLMAX), UNSPEC_VUNDEF) /* These flags describe how to pass the operands to a rvv insn pattern. 
e.g.: diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 2067073185f..54c85ffb7d5 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -3556,7 +3556,7 @@ const pass_data pass_data_vsetvl = { RTL_PASS, /* type */ "vsetvl", /* name */ OPTGROUP_NONE, /* optinfo_flags */ - TV_NONE, /* tv_id */ + TV_MACH_DEP, /* tv_id */ 0,/* properties_required */ 0,/* properties_provided */ 0,/* properties_destroyed */ -- 2.36.3
[PATCH] RISC-V: Support vi variant for vec_cmp
While running various benchmarks, I notice we miss vi variant support for integer comparison. That is, we can vectorize code into vadd.vi but we can't vectorize into vmseq.vi. Consider this following case: void foo (int n, int **__restrict a) { int b; int c; int d; for (b = 0; b < n; b++) for (long e = 8; e > 0; e--) a[b][e] = a[b][e] == 15; } Before this patch: vsetivlizero,4,e32,m1,ta,ma vmv.v.i v4,15 vmv.v.i v3,1 vmv.v.i v2,0 .L3: ld a5,0(a1) addia4,a5,4 addia5,a5,20 vle32.v v1,0(a5) vle32.v v0,0(a4) vmseq.vvv0,v0,v4 After this patch: ld a5,0(a1) addia4,a5,4 addia5,a5,20 vle32.v v1,0(a5) vle32.v v0,0(a4) vmseq.viv0,v0,15 It's the missing feature caused by our some mistakes, support vi variant for vec_cmp like other patterns (add, sub, ..., etc). Tested with no regression, ok for trunk ? gcc/ChangeLog: * config/riscv/autovec.md: Support vi variant. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/cmp/cmp_vi-1.c: New test. * gcc.target/riscv/rvv/autovec/cmp/cmp_vi-2.c: New test. * gcc.target/riscv/rvv/autovec/cmp/cmp_vi-3.c: New test. * gcc.target/riscv/rvv/autovec/cmp/cmp_vi-4.c: New test. * gcc.target/riscv/rvv/autovec/cmp/cmp_vi-5.c: New test. * gcc.target/riscv/rvv/autovec/cmp/cmp_vi-6.c: New test. * gcc.target/riscv/rvv/autovec/cmp/cmp_vi-7.c: New test. * gcc.target/riscv/rvv/autovec/cmp/cmp_vi-8.c: New test. * gcc.target/riscv/rvv/autovec/cmp/cmp_vi-9.c: New test. * gcc.target/riscv/rvv/autovec/cmp/macro.h: New test. 
--- gcc/config/riscv/autovec.md | 4 +-- .../riscv/rvv/autovec/cmp/cmp_vi-1.c | 16 +++ .../riscv/rvv/autovec/cmp/cmp_vi-2.c | 16 +++ .../riscv/rvv/autovec/cmp/cmp_vi-3.c | 28 +++ .../riscv/rvv/autovec/cmp/cmp_vi-4.c | 28 +++ .../riscv/rvv/autovec/cmp/cmp_vi-5.c | 16 +++ .../riscv/rvv/autovec/cmp/cmp_vi-6.c | 16 +++ .../riscv/rvv/autovec/cmp/cmp_vi-7.c | 28 +++ .../riscv/rvv/autovec/cmp/cmp_vi-8.c | 28 +++ .../riscv/rvv/autovec/cmp/cmp_vi-9.c | 18 .../gcc.target/riscv/rvv/autovec/cmp/macro.h | 11 11 files changed, 207 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-1.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-2.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-3.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-4.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-5.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-6.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-7.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-8.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-9.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/macro.h diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md index 706cd9717cb..5ec1c59bdd4 100644 --- a/gcc/config/riscv/autovec.md +++ b/gcc/config/riscv/autovec.md @@ -664,7 +664,7 @@ [(set (match_operand: 0 "register_operand") (match_operator: 1 "comparison_operator" [(match_operand:V_VLSI 2 "register_operand") - (match_operand:V_VLSI 3 "register_operand")]))] + (match_operand:V_VLSI 3 "nonmemory_operand")]))] "TARGET_VECTOR" { riscv_vector::expand_vec_cmp (operands[0], GET_CODE (operands[1]), @@ -677,7 +677,7 @@ [(set (match_operand: 0 "register_operand") (match_operator: 1 "comparison_operator" [(match_operand:V_VLSI 2 "register_operand") - 
(match_operand:V_VLSI 3 "register_operand")]))] + (match_operand:V_VLSI 3 "nonmemory_operand")]))] "TARGET_VECTOR" { riscv_vector::expand_vec_cmp (operands[0], GET_CODE (operands[1]), diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-1.c new file mode 100644 index 000..10c232f77bd --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-1.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */ + +#include "macro.h" + +CMP_VI (ne_char, char, n, !=, 15) +CMP_VI (ne_short, short, n, !=, 15) +CMP_VI (ne_int, int, n, !=, 15) +CMP_VI (ne_long, long, n, !=, 15) +CMP_VI (ne_unsigned_char, unsigned char, n, !=, 15) +CMP_VI (ne_unsigned_short, unsigned short, n, !=, 15) +CMP_VI (ne_unsigned_int, unsigned int, n, !=, 15) +CMP_VI (ne_unsigned_long, unsigned long, n, !=, 15)
[PATCH v2] test regression fix: Add !vect128 for variable length targets of bb-slp-subgroups-3.c
gcc/testsuite/ChangeLog: * gcc.dg/vect/bb-slp-subgroups-3.c: Add !vect128. --- gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c index fb719915db7..d1d79125731 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c @@ -42,7 +42,7 @@ main (int argc, char **argv) /* Because we disable the cost model, targets with variable-length vectors can end up vectorizing the store to a[0..7] on its own. With the cost model we do something sensible. */ -/* { dg-final { scan-tree-dump-times "optimized: basic block" 2 "slp2" { target { ! amdgcn-*-* } xfail vect_variable_length } } } */ +/* { dg-final { scan-tree-dump-times "optimized: basic block" 2 "slp2" { target { ! amdgcn-*-* } xfail { vect_variable_length && { ! vect128 } } } } } */ /* amdgcn can do this in one vector. */ /* { dg-final { scan-tree-dump-times "optimized: basic block" 1 "slp2" { target amdgcn-*-* } } } */ -- 2.36.3
[Committed V3] RISC-V: Add has compatible check for conflict vsetvl fusion
V3: Rebase to trunk and commit it. This patch fixes SPEC2017 cam4 mismatch issue due to we miss has compatible check for conflict vsetvl fusion. Buggy assembler before this patch: .L69: vsetvli a5,s1,e8,mf4,ta,ma -> buggy vsetvl vsetivlizero,8,e8,mf2,ta,ma vmv.v.i v1,0 vse8.v v1,0(a5) j .L37 .L68: vsetvli a5,s1,e8,mf4,ta,ma -> buggy vsetvl vsetivlizero,8,e8,mf2,ta,ma addia3,a5,8 vmv.v.i v1,0 vse8.v v1,0(a5) vse8.v v1,0(a3) addia4,a4,-16 li a3,8 bltua4,a3,.L37 j .L69 .L67: vsetivlizero,8,e8,mf2,ta,ma vmv.v.i v1,0 vse8.v v1,0(a5) addia5,sp,56 vse8.v v1,0(a5) addis4,sp,64 addia3,sp,72 vse8.v v1,0(s4) vse8.v v1,0(a3) addia4,a4,-32 li a3,16 bltua4,a3,.L36 j .L68 After this patch: .L63: ble s1,zero,.L49 sllia4,s1,3 li a3,32 addia5,sp,48 bltua4,a3,.L62 vsetivlizero,8,e8,mf2,ta,ma vmv.v.i v1,0 vse8.v v1,0(a5) addia5,sp,56 vse8.v v1,0(a5) addis4,sp,64 addia3,sp,72 vse8.v v1,0(s4) addia4,a4,-32 addia5,sp,80 vse8.v v1,0(a3) .L35: li a3,16 bltua4,a3,.L36 addia3,a5,8 vmv.v.i v1,0 addia4,a4,-16 vse8.v v1,0(a5) addia5,a5,16 vse8.v v1,0(a3) .L36: li a3,8 bltua4,a3,.L37 vmv.v.i v1,0 vse8.v v1,0(a5) Tested on both RV32/RV64 no regression, Ok for trunk ? PR target/113429 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pre_vsetvl::earliest_fuse_vsetvl_info): Fix bug. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/vlmax_conflict-4.c: Adapt test. * gcc.target/riscv/rvv/vsetvl/vlmax_conflict-5.c: Ditto. 
--- gcc/config/riscv/riscv-vsetvl.cc | 43 +++ .../riscv/rvv/vsetvl/vlmax_conflict-4.c | 5 +-- .../riscv/rvv/vsetvl/vlmax_conflict-5.c | 10 ++--- 3 files changed, 30 insertions(+), 28 deletions(-) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 41d4b80648f..2067073185f 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -2254,6 +2254,22 @@ private: return true; } + bool has_compatible_reaching_vsetvl_p (vsetvl_info info) + { +unsigned int index; +sbitmap_iterator sbi; +EXECUTE_IF_SET_IN_BITMAP (m_vsetvl_def_in[info.get_bb ()->index ()], 0, + index, sbi) + { + const auto prev_info = *m_vsetvl_def_exprs[index]; + if (!prev_info.valid_p ()) + continue; + if (m_dem.compatible_p (prev_info, info)) + return true; + } +return false; + } + bool preds_all_same_avl_and_ratio_p (const vsetvl_info &curr_info) { gcc_assert ( @@ -3076,22 +3092,8 @@ pre_vsetvl::earliest_fuse_vsetvl_info (int iter) { vsetvl_info new_curr_info = curr_info; new_curr_info.set_bb (crtl->ssa->bb (eg->dest)); - bool has_compatible_p = false; - unsigned int def_expr_index; - sbitmap_iterator sbi2; - EXECUTE_IF_SET_IN_BITMAP ( - m_vsetvl_def_in[new_curr_info.get_bb ()->index ()], 0, - def_expr_index, sbi2) - { - vsetvl_info &prev_info = *m_vsetvl_def_exprs[def_expr_index]; - if (!prev_info.valid_p ()) - continue; - if (m_dem.compatible_p (prev_info, new_curr_info)) - { - has_compatible_p = true; - break; - } - } + bool has_compatible_p + = has_compatible_reaching_vsetvl_p (new_curr_info); if (!has_compatible_p) { if (dump_file && (dump_flags & TDF_DETAILS)) @@ -3146,7 +3148,10 @@ pre_vsetvl::earliest_fuse_vsetvl_info (int iter) else { /* Cancel lift up if probabilities are equal. 
*/ - if (successors_probability_equal_p (eg->src)) + if (successors_probability_equal_p (eg->src) + || (dest_block_info.probability + > src_block_info.probability + && !has_compatible_reaching_vsetvl_p (curr_info))) { if (dump_file && (dump_flags & TDF_DETAILS)) { @@ -3154,8 +3159,8 @@ pre_vsetvl::earliest_fuse_vsetvl_info (int iter) " Reset bb %u:",
[PATCH V2] RISC-V: Add has compatible check for conflict vsetvl fusion
This patch fixes SPEC2017 cam4 mismatch issue due to we miss has compatible check for conflict vsetvl fusion. Buggy assembler before this patch: .L69: vsetvli a5,s1,e8,mf4,ta,ma -> buggy vsetvl vsetivlizero,8,e8,mf2,ta,ma vmv.v.i v1,0 vse8.v v1,0(a5) j .L37 .L68: vsetvli a5,s1,e8,mf4,ta,ma -> buggy vsetvl vsetivlizero,8,e8,mf2,ta,ma addia3,a5,8 vmv.v.i v1,0 vse8.v v1,0(a5) vse8.v v1,0(a3) addia4,a4,-16 li a3,8 bltua4,a3,.L37 j .L69 .L67: vsetivlizero,8,e8,mf2,ta,ma vmv.v.i v1,0 vse8.v v1,0(a5) addia5,sp,56 vse8.v v1,0(a5) addis4,sp,64 addia3,sp,72 vse8.v v1,0(s4) vse8.v v1,0(a3) addia4,a4,-32 li a3,16 bltua4,a3,.L36 j .L68 After this patch: .L63: ble s1,zero,.L49 sllia4,s1,3 li a3,32 addia5,sp,48 bltua4,a3,.L62 vsetivlizero,8,e8,mf2,ta,ma vmv.v.i v1,0 vse8.v v1,0(a5) addia5,sp,56 vse8.v v1,0(a5) addis4,sp,64 addia3,sp,72 vse8.v v1,0(s4) addia4,a4,-32 addia5,sp,80 vse8.v v1,0(a3) .L35: li a3,16 bltua4,a3,.L36 addia3,a5,8 vmv.v.i v1,0 addia4,a4,-16 vse8.v v1,0(a5) addia5,a5,16 vse8.v v1,0(a3) .L36: li a3,8 bltua4,a3,.L37 vmv.v.i v1,0 vse8.v v1,0(a5) Tested on both RV32/RV64 no regression, Ok for trunk ? PR target/113429 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pre_vsetvl::earliest_fuse_vsetvl_info): Fix conflict vsetvl fusion. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/vlmax_conflict-4.c: Adapt test. * gcc.target/riscv/rvv/vsetvl/vlmax_conflict-5.c: Ditto. 
--- gcc/config/riscv/riscv-vsetvl.cc | 39 +++ .../riscv/rvv/vsetvl/vlmax_conflict-4.c | 5 +-- .../riscv/rvv/vsetvl/vlmax_conflict-5.c | 10 ++--- 3 files changed, 28 insertions(+), 26 deletions(-) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index df7ed149388..76e3d2eb471 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -2254,6 +2254,22 @@ private: return true; } + bool has_compatible_reaching_vsetvl_p (vsetvl_info info) + { +unsigned int index; +sbitmap_iterator sbi; +EXECUTE_IF_SET_IN_BITMAP (m_vsetvl_def_in[info.get_bb ()->index ()], 0, + index, sbi) + { + const auto prev_info = *m_vsetvl_def_exprs[index]; + if (!prev_info.valid_p ()) + continue; + if (m_dem.compatible_p (prev_info, info)) + return true; + } +return false; + } + bool preds_all_same_avl_and_ratio_p (const vsetvl_info &curr_info) { gcc_assert ( @@ -3075,22 +3091,8 @@ pre_vsetvl::earliest_fuse_vsetvl_info () { vsetvl_info new_curr_info = curr_info; new_curr_info.set_bb (crtl->ssa->bb (eg->dest)); - bool has_compatible_p = false; - unsigned int def_expr_index; - sbitmap_iterator sbi2; - EXECUTE_IF_SET_IN_BITMAP ( - m_vsetvl_def_in[new_curr_info.get_bb ()->index ()], 0, - def_expr_index, sbi2) - { - vsetvl_info &prev_info = *m_vsetvl_def_exprs[def_expr_index]; - if (!prev_info.valid_p ()) - continue; - if (m_dem.compatible_p (prev_info, new_curr_info)) - { - has_compatible_p = true; - break; - } - } + bool has_compatible_p + = has_compatible_reaching_vsetvl_p (new_curr_info); if (!has_compatible_p) { if (dump_file && (dump_flags & TDF_DETAILS)) @@ -3146,7 +3148,10 @@ pre_vsetvl::earliest_fuse_vsetvl_info () && !m_dem.compatible_p (prev_info, curr_info)) { /* Cancel lift up if probabilities are equal. 
*/ - if (successors_probability_equal_p (eg->src)) + if (successors_probability_equal_p (eg->src) + || (dest_block_info.probability + > src_block_info.probability + && !has_compatible_reaching_vsetvl_p (curr_info))) { if (dump_file && (dump_flags & TDF_DETAILS)) { diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/vlmax_conflict-4.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/vlm
[PATCH] RISC-V: Add has compatible check for conflict vsetvl fusion
This patch fixes SPEC2017 cam4 mismatch issue due to we miss has compatible check for conflict vsetvl fusion. Buggy assembler before this patch: .L69: vsetvli a5,s1,e8,mf4,ta,ma -> buggy vsetvl vsetivlizero,8,e8,mf2,ta,ma vmv.v.i v1,0 vse8.v v1,0(a5) j .L37 .L68: vsetvli a5,s1,e8,mf4,ta,ma -> buggy vsetvl vsetivlizero,8,e8,mf2,ta,ma addia3,a5,8 vmv.v.i v1,0 vse8.v v1,0(a5) vse8.v v1,0(a3) addia4,a4,-16 li a3,8 bltua4,a3,.L37 j .L69 .L67: vsetivlizero,8,e8,mf2,ta,ma vmv.v.i v1,0 vse8.v v1,0(a5) addia5,sp,56 vse8.v v1,0(a5) addis4,sp,64 addia3,sp,72 vse8.v v1,0(s4) vse8.v v1,0(a3) addia4,a4,-32 li a3,16 bltua4,a3,.L36 j .L68 After this patch: .L63: ble s1,zero,.L49 sllia4,s1,3 li a3,32 addia5,sp,48 bltua4,a3,.L62 vsetivlizero,8,e8,mf2,ta,ma vmv.v.i v1,0 vse8.v v1,0(a5) addia5,sp,56 vse8.v v1,0(a5) addis4,sp,64 addia3,sp,72 vse8.v v1,0(s4) addia4,a4,-32 addia5,sp,80 vse8.v v1,0(a3) .L35: li a3,16 bltua4,a3,.L36 addia3,a5,8 vmv.v.i v1,0 addia4,a4,-16 vse8.v v1,0(a5) addia5,a5,16 vse8.v v1,0(a3) .L36: li a3,8 bltua4,a3,.L37 vmv.v.i v1,0 vse8.v v1,0(a5) Tested on both RV32/RV64 no regression, Ok for trunk ? PR target/113429 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc: Fix bug of conflict vsetvl fusion. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/spec2017_cam4/ppgrid.mod: New test. * gcc.target/riscv/rvv/spec2017_cam4/shr_kind_mod.mod: New test. * gcc.target/riscv/rvv/spec2017_cam4/pr113429.f90: New test. * gcc.target/riscv/rvv/vsetvl/vlmax_conflict-4.c: Adapt test. * gcc.target/riscv/rvv/vsetvl/vlmax_conflict-5.c: Ditto. 
--- gcc/config/riscv/riscv-vsetvl.cc | 39 --- .../rvv/fortran/spec2017_cam4/ppgrid.mod | Bin 0 -> 296 bytes .../rvv/fortran/spec2017_cam4/pr113429.f90| 110 ++ .../fortran/spec2017_cam4/shr_kind_mod.mod| Bin 0 -> 499 bytes .../gcc.target/riscv/rvv/rvv-fortran.exp | 2 + .../riscv/rvv/vsetvl/vlmax_conflict-4.c | 5 +- .../riscv/rvv/vsetvl/vlmax_conflict-5.c | 10 +- 7 files changed, 140 insertions(+), 26 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/fortran/spec2017_cam4/ppgrid.mod create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/fortran/spec2017_cam4/pr113429.f90 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/fortran/spec2017_cam4/shr_kind_mod.mod diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index df7ed149388..76e3d2eb471 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -2254,6 +2254,22 @@ private: return true; } + bool has_compatible_reaching_vsetvl_p (vsetvl_info info) + { +unsigned int index; +sbitmap_iterator sbi; +EXECUTE_IF_SET_IN_BITMAP (m_vsetvl_def_in[info.get_bb ()->index ()], 0, + index, sbi) + { + const auto prev_info = *m_vsetvl_def_exprs[index]; + if (!prev_info.valid_p ()) + continue; + if (m_dem.compatible_p (prev_info, info)) + return true; + } +return false; + } + bool preds_all_same_avl_and_ratio_p (const vsetvl_info &curr_info) { gcc_assert ( @@ -3075,22 +3091,8 @@ pre_vsetvl::earliest_fuse_vsetvl_info () { vsetvl_info new_curr_info = curr_info; new_curr_info.set_bb (crtl->ssa->bb (eg->dest)); - bool has_compatible_p = false; - unsigned int def_expr_index; - sbitmap_iterator sbi2; - EXECUTE_IF_SET_IN_BITMAP ( - m_vsetvl_def_in[new_curr_info.get_bb ()->index ()], 0, - def_expr_index, sbi2) - { - vsetvl_info &prev_info = *m_vsetvl_def_exprs[def_expr_index]; - if (!prev_info.valid_p ()) - continue; - if (m_dem.compatible_p (prev_info, new_curr_info)) - { - has_compatible_p = true; - break; - } - } + bool has_compatible_p + = 
has_compatible_reaching_vsetvl_p (new_curr_info); if (!has_compatible_p) { if (dump_file && (dump_flags & TDF_DETAILS)) @@ -3146,7 +3148,10 @@ pre_vsetvl::earliest_fuse_vsetvl_info ()
[PATCH v2] test regression fix: Add vect128 for bb-slp-43.c
gcc/testsuite/ChangeLog: * gcc.dg/vect/bb-slp-43.c: Add vect128. --- gcc/testsuite/gcc.dg/vect/bb-slp-43.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-43.c b/gcc/testsuite/gcc.dg/vect/bb-slp-43.c index dad2d24262d..8aedb06bf72 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-43.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-43.c @@ -14,4 +14,4 @@ f (int *restrict x, short *restrict y) } /* { dg-final { scan-tree-dump-not "mixed mask and nonmask" "slp2" } } */ -/* { dg-final { scan-tree-dump-not "vector operands from scalars" "slp2" { target { { vect_int && vect_bool_cmp } && { vect_unpack && vect_hw_misalign } } xfail { vect_variable_length && { ! vect256 } } } } } */ +/* { dg-final { scan-tree-dump-not "vector operands from scalars" "slp2" { target { { vect_int && vect_bool_cmp } && { vect_unpack && vect_hw_misalign } } xfail { vect_variable_length && { { ! vect128 } && { ! vect256 } } } } } } */ -- 2.36.3
[PATCH] test regression fix: Remove xfail for variable length targets of bb-slp-subgroups-3.c
Notice there is a regression recently: XPASS: gcc.dg/vect/bb-slp-subgroups-3.c -flto -ffat-lto-objects scan-tree-dump-times slp2 "optimized: basic block" 2 XPASS: gcc.dg/vect/bb-slp-subgroups-3.c scan-tree-dump-times slp2 "optimized: basic block" 2 Checked on both ARM SVE an RVV: https://godbolt.org/z/jz4cYbqc8 "optimized: basic block" appears twice. I guess ARM SVE has the same XPASS as RVV. Hi, Andrew. Could you confirm about it ? gcc/testsuite/ChangeLog: * gcc.dg/vect/bb-slp-subgroups-3.c: Remove XFAIL of variable length. --- gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c index fb719915db7..3f0d45ce4a1 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c @@ -42,7 +42,7 @@ main (int argc, char **argv) /* Because we disable the cost model, targets with variable-length vectors can end up vectorizing the store to a[0..7] on its own. With the cost model we do something sensible. */ -/* { dg-final { scan-tree-dump-times "optimized: basic block" 2 "slp2" { target { ! amdgcn-*-* } xfail vect_variable_length } } } */ +/* { dg-final { scan-tree-dump-times "optimized: basic block" 2 "slp2" { target { ! amdgcn-*-* } } } } */ /* amdgcn can do this in one vector. */ /* { dg-final { scan-tree-dump-times "optimized: basic block" 1 "slp2" { target amdgcn-*-* } } } */ -- 2.36.3
[PATCH] test regression fix: Remove xfail for variable length targets
Recently noticed there is an XPASS in RISC-V: XPASS: gcc.dg/vect/bb-slp-43.c -flto -ffat-lto-objects scan-tree-dump-not slp2 "vector operands from scalars" XPASS: gcc.dg/vect/bb-slp-43.c scan-tree-dump-not slp2 "vector operands from scalars" And checked both ARM SVE and RVV: https://godbolt.org/z/T9cPa7fh3 both have the same slp2 dump. So I guess ARM SVE has the same XPASS in this test. gcc/testsuite/ChangeLog: * gcc.dg/vect/bb-slp-43.c: Remove xfail for variable length. --- gcc/testsuite/gcc.dg/vect/bb-slp-43.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-43.c b/gcc/testsuite/gcc.dg/vect/bb-slp-43.c index dad2d24262d..40bd2e0dfbf 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-43.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-43.c @@ -14,4 +14,4 @@ f (int *restrict x, short *restrict y) } /* { dg-final { scan-tree-dump-not "mixed mask and nonmask" "slp2" } } */ -/* { dg-final { scan-tree-dump-not "vector operands from scalars" "slp2" { target { { vect_int && vect_bool_cmp } && { vect_unpack && vect_hw_misalign } } xfail { vect_variable_length && { ! vect256 } } } } } */ +/* { dg-final { scan-tree-dump-not "vector operands from scalars" "slp2" { target { { vect_int && vect_bool_cmp } && { vect_unpack && vect_hw_misalign } } } } } */ -- 2.36.3
[PATCH] RISC-V: Report Sorry when users enable RVV in big-endian mode [PR113404]
As PR113404 mentioned: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113404 We have ICE when we enable RVV in big-endian mode: during RTL pass: expand a-float-point-dynamic-frm-66.i:2:14: internal compiler error: in to_constant, at poly-int.h:588 0xab4c2c poly_int<2u, unsigned short>::to_constant() const /repo/gcc-trunk/gcc/poly-int.h:588 0xab4de1 poly_int<2u, unsigned short>::to_constant() const /repo/gcc-trunk/gcc/tree.h:4055 0xab4de1 default_function_arg_padding(machine_mode, tree_node const*) /repo/gcc-trunk/gcc/targhooks.cc:844 0x12e2327 locate_and_pad_parm(machine_mode, tree_node*, int, int, int, tree_node*, args_size*, locate_and_pad_arg_data*) /repo/gcc-trunk/gcc/function.cc:4061 0x12e2aca assign_parm_find_entry_rtl /repo/gcc-trunk/gcc/function.cc:2614 0x12e2c89 assign_parms /repo/gcc-trunk/gcc/function.cc:3693 0x12e59df expand_function_start(tree_node*) /repo/gcc-trunk/gcc/function.cc:5152 0x112fafb execute /repo/gcc-trunk/gcc/cfgexpand.cc:6739 Report users that we don't support RVV in big-endian mode for the following reasons: 1. big-endian in RISC-V is pretty rare case. 2. We didn't test RVV in big-endian and we don't have enough time to test it since it's stage 4 now. Naive disallow RVV in big-endian. Tested no regression, ok for trunk ? PR target/113404 gcc/ChangeLog: * config/riscv/riscv.cc (riscv_override_options_internal): Report sorry for RVV in big-endian mode. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/base/big_endian-1.c: New test. * gcc.target/riscv/rvv/base/big_endian-2.c: New test. 
--- gcc/config/riscv/riscv.cc | 5 + gcc/testsuite/gcc.target/riscv/rvv/base/big_endian-1.c | 5 + gcc/testsuite/gcc.target/riscv/rvv/base/big_endian-2.c | 5 + 3 files changed, 15 insertions(+) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/big_endian-1.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/big_endian-2.c diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 89caf156f03..41626fa34e4 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -8787,6 +8787,11 @@ riscv_override_options_internal (struct gcc_options *opts) sorry ("Current RISC-V GCC cannot support VLEN greater than 4096bit for " "'V' Extension"); + /* FIXME: We don't support RVV in big-endian for now, we may enable RVV with + big-endian after finishing full coverage testing. */ + if (TARGET_VECTOR && TARGET_BIG_ENDIAN) +sorry ("Current RISC-V GCC cannot support RVV in big-endian mode"); + /* Convert -march to a chunks count. */ riscv_vector_chunks = riscv_convert_vector_bits (opts); } diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/big_endian-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/big_endian-1.c new file mode 100644 index 000..9eaf7ad33b2 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/big_endian-1.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -mbig-endian -O3" } */ + +#pragma riscv intrinsic "vector" +vfloat32m1_t foo (vfloat32m1_t) {} // { dg-excess-errors "sorry, unimplemented: Current RISC-V GCC cannot support RVV in big-endian mode" } diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/big_endian-2.c b/gcc/testsuite/gcc.target/riscv/rvv/base/big_endian-2.c new file mode 100644 index 000..86cf58370bf --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/big_endian-2.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gc_zve32x -mabi=lp64d -mbig-endian -O3" } */ + +#pragma riscv intrinsic "vector" +vint32m1_t foo (vint32m1_t) {} // { 
dg-excess-errors "sorry, unimplemented: Current RISC-V GCC cannot support RVV in big-endian mode" } -- 2.36.3
[Committed V2] RISC-V: Fix regression (GCC-14 compare with GCC-13.2) of SHA256 from coremark-pro
This patch fixes a -70% performance drop from GCC-13.2 to GCC-14 with -march=rv64gcv in real hardware. The root cause is that an incorrect cost model causes inefficient vectorization, which makes performance drop significantly. So this patch does: 1. Adjust vector to scalar cost by introducing v to scalar reg move. 2. Adjust vec_construct cost since we do spend NUNITS instructions to construct the vector. Tested on both RV32/RV64 no regression, Rebase to the trunk and commit it as it is approved by Robin. PR target/113247 gcc/ChangeLog: * config/riscv/riscv-protos.h (struct regmove_vector_cost): Add vector to scalar regmove. * config/riscv/riscv-vector-costs.cc (adjust_stmt_cost): Ditto. * config/riscv/riscv.cc (riscv_builtin_vectorization_cost): Adjust vec_construct cost. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/vls/reduc-19.c: Adapt test. * gcc.target/riscv/rvv/autovec/vls/reduc-20.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-21.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c: New test. 
--- gcc/config/riscv/riscv-protos.h | 2 + gcc/config/riscv/riscv-vector-costs.cc| 3 + gcc/config/riscv/riscv.cc | 4 +- .../vect/costmodel/riscv/rvv/pr113247-1.c | 195 ++ .../vect/costmodel/riscv/rvv/pr113247-2.c | 6 + .../vect/costmodel/riscv/rvv/pr113247-3.c | 6 + .../vect/costmodel/riscv/rvv/pr113247-4.c | 6 + .../riscv/rvv/autovec/vls/reduc-19.c | 2 +- .../riscv/rvv/autovec/vls/reduc-20.c | 2 +- .../riscv/rvv/autovec/vls/reduc-21.c | 2 +- 10 files changed, 224 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 4f3b677f4f9..21f6dadf113 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -255,6 +255,8 @@ struct regmove_vector_cost { const int GR2VR; const int FR2VR; + const int VR2GR; + const int VR2FR; }; /* Cost for vector insn classes. */ diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 90ab93b7506..7c9840df4e9 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -1056,6 +1056,9 @@ adjust_stmt_cost (enum vect_cost_for_stmt kind, tree vectype, int stmt_cost) case scalar_to_vec: return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->FR2VR : costs->regmove->GR2VR); +case vec_to_scalar: + return stmt_cost += (FLOAT_TYPE_P (vectype) ? 
costs->regmove->VR2FR + : costs->regmove->VR2GR); default: break; } diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index ee1a57b321d..568db90a27d 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -395,6 +395,8 @@ static const scalable_vector_cost rvv_vla_vector_cost = { static const regmove_vector_cost rvv_regmove_vector_cost = { 2, /* GR2VR */ 2, /* FR2VR */ + 2, /* VR2GR */ + 2, /* VR2FR */ }; /* Generic costs for vector insn classes. It is supposed to be the vector cost @@ -10522,7 +10524,7 @@ riscv_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, return fp ? common_costs->fp_stmt_cost : common_costs->int_stmt_cost; case vec_construct: - return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype)) - 1; + return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype)); default: gcc_unreachable (); diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c new file mode 100644 index 000..0d09a624a00 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c @@ -0,0 +1,195 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param=riscv-autovec-lmul=dynamic" } */ + +#include + +#define Ch(x,y,z) (z ^ (x & (y ^ z))) +#define Maj(x,y,z) ((x & y) | (z & (x | y))) + +#define SHR(x, n)(x >> n) +#define ROTR(x,n)(SHR(x,n) | (x << (32 - n))) +#define S1(x)(ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25)) +#define S0(x)(ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22)) + +#define s1(x)(ROTR(x,17) ^ ROTR(x,19) ^ SHR(x,10)) +#define s0(x)(ROTR
[Committed V3] RISC-V: Adjust loop len by costing 1 when NITER < VF
Rebase in v3: Rebase to the trunk and commit it as it's approved by Robin. Update in v2: Add dynmaic lmul test. This patch fixes the regression between GCC 13.2.0 and trunk GCC (GCC-14) GCC 13.2.0: lui a5,%hi(a) li a4,19 sb a4,%lo(a)(a5) li a0,0 ret Trunk GCC: vsetvli a5,zero,e8,mf2,ta,ma li a4,-32768 vid.v v1 vsetvli zero,zero,e16,m1,ta,ma addiw a4,a4,104 vmv.v.i v3,15 lui a1,%hi(a) li a0,19 vsetvli zero,zero,e8,mf2,ta,ma vadd.vi v1,v1,1 sb a0,%lo(a)(a1) vsetvli zero,zero,e16,m1,ta,ma vzext.vf2 v2,v1 vmv.v.x v1,a4 vminu.vvv2,v2,v3 vsrl.vv v1,v1,v2 vslidedown.vi v1,v1,17 vmv.x.s a0,v1 sneza0,a0 ret The root cause we are vectorizing the codes inefficiently since we doesn't cost len when NITERS < VF. Leverage loop control of mask targets or rs6000 fixes the regression. Tested no regression. Ok for trunk ? PR target/113281 gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (costs::adjust_vect_cost_per_loop): New function. (costs::finish_cost): Adjust cost for LOOP LEN with NITERS < VF. * config/riscv/riscv-vector-costs.h: New function. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/pr113281-3.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113281-4.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113281-5.c: New test. 
--- gcc/config/riscv/riscv-vector-costs.cc| 57 +++ gcc/config/riscv/riscv-vector-costs.h | 2 + .../vect/costmodel/riscv/rvv/pr113281-3.c | 18 ++ .../vect/costmodel/riscv/rvv/pr113281-4.c | 18 ++ .../vect/costmodel/riscv/rvv/pr113281-5.c | 18 ++ 5 files changed, 113 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-3.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-4.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-5.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 090275c7efe..90ab93b7506 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -1097,9 +1097,66 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind, return record_stmt_cost (stmt_info, where, count * stmt_cost); } +/* For some target specific vectorization cost which can't be handled per stmt, + we check the requisite conditions and adjust the vectorization cost + accordingly if satisfied. One typical example is to model model and adjust + loop_len cost for known_lt (NITERS, VF). */ + +void +costs::adjust_vect_cost_per_loop (loop_vec_info loop_vinfo) +{ + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo) + && !LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)) +{ + /* In middle-end loop vectorizer, we don't count the loop_len cost in +vect_estimate_min_profitable_iters when NITERS < VF, that is, we only +count cost of len that we need to iterate loop more than once with VF. +It's correct for most of the cases: + +E.g. VF = [4, 4] + for (int i = 0; i < 3; i ++) +a[i] += b[i]; + +We don't need to cost MIN_EXPR or SELECT_VL for the case above. + +However, for some inefficient vectorized cases, it does use MIN_EXPR +to generate len. + +E.g. VF = [256, 256] + +Loop body: + # loop_len_110 = PHI <18(2), _119(11)> + ... + _117 = MIN_EXPR ; + _118 = 18 - _117; + _119 = MIN_EXPR <_118, POLY_INT_CST [256, 256]>; + ... 
+ +Epilogue: + ... + _112 = .VEC_EXTRACT (vect_patt_27.14_109, _111); + +We cost 1 unconditionally for this situation like other targets which +apply mask as the loop control. */ + rgroup_controls *rgc; + unsigned int num_vectors_m1; + unsigned int body_stmts = 0; + FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc) + if (rgc->type) + body_stmts += num_vectors_m1 + 1; + + add_stmt_cost (body_stmts, scalar_stmt, NULL, NULL, NULL_TREE, 0, +vect_body); +} +} + void costs::finish_cost (const vector_costs *scalar_costs) { + if (loop_vec_info loop_vinfo = dyn_cast (m_vinfo)) +{ + adjust_vect_cost_per_loop (loop_vinfo); +} vector_costs::finish_cost (scalar_costs); } diff --git a/gcc/config/riscv/riscv-vector-costs.h b/gcc/config/riscv/riscv-vector-costs.h index dc0d61f5d4a..4e2bbfd5ca9 100644 --- a/gcc/config/riscv/riscv-vector-costs.h +++ b/gcc/config/riscv/riscv-vector-costs.h @@ -96,6 +96,8 @@ private: V_REGS spills according to the analysis. */ bool m_has_unexpected_spills_p = false; void record_potential_une
[Committed] RISC-V: Add optimized dump check of VLS reduc tests
Add more dump checks to robustify the tests. Committed. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/vls/reduc-1.c: Add dump check. * gcc.target/riscv/rvv/autovec/vls/reduc-10.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-11.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-12.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-13.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-14.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-15.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-16.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-17.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-18.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-19.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-2.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-20.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-21.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-3.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-4.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-5.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-6.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-7.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-8.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-9.c: Ditto. 
--- .../gcc.target/riscv/rvv/autovec/vls/reduc-1.c | 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-10.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-11.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-12.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-13.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-14.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-15.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-16.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-17.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-18.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-19.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-2.c | 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-20.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-21.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-3.c | 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-4.c | 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-5.c | 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-6.c | 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-7.c | 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-8.c | 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-9.c | 14 +- 21 files changed, 273 insertions(+), 21 deletions(-) diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-1.c index 2db25a2b05d..b6d8e6a51ed 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-1.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-1.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8" } */ +/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -fdump-tree-optimized-details" } */ #include "def.h" @@ -29,3 +29,15 @@ DEF_REDUC_PLUS (uint8_t, 4096) /* { dg-final { scan-assembler-times {vredsum\.vs} 22 } } */ /* { dg-final { scan-assembler-not {csrr} } } */ +/* { dg-final { scan-tree-dump-not "1,1" 
"optimized" } } */ +/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */ +/* { dg-final { scan-tree-dump-not "4,4" "optimized" } } */ +/* { dg-final { scan-tree-dump-not "16,16" "optimized" } } */ +/* { dg-final { scan-tree-dump-not "32,32" "optimized" } } */ +/* { dg-final { scan-tree-dump-not "64,64" "optimized" } } */ +/* { dg-final { scan-tree-dump-not "128,128" "optimized" } } */ +/* { dg-final { scan-tree-dump-not "256,256" "optimized" } } */ +/* { dg-final { scan-tree-dump-not "512,512" "optimized" } } */ +/* { dg-final { scan-tree-dump-not "1024,1024" "optimized" } } */ +/* { dg-final { scan-tree-dump-not "2048,2048" "optimized" } } */ +/* { dg-final { scan-tree-dump-not "4096,4096" "optimized" } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-10.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-10.c index cdbbe11f611..22aace423cf 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-10.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-10.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8" } */ +/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autove
[Committed] RISC-V: Fix attributes bug configuration of ternary instructions
This patch fixes the following FAILs: Running target riscv-sim/-march=rv64gcv/-mabi=lp64d/-mcmodel=medlow/--param=riscv-autovec-preference=fixed-vlmax FAIL: gcc.c-torture/execute/pr68532.c -O0 execution test FAIL: gcc.c-torture/execute/pr68532.c -O1 execution test FAIL: gcc.c-torture/execute/pr68532.c -O2 execution test FAIL: gcc.c-torture/execute/pr68532.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions execution test FAIL: gcc.c-torture/execute/pr68532.c -O3 -g execution test FAIL: gcc.c-torture/execute/pr68532.c -Os execution test FAIL: gcc.c-torture/execute/pr68532.c -O2 -flto -fno-use-linker-plugin -flto-partition=none execution test Running target riscv-sim/-march=rv64gcv/-mabi=lp64d/-mcmodel=medlow/--param=riscv-autovec-lmul=m2/--param=riscv-autovec-preference=fixed-vlmax FAIL: gcc.dg/vect/pr60196-1.c execution test FAIL: gcc.dg/vect/pr60196-1.c -flto -ffat-lto-objects execution test Running target riscv-sim/-march=rv64gcv/-mabi=lp64d/-mcmodel=medlow/--param=riscv-autovec-preference=fixed-vlmax FAIL: gcc.dg/vect/pr60196-1.c execution test FAIL: gcc.dg/vect/pr60196-1.c -flto -ffat-lto-objects execution test Running target riscv-sim/-march=rv64gcv_zvl256b/-mabi=lp64d/-mcmodel=medlow/--param=riscv-autovec-preference=fixed-vlmax FAIL: gcc.dg/vect/pr60196-1.c execution test FAIL: gcc.dg/vect/pr60196-1.c -flto -ffat-lto-objects execution test The root cause is attributes of ternary intructions are incorrect which cause AVL prop PASS and VSETVL PASS behave incorrectly. Tested no regression and committed. PR target/113393 gcc/ChangeLog: * config/riscv/vector.md: Fix ternary attributes. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/pr113393-1.c: New test. * gcc.target/riscv/rvv/autovec/pr113393-2.c: New test. * gcc.target/riscv/rvv/autovec/pr113393-3.c: New test. 
--- gcc/config/riscv/vector.md| 42 +-- .../gcc.target/riscv/rvv/autovec/pr113393-1.c | 24 +++ .../gcc.target/riscv/rvv/autovec/pr113393-2.c | 29 + .../gcc.target/riscv/rvv/autovec/pr113393-3.c | 5 +++ 4 files changed, 79 insertions(+), 21 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113393-1.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113393-2.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113393-3.c diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md index c1a282a27b3..ee4ee059a50 100644 --- a/gcc/config/riscv/vector.md +++ b/gcc/config/riscv/vector.md @@ -715,7 +715,7 @@ (const_int 1) (eq_attr "type" "vimuladd,vfmuladd") - (const_int 5)] + (const_int 2)] (const_int INVALID_ATTRIBUTE))) ;; The index of operand[] represents the machine mode of the instruction. @@ -5308,7 +5308,7 @@ vmv.v.v\t%0,%2\;vmadd.vv\t%0,%3,%4%p1" [(set_attr "type" "vimuladd") (set_attr "mode" "") - (set_attr "merge_op_idx" "4") + (set_attr "merge_op_idx" "2") (set_attr "vl_op_idx" "5") (set (attr "ta") (symbol_ref "riscv_vector::get_ta(operands[6])")) (set (attr "ma") (symbol_ref "riscv_vector::get_ma(operands[7])")) @@ -5339,7 +5339,7 @@ vmv.v.v\t%0,%4\;vmacc.vv\t%0,%2,%3%p1" [(set_attr "type" "vimuladd") (set_attr "mode" "") - (set_attr "merge_op_idx" "2") + (set_attr "merge_op_idx" "4") (set_attr "vl_op_idx" "5") (set (attr "ta") (symbol_ref "riscv_vector::get_ta(operands[6])")) (set (attr "ma") (symbol_ref "riscv_vector::get_ma(operands[7])")) @@ -5392,7 +5392,7 @@ vmv.v.v\t%0,%3\;vmadd.vx\t%0,%2,%4%p1" [(set_attr "type" "vimuladd") (set_attr "mode" "") - (set_attr "merge_op_idx" "4") + (set_attr "merge_op_idx" "3") (set_attr "vl_op_idx" "5") (set (attr "ta") (symbol_ref "riscv_vector::get_ta(operands[6])")) (set (attr "ma") (symbol_ref "riscv_vector::get_ma(operands[7])")) @@ -5424,7 +5424,7 @@ vmv.v.v\t%0,%4\;vmacc.vx\t%0,%2,%3%p1" [(set_attr "type" "vimuladd") (set_attr "mode" "") - (set_attr 
"merge_op_idx" "2") + (set_attr "merge_op_idx" "4") (set_attr "vl_op_idx" "5") (set (attr "ta") (symbol_ref "riscv_vector::get_ta(operands[6])")) (set (attr "ma") (symbol_ref "riscv_vector::get_ma(operands[7])")) @@ -5492,7 +5492,7 @@ vmv.v.v\t%0,%2\;vmadd.vx\t%0,%2,%4%p1" [(set_attr "type" "vimuladd") (set_attr "mode" "") - (set_attr "merge_op_idx" "4") + (set_attr "merge_op_idx" "3") (set_attr "vl_op_idx" "5") (set (attr "ta") (symbol_ref "riscv_vector::get_ta(operands[6])")) (set (attr "ma") (symbol_ref "riscv_vector::get_ma(operands[7])")) @@ -5525,7 +5525,7 @@ vmv.v.v\t%0,%4\;vmacc.vx\t%0,%2,%3%p1" [(set_attr "type" "vimuladd") (set_attr "mode" "") - (set_attr "merge_op_idx" "2") + (set_attr "merge_op_idx" "4") (set_attr "vl_op_idx" "5") (set (attr "ta") (symbol_ref "riscv_vector
[PATCH] RISC-V: Fix regression (GCC-14 compare with GCC-13.2) of SHA256 from coremark-pro
This patch fixes -70% performance drop from GCC-13.2 to GCC-14 with -march=rv64gcv in real hardware. The root cause is incorrect cost model cause inefficient vectorization which makes us performance drop significantly. So this patch does: 1. Adjust vector to scalar cost by introducing v to scalar reg move. 2. Adjust vec_construct cost since we does spend NUNITS instructions to construct the vector. Tested on both RV32/RV64 no regression, ok for trunk ? PR target/113247 gcc/ChangeLog: * config/riscv/riscv-protos.h (struct regmove_vector_cost): Add vector to scalar regmove. * config/riscv/riscv-vector-costs.cc (adjust_stmt_cost): Ditto. * config/riscv/riscv.cc (riscv_builtin_vectorization_cost): Adjust vec_construct cost. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/vls/reduc-19.c: Adapt test. * gcc.target/riscv/rvv/autovec/vls/reduc-20.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-21.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c: New test. 
--- gcc/config/riscv/riscv-protos.h | 2 + gcc/config/riscv/riscv-vector-costs.cc| 3 + gcc/config/riscv/riscv.cc | 4 +- .../vect/costmodel/riscv/rvv/pr113247-1.c | 195 ++ .../vect/costmodel/riscv/rvv/pr113247-2.c | 6 + .../vect/costmodel/riscv/rvv/pr113247-3.c | 6 + .../vect/costmodel/riscv/rvv/pr113247-4.c | 6 + .../riscv/rvv/autovec/vls/reduc-19.c | 11 +- .../riscv/rvv/autovec/vls/reduc-20.c | 11 +- .../riscv/rvv/autovec/vls/reduc-21.c | 11 +- 10 files changed, 251 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 4f3b677f4f9..21f6dadf113 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -255,6 +255,8 @@ struct regmove_vector_cost { const int GR2VR; const int FR2VR; + const int VR2GR; + const int VR2FR; }; /* Cost for vector insn classes. */ diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 8adf5700890..298702d2807 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -1069,6 +1069,9 @@ adjust_stmt_cost (enum vect_cost_for_stmt kind, tree vectype, int stmt_cost) case scalar_to_vec: return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->FR2VR : costs->regmove->GR2VR); +case vec_to_scalar: + return stmt_cost += (FLOAT_TYPE_P (vectype) ? 
costs->regmove->VR2FR + : costs->regmove->VR2GR); default: break; } diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index ee1a57b321d..568db90a27d 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -395,6 +395,8 @@ static const scalable_vector_cost rvv_vla_vector_cost = { static const regmove_vector_cost rvv_regmove_vector_cost = { 2, /* GR2VR */ 2, /* FR2VR */ + 2, /* VR2GR */ + 2, /* VR2FR */ }; /* Generic costs for vector insn classes. It is supposed to be the vector cost @@ -10522,7 +10524,7 @@ riscv_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, return fp ? common_costs->fp_stmt_cost : common_costs->int_stmt_cost; case vec_construct: - return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype)) - 1; + return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype)); default: gcc_unreachable (); diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c new file mode 100644 index 000..0d09a624a00 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c @@ -0,0 +1,195 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param=riscv-autovec-lmul=dynamic" } */ + +#include + +#define Ch(x,y,z) (z ^ (x & (y ^ z))) +#define Maj(x,y,z) ((x & y) | (z & (x | y))) + +#define SHR(x, n)(x >> n) +#define ROTR(x,n)(SHR(x,n) | (x << (32 - n))) +#define S1(x)(ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25)) +#define S0(x)(ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22)) + +#define s1(x)(ROTR(x,17) ^ ROTR(x,19) ^ SHR(x,10)) +#define s0(x)(ROTR(x, 7) ^ ROTR(x,18) ^ SHR(x, 3)) + +#define SH
[PATCH] RISC-V: Adjust loop len by costing 1 when NITER < VF
Update in v2: Add dynmaic lmul test. This patch fixes the regression between GCC 13.2.0 and trunk GCC (GCC-14) GCC 13.2.0: lui a5,%hi(a) li a4,19 sb a4,%lo(a)(a5) li a0,0 ret Trunk GCC: vsetvli a5,zero,e8,mf2,ta,ma li a4,-32768 vid.v v1 vsetvli zero,zero,e16,m1,ta,ma addiw a4,a4,104 vmv.v.i v3,15 lui a1,%hi(a) li a0,19 vsetvli zero,zero,e8,mf2,ta,ma vadd.vi v1,v1,1 sb a0,%lo(a)(a1) vsetvli zero,zero,e16,m1,ta,ma vzext.vf2 v2,v1 vmv.v.x v1,a4 vminu.vvv2,v2,v3 vsrl.vv v1,v1,v2 vslidedown.vi v1,v1,17 vmv.x.s a0,v1 sneza0,a0 ret The root cause we are vectorizing the codes inefficiently since we doesn't cost len when NITERS < VF. Leverage loop control of mask targets or rs6000 fixes the regression. Tested no regression. Ok for trunk ? PR target/113281 gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (costs::adjust_vect_cost_per_loop): New function. (costs::finish_cost): Adjust cost for LOOP LEN with NITERS < VF. * config/riscv/riscv-vector-costs.h: New function. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/pr113281-3.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113281-4.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113281-5.c: New test. 
--- gcc/config/riscv/riscv-vector-costs.cc| 57 +++ gcc/config/riscv/riscv-vector-costs.h | 2 + .../vect/costmodel/riscv/rvv/pr113281-3.c | 18 ++ .../vect/costmodel/riscv/rvv/pr113281-4.c | 18 ++ .../vect/costmodel/riscv/rvv/pr113281-5.c | 18 ++ 5 files changed, 113 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-3.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-4.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-5.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 1c3708f23a0..8adf5700890 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -1110,9 +1110,66 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind, return record_stmt_cost (stmt_info, where, count * stmt_cost); } +/* For some target specific vectorization cost which can't be handled per stmt, + we check the requisite conditions and adjust the vectorization cost + accordingly if satisfied. One typical example is to model model and adjust + loop_len cost for known_lt (NITERS, VF). */ + +void +costs::adjust_vect_cost_per_loop (loop_vec_info loop_vinfo) +{ + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo) + && !LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)) +{ + /* In middle-end loop vectorizer, we don't count the loop_len cost in +vect_estimate_min_profitable_iters when NITERS < VF, that is, we only +count cost of len that we need to iterate loop more than once with VF +(m_num_vector_iterations > 1). It's correct for most of the cases: + +E.g. VF = [4, 4] + for (int i = 0; i < 3; i ++) +a[i] += b[i]; + +We don't need to cost MIN_EXPR or SELECT_VL for the case above. + +However, for some inefficient vectorized cases, it does use MIN_EXPR +to generate len. + +E.g. VF = [256, 256] + +Loop body: + # loop_len_110 = PHI <18(2), _119(11)> + ... 
+ _117 = MIN_EXPR ; + _118 = 18 - _117; + _119 = MIN_EXPR <_118, POLY_INT_CST [256, 256]>; + ... + +Epilogue: + ... + _112 = .VEC_EXTRACT (vect_patt_27.14_109, _111); + +We cost 1 unconditionally for this situation like other targets which +apply mask as the loop control. */ + rgroup_controls *rgc; + unsigned int num_vectors_m1; + unsigned int body_stmts = 0; + FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc) + if (rgc->type) + body_stmts += num_vectors_m1 + 1; + + add_stmt_cost (body_stmts, scalar_stmt, NULL, NULL, NULL_TREE, 0, +vect_body); +} +} + void costs::finish_cost (const vector_costs *scalar_costs) { + if (loop_vec_info loop_vinfo = dyn_cast (m_vinfo)) +{ + adjust_vect_cost_per_loop (loop_vinfo); +} vector_costs::finish_cost (scalar_costs); } diff --git a/gcc/config/riscv/riscv-vector-costs.h b/gcc/config/riscv/riscv-vector-costs.h index 9bf041bb65c..3defd45fd4c 100644 --- a/gcc/config/riscv/riscv-vector-costs.h +++ b/gcc/config/riscv/riscv-vector-costs.h @@ -101,6 +101,8 @@ private: V_REGS spills according to the analysis. */ bool m_has_unexpected_spills_p = false; void record_potential_unexpected_spills (loop_vec_info); + + void
[PATCH] RISC-V: Adjust loop len by costing 1 when NITERS < VF [GCC 14 regression]
This patch fixes the regression between GCC 13.2.0 and trunk GCC (GCC-14) GCC 13.2.0: lui a5,%hi(a) li a4,19 sb a4,%lo(a)(a5) li a0,0 ret Trunk GCC: vsetvli a5,zero,e8,mf2,ta,ma li a4,-32768 vid.v v1 vsetvli zero,zero,e16,m1,ta,ma addiw a4,a4,104 vmv.v.i v3,15 lui a1,%hi(a) li a0,19 vsetvli zero,zero,e8,mf2,ta,ma vadd.vi v1,v1,1 sb a0,%lo(a)(a1) vsetvli zero,zero,e16,m1,ta,ma vzext.vf2 v2,v1 vmv.v.x v1,a4 vminu.vvv2,v2,v3 vsrl.vv v1,v1,v2 vslidedown.vi v1,v1,17 vmv.x.s a0,v1 sneza0,a0 ret The root cause we are vectorizing the codes inefficiently since we doesn't cost len when NITERS < VF. Leverage loop control of mask targets or rs6000 fixes the regression. Tested no regression. Ok for trunk ? PR target/113281 gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (costs::adjust_vect_cost_per_loop): New function. (costs::finish_cost): Adjust cost * config/riscv/riscv-vector-costs.h: New function. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/pr113281-3.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113281-4.c: New test. --- gcc/config/riscv/riscv-vector-costs.cc| 61 +++ gcc/config/riscv/riscv-vector-costs.h | 2 + .../vect/costmodel/riscv/rvv/pr113281-3.c | 18 ++ .../vect/costmodel/riscv/rvv/pr113281-4.c | 18 ++ 4 files changed, 99 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-3.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-4.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 1c3708f23a0..9c0b9a874de 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -1110,9 +1110,70 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind, return record_stmt_cost (stmt_info, where, count * stmt_cost); } +/* For some target specific vectorization cost which can't be handled per stmt, + we check the requisite conditions and adjust the vectorization cost + accordingly if satisfied. 
One typical example is to model model and adjust + loop_len cost for known_lt (NITERS, VF). */ + +void +costs::adjust_vect_cost_per_loop (loop_vec_info loop_vinfo) +{ + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo) + && !LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) + && m_num_vector_iterations == 1 + && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo), + LOOP_VINFO_VECT_FACTOR (loop_vinfo))) +{ + /* In middle-end loop vectorizer, we don't count the loop_len cost in +vect_estimate_min_profitable_iters when NITERS < VF, that is, we only +count cost of len that we need to iterate loop more than once with VF +(m_num_vector_iterations > 1). It's correct for most of the cases: + +E.g. VF = [4, 4] + for (int i = 0; i < 3; i ++) +a[i] += b[i]; + +We don't need to cost MIN_EXPR or SELECT_VL for the case above. + +However, for some inefficient vectorized cases, it does use MIN_EXPR +to generate len. + +E.g. VF = [256, 256] + +Loop body: + # loop_len_110 = PHI <18(2), _119(11)> + ... + _117 = MIN_EXPR ; + _118 = 18 - _117; + _119 = MIN_EXPR <_118, POLY_INT_CST [256, 256]>; + ... + +Epilogue: + ... + _112 = .VEC_EXTRACT (vect_patt_27.14_109, _111); + +We cost 1 unconditionally for this situation like other targets which +apply mask as the loop control. 
*/ + rgroup_controls *rgc; + unsigned int num_vectors_m1; + unsigned int body_stmts = 0; + FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc) + if (rgc->type) + body_stmts += num_vectors_m1 + 1; + + add_stmt_cost (body_stmts, scalar_stmt, NULL, NULL, NULL_TREE, 0, +vect_body); +} +} + void costs::finish_cost (const vector_costs *scalar_costs) { + if (loop_vec_info loop_vinfo = dyn_cast (m_vinfo)) +{ + adjust_vect_cost_per_loop (loop_vinfo); +} vector_costs::finish_cost (scalar_costs); } diff --git a/gcc/config/riscv/riscv-vector-costs.h b/gcc/config/riscv/riscv-vector-costs.h index 9bf041bb65c..3defd45fd4c 100644 --- a/gcc/config/riscv/riscv-vector-costs.h +++ b/gcc/config/riscv/riscv-vector-costs.h @@ -101,6 +101,8 @@ private: V_REGS spills according to the analysis. */ bool m_has_unexpected_spills_p = false; void record_potential_unexpected_spills (loop_vec_info); + + void adjust_vect_cost_per_loop (loop_vec_info); }; } // namespace riscv_ve
[PATCH V3] RISC-V: Adjust scalar_to_vec cost
1. Introduce vector regmove new tune info. 2. Adjust scalar_to_vec cost in add_stmt_cost. We will get optimal codegen after this patch with -march=rv64gcv_zvl256b: lui a5,%hi(a) li a4,19 sb a4,%lo(a)(a5) li a0,0 ret Tested on both RV32/RV64 no regression, Ok for trunk ? PR target/113281 gcc/ChangeLog: * config/riscv/riscv-protos.h (struct regmove_vector_cost): New struct. (struct cpu_vector_cost): Add regmove struct. (get_vector_costs): Export as global. * config/riscv/riscv-vector-costs.cc (adjust_stmt_cost): Adjust scalar_to_vec cost. (costs::add_stmt_cost): Ditto. * config/riscv/riscv.cc (get_common_costs): Export global function. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/pr113209.c: Adapt test. * gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113281-2.c: New test. --- gcc/config/riscv/riscv-protos.h | 11 gcc/config/riscv/riscv-vector-costs.cc| 23 + gcc/config/riscv/riscv.cc | 25 --- .../vect/costmodel/riscv/rvv/pr113281-1.c | 18 + .../vect/costmodel/riscv/rvv/pr113281-2.c | 18 + .../gcc.target/riscv/rvv/autovec/pr113209.c | 2 +- 6 files changed, 87 insertions(+), 10 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-2.c diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index e8c54c5be50..4f3b677f4f9 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -250,6 +250,13 @@ struct scalable_vector_cost : common_vector_cost E.g. fold_left reduction cost, lanes load/store cost, ..., etc. */ }; +/* Additional costs for register copies. Cost is for one register. */ +struct regmove_vector_cost +{ + const int GR2VR; + const int FR2VR; +}; + /* Cost for vector insn classes. */ struct cpu_vector_cost { @@ -276,6 +283,9 @@ struct cpu_vector_cost /* Cost of an VLA modes operations. 
*/ const scalable_vector_cost *vla; + + /* Cost of vector register move operations. */ + const regmove_vector_cost *regmove; }; /* Routines implemented in riscv-selftests.cc. */ @@ -764,5 +774,6 @@ struct riscv_tune_info { const struct riscv_tune_info * riscv_parse_tune (const char *, bool); +const cpu_vector_cost *get_vector_costs (); #endif /* ! GCC_RISCV_PROTOS_H */ diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 58ec0b9b503..1c3708f23a0 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -1055,6 +1055,26 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const return vector_costs::better_main_loop_than_p (other); } +/* Adjust vectorization cost after calling riscv_builtin_vectorization_cost. + For some statement, we would like to further fine-grain tweak the cost on + top of riscv_builtin_vectorization_cost handling which doesn't have any + information on statement operation codes etc. */ + +static unsigned +adjust_stmt_cost (enum vect_cost_for_stmt kind, tree vectype, int stmt_cost) +{ + const cpu_vector_cost *costs = get_vector_costs (); + switch (kind) +{ +case scalar_to_vec: + return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->FR2VR + : costs->regmove->GR2VR); +default: + break; +} + return stmt_cost; +} + unsigned costs::add_stmt_cost (int count, vect_cost_for_stmt kind, stmt_vec_info stmt_info, slp_tree, tree vectype, @@ -1082,6 +1102,9 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind, as one iteration of the VLA loop. 
*/ if (where == vect_body && m_unrolled_vls_niters) m_unrolled_vls_stmts += count * m_unrolled_vls_niters; + + if (vectype) + stmt_cost = adjust_stmt_cost (kind, vectype, stmt_cost); } return record_stmt_cost (stmt_info, where, count * stmt_cost); diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index f829014a589..ee1a57b321d 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -391,17 +391,24 @@ static const scalable_vector_cost rvv_vla_vector_cost = { }, }; +/* RVV register move cost. */ +static const regmove_vector_cost rvv_regmove_vector_cost = { + 2, /* GR2VR */ + 2, /* FR2VR */ +}; + /* Generic costs for vector insn classes. It is supposed to be the vector cost models used by default if no other cost model was specified. */ static const struct cpu_vector_cost generic_vector_cost = { - 1, /* scalar_int_stmt_cost */ - 1, /* scalar_fp_stmt_cost */ - 1, /* sca
[Committed] RISC-V: Enhance a testcase
This test should pass no matter how we adjust cost model. Remove -fno-vect-cost-model. Committed. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/fold-min-poly.c: Remove -fno-vect-cost-model --- gcc/testsuite/gcc.target/riscv/rvv/autovec/fold-min-poly.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/fold-min-poly.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/fold-min-poly.c index de4c472c76e..3f524dba868 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/fold-min-poly.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/fold-min-poly.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options " -march=rv64gcv_zvl128b -mabi=lp64d -O3 --param riscv-autovec-preference=scalable --param riscv-autovec-lmul=m1 -fno-vect-cost-model" } */ +/* { dg-options " -march=rv64gcv_zvl128b -mabi=lp64d -O3 --param riscv-autovec-preference=scalable --param riscv-autovec-lmul=m1" } */ void foo1 (int* restrict a, int* restrict b, int n) { -- 2.36.3
[PATCH V2] RISC-V: Adjust scalar_to_vec cost accurately
1. This patch set scalar_to_vec cost as 2 instead 1 since scalar move instruction is slightly more costly than normal rvv instructions (e.g. vadd.vv). 2. Adjust scalar_to_vec cost accurately according to the splat value, for example, a value like 32872, needs 2 more scalar instructions: so the cost = 2 (scalar instructions) + 2 (scalar move). We adjust the cost like this since it doesn need such many instructions in vectorized codes, wheras they are not needed in scalar codes. After this patch, no matter -march=rv64gcv_zvl256b or -march=rv64gcv_zvl4096b. We have optimal codgen: lui a5,%hi(a) li a4,19 sb a4,%lo(a)(a5) li a0,0 ret PR target/113281 gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (adjust_stmt_cost): Adjust scalar_to_vec cost accurately. (costs::add_stmt_cost): Ditto. * config/riscv/riscv.cc: Ditto. * config/riscv/t-riscv: Ditto. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/pr113209.c: Adapt test. * gcc.target/riscv/rvv/autovec/zve32f-1.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113281-2.c: New test. --- gcc/config/riscv/riscv-vector-costs.cc| 50 ++- gcc/config/riscv/riscv.cc | 4 +- gcc/config/riscv/t-riscv | 2 +- .../vect/costmodel/riscv/rvv/pr113281-1.c | 18 +++ .../vect/costmodel/riscv/rvv/pr113281-2.c | 18 +++ .../gcc.target/riscv/rvv/autovec/pr113209.c | 2 +- .../gcc.target/riscv/rvv/autovec/zve32f-1.c | 2 +- 7 files changed, 90 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-2.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 58ec0b9b503..fc377435e53 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -42,6 +42,7 @@ along with GCC; see the file COPYING3. 
If not see #include "backend.h" #include "tree-data-ref.h" #include "tree-ssa-loop-niter.h" +#include "emit-rtl.h" /* This file should be included last. */ #include "riscv-vector-costs.h" @@ -1055,6 +1056,50 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const return vector_costs::better_main_loop_than_p (other); } +/* Adjust vectorization cost after calling + targetm.vectorize.builtin_vectorization_cost. For some statement, we would + like to further fine-grain tweak the cost on top of + targetm.vectorize.builtin_vectorization_cost handling which doesn't have any + information on statement operation codes etc. */ + +static unsigned +adjust_stmt_cost (enum vect_cost_for_stmt kind, + struct _stmt_vec_info *stmt_info, int count, int stmt_cost) +{ + gimple *stmt = stmt_info->stmt; + switch (kind) +{ + case scalar_to_vec: { + stmt_cost *= count; + gcall *call = dyn_cast (stmt); + /* Adjust cost by counting the scalar value initialization. */ + unsigned int num + = call ? gimple_call_num_args (call) : gimple_num_ops (stmt); + unsigned int start = call ? 0 : 1; + + for (unsigned int i = start; i < num; i++) + { + tree op = call ? gimple_call_arg (call, i) : gimple_op (stmt, i); + if (TREE_CODE (op) == INTEGER_CST) + { + HOST_WIDE_INT value = tree_fits_shwi_p (op) ? tree_to_shwi (op) + : tree_to_uhwi (op); + /* We don't need to count scalar costs if it + is in range of [-16, 15] since we can use + vmv.v.i. */ + if (!IN_RANGE (value, -16, 15)) + stmt_cost += riscv_const_insns (gen_int_mode (value, Pmode)); + } + /* TODO: We don't count CONST_POLY_INT value for now. */ + } + return stmt_cost; + } +default: + break; +} + return count * stmt_cost; +} + unsigned costs::add_stmt_cost (int count, vect_cost_for_stmt kind, stmt_vec_info stmt_info, slp_tree, tree vectype, @@ -1082,9 +1127,12 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind, as one iteration of the VLA loop. 
*/ if (where == vect_body && m_unrolled_vls_niters) m_unrolled_vls_stmts += count * m_unrolled_vls_niters; + + if (vectype) + stmt_cost = adjust_stmt_cost (kind, stmt_info, count, stmt_cost); } - return record_stmt_cost (stmt_info, where, count * stmt_cost); + return record_stmt_cost (stmt_info, where, stmt_cost); } void diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index df9799d9c5e..a14fb36817a 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc
[PATCH] RISC-V: Increase scalar_to_vec_cost from 1 to 3
This patch fixes the following inefficient vectorized codes: vsetvli a5,zero,e8,mf2,ta,ma li a2,17 vid.v v1 li a4,-32768 vsetvli zero,zero,e16,m1,ta,ma addiw a4,a4,104 vmv.v.i v3,15 lui a1,%hi(a) li a0,19 vsetvli zero,zero,e8,mf2,ta,ma vadd.vx v1,v1,a2 sb a0,%lo(a)(a1) vsetvli zero,zero,e16,m1,ta,ma vzext.vf2 v2,v1 vmv.v.x v1,a4 vminu.vvv2,v2,v3 vsrl.vv v1,v1,v2 vslidedown.vi v1,v1,1 vmv.x.s a0,v1 sneza0,a0 ret The reason is scalar_to_vec_cost is too low. Consider in VEC_SET, we always have a slide + scalar move instruction, scalar_to_vec_cost = 1 (current cost) is not reasonable. I tried to set it as 2 but failed fix this case, that is, I need to set it as 3 to fix this case. No matter scalar move or slide instruction, I believe they are more costly than normal vector instructions (e.g. vadd.vv). So set it as 3 looks reasonable to me. After this patch: lui a5,%hi(a) li a4,19 sb a4,%lo(a)(a5) li a0,0 ret Tested on both RV32/RV64 no regression, Ok for trunk ? PR target/113281 gcc/ChangeLog: * config/riscv/riscv.cc: Set scalar_to_vec_cost as 3. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/pr113209.c: Adapt test. * gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c: New test. 
--- gcc/config/riscv/riscv.cc | 4 ++-- .../vect/costmodel/riscv/rvv/pr113281-1.c | 18 ++ .../gcc.target/riscv/rvv/autovec/pr113209.c| 2 +- 3 files changed, 21 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index df9799d9c5e..bcfb3c15a39 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -366,7 +366,7 @@ static const common_vector_cost rvv_vls_vector_cost = { 1, /* gather_load_cost */ 1, /* scatter_store_cost */ 1, /* vec_to_scalar_cost */ - 1, /* scalar_to_vec_cost */ + 3, /* scalar_to_vec_cost */ 1, /* permute_cost */ 1, /* align_load_cost */ 1, /* align_store_cost */ @@ -382,7 +382,7 @@ static const scalable_vector_cost rvv_vla_vector_cost = { 1, /* gather_load_cost */ 1, /* scatter_store_cost */ 1, /* vec_to_scalar_cost */ -1, /* scalar_to_vec_cost */ +3, /* scalar_to_vec_cost */ 1, /* permute_cost */ 1, /* align_load_cost */ 1, /* align_store_cost */ diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c new file mode 100644 index 000..331cf961a1f --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv_zvl256b -mabi=lp64d -O3 -ftree-vectorize -fdump-tree-vect-details" } */ + +unsigned char a; + +int main() { + short b = a = 0; + for (; a != 19; a++) +if (a) + b = 32872 >> a; + + if (b == 0) +return 0; + else +return 1; +} + +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113209.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113209.c index 081ee369394..70aae151000 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113209.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113209.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options 
"-march=rv64gcv_zvl256b -mabi=lp64d -O3" } */ +/* { dg-options "-march=rv64gcv_zvl256b -mabi=lp64d -O3 -fno-vect-cost-model" } */ int b, c, d, f, i, a; int e[1] = {0}; -- 2.36.3
[PATCH] RISC-V: VLA preempts VLS on unknown NITERS loop
This patch fixes the known issues on SLP cases: ble a2,zero,.L11 addiw t1,a2,-1 li a5,15 bleut1,a5,.L9 srliw a7,t1,4 sllia7,a7,7 lui t3,%hi(.LANCHOR0) lui a6,%hi(.LANCHOR0+128) addit3,t3,%lo(.LANCHOR0) li a4,128 addia6,a6,%lo(.LANCHOR0+128) add a7,a7,a0 addia3,a1,37 mv a5,a0 vsetvli zero,a4,e8,m8,ta,ma vle8.v v24,0(t3) vle8.v v16,0(a6) .L4: li a6,128 vle8.v v0,0(a3) vrgather.vv v8,v0,v24 vadd.vv v8,v8,v16 vse8.v v8,0(a5) add a5,a5,a6 add a3,a3,a6 bne a5,a7,.L4 andia5,t1,-16 mv t1,a5 .L3: subwa2,a2,a5 li a4,1 beq a2,a4,.L5 sllia5,a5,32 srlia5,a5,32 addiw a2,a2,-1 sllia5,a5,3 csrra4,vlenb sllia6,a2,32 addit3,a5,37 srlia3,a6,29 sllia4,a4,2 add t3,a1,t3 add a5,a0,a5 mv t5,a3 bgtua3,a4,.L14 .L6: li a4,50790400 addia4,a4,1541 li a6,67633152 addia6,a6,513 sllia4,a4,32 add a4,a4,a6 vsetvli t4,zero,e64,m4,ta,ma vmv.v.x v16,a4 vsetvli a6,zero,e16,m8,ta,ma vid.v v8 vsetvli zero,t5,e8,m4,ta,ma vle8.v v20,0(t3) vsetvli a6,zero,e16,m8,ta,ma csrra7,vlenb vand.vi v8,v8,-8 vsetvli zero,zero,e8,m4,ta,ma sllia4,a7,2 vrgatherei16.vv v4,v20,v8 vadd.vv v4,v4,v16 vsetvli zero,t5,e8,m4,ta,ma vse8.v v4,0(a5) bgtua3,a4,.L15 .L7: addwt1,a2,t1 .L5: slliw a5,t1,3 add a1,a1,a5 lui a4,%hi(.LC2) add a0,a0,a5 lbu a3,37(a1) addia5,a4,%lo(.LC2) vsetivlizero,8,e8,mf2,ta,ma vmv.v.x v1,a3 vle8.v v2,0(a5) vadd.vv v1,v1,v2 vse8.v v1,0(a0) .L11: ret .L15: sub a3,a3,a4 bleua3,a4,.L8 mv a3,a4 .L8: li a7,50790400 csrra4,vlenb sllia4,a4,2 addia7,a7,1541 li t4,67633152 add t3,t3,a4 vsetvli zero,a3,e8,m4,ta,ma sllia7,a7,32 addit4,t4,513 vle8.v v20,0(t3) add a4,a5,a4 add a7,a7,t4 vsetvli a5,zero,e64,m4,ta,ma vmv.v.x v16,a7 vsetvli a6,zero,e16,m8,ta,ma vid.v v8 vand.vi v8,v8,-8 vsetvli zero,zero,e8,m4,ta,ma vrgatherei16.vv v4,v20,v8 vadd.vv v4,v4,v16 vsetvli zero,a3,e8,m4,ta,ma vse8.v v4,0(a4) j .L7 .L14: mv t5,a4 j .L6 .L9: li a5,0 li t1,0 j .L3 The vectorization codegen is quite inefficient since we choose a VLS modes to vectorize the loop body with epilogue choosing a VLA modes. 
cost.c:6:21: note: * Choosing vector mode V128QI cost.c:6:21: note: * Choosing epilogue vector mode RVVM4QI As we known, in RVV side, we have VLA modes and VLS modes. VLAmodes support partial vectors wheras VLSmodes support full vectors. The goal we add VLSmodes is to improve the codegen of known NITERS or SLP codes. If NITERS is unknown, that is i < n, n is unknown. We will always have partial vectors vectorization. It can be loop body or epilogue. In this case, It's always more efficient to apply VLA partial vectorization on loop body which doesn't have epilogue. After this patch: f: ble a2,zero,.L7 li a5,1 beq a2,a5,.L5 li a6,50790400 addia6,a6,1541 li a4,67633152 addia4,a4,513 csrra5,vlenb addiw a2,a2,-1 sllia6,a6,32 add a6,a6,a4 sllia5,a5,2 sllia4,a2,32 vsetvli t1,zero,e64,m4,ta,ma srlia3,a4,29 neg t4,a5 addia7,a1,37 mv a4,a0 vmv.v.x v12,a6 vsetvli t3,zero,e16,m8,ta,ma vid.v v16 vand.vi v16,v16,-8 .L4: minua6,a3,a5 vsetvli zero,a6,e8,m4,ta,ma vle8.v v8,0(a7) vsetvli t3,zero,e8,m4,ta,ma mv t1,a3 vrgatherei16.vv v4,v8,v16 vsetvli zero,a6,e8,m4,ta,ma vadd.vv v4,v4,v12 vse8.v v4,0(a4) add a7,a7,a5 add a4,a4,a5 add a3,a3,t4 bgtut1,a5,.L4 .L3: slliw a2,a2,3 add a1,a1,a2 lui a5,%hi(.LC0) lbu a4,37(a1) add a0,a0,a2 addia5,a5,%lo(.LC0) vsetivlizero,8,e8,mf2,ta,ma vmv.v.x v1,a4 vle8.v v2,0(a5) vadd.vv v1,v1
[PATCH V2] RISC-V: Switch RVV cost model.
This patch is preparing patch for the following cost model tweak. Since we don't have vector cost model in default tune info (rocket), we set the cost model default as generic cost model by default. The reason we want to switch to generic vector cost model is the default cost model generates inferior codegen for various benchmarks. For example, PR113247, we have performance bug that we end up having over 70% performance drop of SHA256. Currently, no matter how we adapt cost model, we are not able to fix the performance bug since we always use default cost model by default. Also, tweak the generic cost model back to default cost model since we have some FAILs in current tests. After this patch, we (me an Robin) can work on cost model tunning together to improve performane in various benchmarks. Tested on both RV32 and RV64, ok for trunk ? gcc/ChangeLog: * config/riscv/riscv.cc (get_common_costs): Switch RVV cost model. (get_vector_costs): Ditto. (riscv_builtin_vectorization_cost): Ditto. --- gcc/config/riscv/riscv.cc | 144 -- 1 file changed, 75 insertions(+), 69 deletions(-) diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 32183d63180..cca01fd54d9 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -352,48 +352,49 @@ const enum reg_class riscv_regno_to_class[FIRST_PSEUDO_REGISTER] = { VD_REGS, VD_REGS,VD_REGS,VD_REGS, }; -/* Generic costs for VLS vector operations. */ -static const common_vector_cost generic_vls_vector_cost = { +/* RVV costs for VLS vector operations. 
*/ +static const common_vector_cost rvv_vls_vector_cost = { 1, /* int_stmt_cost */ 1, /* fp_stmt_cost */ 1, /* gather_load_cost */ 1, /* scatter_store_cost */ - 2, /* vec_to_scalar_cost */ + 1, /* vec_to_scalar_cost */ 1, /* scalar_to_vec_cost */ - 2, /* permute_cost */ + 1, /* permute_cost */ 1, /* align_load_cost */ 1, /* align_store_cost */ - 1, /* unalign_load_cost */ - 1, /* unalign_store_cost */ + 2, /* unalign_load_cost */ + 2, /* unalign_store_cost */ }; -/* Generic costs for VLA vector operations. */ -static const scalable_vector_cost generic_vla_vector_cost = { +/* RVV costs for VLA vector operations. */ +static const scalable_vector_cost rvv_vla_vector_cost = { { 1, /* int_stmt_cost */ 1, /* fp_stmt_cost */ 1, /* gather_load_cost */ 1, /* scatter_store_cost */ -2, /* vec_to_scalar_cost */ +1, /* vec_to_scalar_cost */ 1, /* scalar_to_vec_cost */ -2, /* permute_cost */ +1, /* permute_cost */ 1, /* align_load_cost */ 1, /* align_store_cost */ -1, /* unalign_load_cost */ -1, /* unalign_store_cost */ +2, /* unalign_load_cost */ +2, /* unalign_store_cost */ }, }; -/* Generic costs for vector insn classes. */ +/* Generic costs for vector insn classes. It is supposed to be the vector cost + models used by default if no other cost model was specified. */ static const struct cpu_vector_cost generic_vector_cost = { - 1, /* scalar_int_stmt_cost */ - 1, /* scalar_fp_stmt_cost */ - 1, /* scalar_load_cost */ - 1, /* scalar_store_cost */ - 3, /* cond_taken_branch_cost */ - 1, /* cond_not_taken_branch_cost */ - &generic_vls_vector_cost, /* vls */ - &generic_vla_vector_cost, /* vla */ + 1, /* scalar_int_stmt_cost */ + 1, /* scalar_fp_stmt_cost */ + 1, /* scalar_load_cost */ + 1, /* scalar_store_cost */ + 3, /* cond_taken_branch_cost */ + 1, /* cond_not_taken_branch_cost */ + &rvv_vls_vector_cost, /* vls */ + &rvv_vla_vector_cost, /* vla */ }; /* Costs to use when optimizing for rocket. 
*/ @@ -10372,11 +10373,10 @@ riscv_frame_pointer_required (void) return riscv_save_frame_pointer && !crtl->is_leaf; } -/* Return the appropriate common costs for vectors of type VECTYPE. */ +/* Return the appropriate common costs according to VECTYPE from COSTS. */ static const common_vector_cost * -get_common_costs (tree vectype) +get_common_costs (const cpu_vector_cost *costs, tree vectype) { - const cpu_vector_cost *costs = tune_param->vec_costs; gcc_assert (costs); if (vectype && riscv_v_ext_vls_mode_p (TYPE_MODE (vectype))) @@ -10384,78 +10384,84 @@ get_common_costs (tree vectype) return costs->vla; } +/* Return the CPU vector costs according to -mtune if tune info has non-NULL + vector cost. Otherwide, return the default generic vector costs. */ +static const cpu_vector_cost * +get_vector_costs () +{ + const cpu_vector_cost *costs = tune_param->vec_costs; + if (!costs) +return &generic_vector_cost; + return costs; +} + /* Implement targetm.vectorize.builtin_vector
[PATCH] RISC-V: Switch RVV cost model to generic vector cost model
This patch is preparing patch for the following cost model tweak. Since we don't have vector cost model in default tune info (rocket), we set the cost model default as generic cost model by default. The reason we want to switch to generic vector cost model is the default cost model generates inferior codegen for various benchmarks. For example, PR113247, we have performance bug that we end up having over 70% performance drop of SHA256. Currently, no matter how we adapt cost model, we are not able to fix the performance bug since we always use default cost model by default. Also, tweak the generic cost model back to default cost model since we have some FAILs in current tests. After this patch, we (me an Robin) can work on cost model tunning together to improve performane in various benchmarks. Tested on both RV32 and RV64, ok for trunk ? gcc/ChangeLog: * config/riscv/riscv.cc (get_common_costs): Switch RVV cost model. (get_vector_costs): Ditto. (riscv_builtin_vectorization_cost): Ditto. --- gcc/config/riscv/riscv.cc | 117 -- 1 file changed, 61 insertions(+), 56 deletions(-) diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 32183d63180..d72058039ce 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -358,13 +358,13 @@ static const common_vector_cost generic_vls_vector_cost = { 1, /* fp_stmt_cost */ 1, /* gather_load_cost */ 1, /* scatter_store_cost */ - 2, /* vec_to_scalar_cost */ + 1, /* vec_to_scalar_cost */ 1, /* scalar_to_vec_cost */ - 2, /* permute_cost */ + 1, /* permute_cost */ 1, /* align_load_cost */ 1, /* align_store_cost */ - 1, /* unalign_load_cost */ - 1, /* unalign_store_cost */ + 2, /* unalign_load_cost */ + 2, /* unalign_store_cost */ }; /* Generic costs for VLA vector operations. 
*/ @@ -374,13 +374,13 @@ static const scalable_vector_cost generic_vla_vector_cost = { 1, /* fp_stmt_cost */ 1, /* gather_load_cost */ 1, /* scatter_store_cost */ -2, /* vec_to_scalar_cost */ +1, /* vec_to_scalar_cost */ 1, /* scalar_to_vec_cost */ -2, /* permute_cost */ +1, /* permute_cost */ 1, /* align_load_cost */ 1, /* align_store_cost */ -1, /* unalign_load_cost */ -1, /* unalign_store_cost */ +2, /* unalign_load_cost */ +2, /* unalign_store_cost */ }, }; @@ -10372,11 +10372,10 @@ riscv_frame_pointer_required (void) return riscv_save_frame_pointer && !crtl->is_leaf; } -/* Return the appropriate common costs for vectors of type VECTYPE. */ +/* Return the appropriate common costs according to VECTYPE from COSTS. */ static const common_vector_cost * -get_common_costs (tree vectype) +get_common_costs (const cpu_vector_cost *costs, tree vectype) { - const cpu_vector_cost *costs = tune_param->vec_costs; gcc_assert (costs); if (vectype && riscv_v_ext_vls_mode_p (TYPE_MODE (vectype))) @@ -10384,78 +10383,84 @@ get_common_costs (tree vectype) return costs->vla; } +/* Return the CPU vector costs according to -mtune if tune info has non-NULL + vector cost. Otherwide, return the default generic vector costs. */ +static const cpu_vector_cost * +get_vector_costs () +{ + const cpu_vector_cost *costs = tune_param->vec_costs; + if (!costs) +return &generic_vector_cost; + return costs; +} + /* Implement targetm.vectorize.builtin_vectorization_cost. 
*/ static int riscv_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, tree vectype, int misalign ATTRIBUTE_UNUSED) { - unsigned elements; - const cpu_vector_cost *costs = tune_param->vec_costs; + const cpu_vector_cost *costs = get_vector_costs (); bool fp = false; if (vectype != NULL) fp = FLOAT_TYPE_P (vectype); - if (costs != NULL) + const common_vector_cost *common_costs = get_common_costs (costs, vectype); + gcc_assert (common_costs != NULL); + switch (type_of_cost) { - const common_vector_cost *common_costs = get_common_costs (vectype); - gcc_assert (common_costs != NULL); - switch (type_of_cost) - { - case scalar_stmt: - return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost; +case scalar_stmt: + return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost; - case scalar_load: - return costs->scalar_load_cost; +case scalar_load: + return costs->scalar_load_cost; - case scalar_store: - return costs->scalar_store_cost; +case scalar_store: + return costs->scalar_store_cost; - case vector_stmt: - return fp ? common_costs->fp_stmt_cost : common_costs->int_stmt_cost; +case vector_stmt: + return fp ? common_costs->fp_stmt_cost : common_costs->int_stmt_cost; - case vector_load: - return common_costs->align_load_cost; +case vector_load: + return common_costs->
[PATCH] RISC-V: Refine unsigned avg_floor/avg_ceil
This patch is inspired by LLVM patches: https://github.com/llvm/llvm-project/pull/76550 https://github.com/llvm/llvm-project/pull/77473 Use vaaddu for AVG vectorization. Before this patch: vsetivlizero,8,e8,mf2,ta,ma vle8.v v3,0(a1) vle8.v v2,0(a2) vwaddu.vvv1,v3,v2 vsetvli zero,zero,e16,m1,ta,ma vadd.vi v1,v1,1 vsetvli zero,zero,e8,mf2,ta,ma vnsrl.wiv1,v1,1 vse8.v v1,0(a0) ret After this patch: vsetivlizero,8,e8,mf2,ta,ma csrwi vxrm,0 vle8.v v1,0(a1) vle8.v v2,0(a2) vaaddu.vv v1,v1,v2 vse8.v v1,0(a0) ret Note on signed averaging addition Based on the rvv spec, there is also a variant for signed averaging addition called vaadd. But AFAIU, no matter in which rounding mode, we cannot achieve the semantic of signed averaging addition through vaadd. Thus this patch only introduces vaaddu. More details in: https://github.com/riscv/riscv-v-spec/issues/935 https://github.com/riscv/riscv-v-spec/issues/934 Tested on both RV32 and RV64 no regression. Ok for trunk ? gcc/ChangeLog: * config/riscv/autovec.md (avg3_floor): Remove. (avg3_floor): New pattern. (avg3_ceil): Remove. (avg3_ceil): New pattern. (uavg3_floor): Ditto. (uavg3_ceil): Ditto. * config/riscv/riscv-protos.h (enum insn_flags): Add for average addition. (enum insn_type): Ditto. * config/riscv/riscv-v.cc: Ditto. * config/riscv/vector-iterators.md (ashiftrt): Remove. (ASHIFTRT): Ditto. * config/riscv/vector.md: Add VLS modes. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/vls/avg-1.c: Adapt test. * gcc.target/riscv/rvv/autovec/vls/avg-2.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/avg-3.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/avg-4.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/avg-5.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/avg-6.c: Ditto. * gcc.target/riscv/rvv/autovec/widen/vec-avg-rv32gcv.c: Ditto. * gcc.target/riscv/rvv/autovec/widen/vec-avg-rv64gcv.c: Ditto. 
--- gcc/config/riscv/autovec.md | 50 ++- gcc/config/riscv/riscv-protos.h | 8 +++ gcc/config/riscv/riscv-v.cc | 11 gcc/config/riscv/vector-iterators.md | 5 -- gcc/config/riscv/vector.md| 12 ++--- .../gcc.target/riscv/rvv/autovec/vls/avg-1.c | 4 +- .../gcc.target/riscv/rvv/autovec/vls/avg-2.c | 4 +- .../gcc.target/riscv/rvv/autovec/vls/avg-3.c | 4 +- .../gcc.target/riscv/rvv/autovec/vls/avg-4.c | 6 +-- .../gcc.target/riscv/rvv/autovec/vls/avg-5.c | 6 +-- .../gcc.target/riscv/rvv/autovec/vls/avg-6.c | 6 +-- .../riscv/rvv/autovec/widen/vec-avg-rv32gcv.c | 7 +-- .../riscv/rvv/autovec/widen/vec-avg-rv64gcv.c | 7 +-- 13 files changed, 86 insertions(+), 44 deletions(-) diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md index 775eaa825b0..706cd9717cb 100644 --- a/gcc/config/riscv/autovec.md +++ b/gcc/config/riscv/autovec.md @@ -2345,39 +2345,39 @@ ;; op[0] = (narrow) ((wide) op[1] + (wide) op[2] + 1)) >> 1; ;; - -(define_expand "avg3_floor" +(define_expand "avg3_floor" [(set (match_operand: 0 "register_operand") (truncate: -(:VWEXTI +(ashiftrt:VWEXTI (plus:VWEXTI - (any_extend:VWEXTI + (sign_extend:VWEXTI (match_operand: 1 "register_operand")) - (any_extend:VWEXTI + (sign_extend:VWEXTI (match_operand: 2 "register_operand"))] "TARGET_VECTOR" { /* First emit a widening addition. */ rtx tmp1 = gen_reg_rtx (mode); rtx ops1[] = {tmp1, operands[1], operands[2]}; - insn_code icode = code_for_pred_dual_widen (PLUS, , mode); + insn_code icode = code_for_pred_dual_widen (PLUS, SIGN_EXTEND, mode); riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops1); /* Then a narrowing shift. 
*/ rtx ops2[] = {operands[0], tmp1, const1_rtx}; - icode = code_for_pred_narrow_scalar (, mode); + icode = code_for_pred_narrow_scalar (ASHIFTRT, mode); riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops2); DONE; }) -(define_expand "avg3_ceil" +(define_expand "avg3_ceil" [(set (match_operand: 0 "register_operand") (truncate: -(:VWEXTI +(ashiftrt:VWEXTI (plus:VWEXTI (plus:VWEXTI - (any_extend:VWEXTI + (sign_extend:VWEXTI (match_operand: 1 "register_operand")) - (any_extend:VWEXTI + (sign_extend:VWEXTI (match_operand: 2 "register_operand"))) (const_int 1)] "TARGET_VECTOR" @@ -2385,7 +2385,7 @@ /* First emit a widening addition. */ rtx tmp1 = gen_reg_rtx (mode); rtx ops1[] = {tmp1, operands[1], operands[2]}; - insn_code icode = code_for
[PATCH V2] RISC-V: Minor tweak dynamic cost model
v2 update: Robostify tests. While working on cost model, I notice one case that dynamic lmul cost doesn't work well. Before this patch: foo: lui a4,%hi(.LANCHOR0) li a0,1953 li a1,63 addia4,a4,%lo(.LANCHOR0) li a3,64 vsetvli a2,zero,e32,mf2,ta,ma vmv.v.x v5,a0 vmv.v.x v4,a1 vid.v v3 .L2: vsetvli a5,a3,e32,mf2,ta,ma vadd.vi v2,v3,1 vadd.vv v1,v3,v5 mv a2,a5 vmacc.vvv1,v2,v4 sllia1,a5,2 vse32.v v1,0(a4) sub a3,a3,a5 add a4,a4,a1 vsetvli a5,zero,e32,mf2,ta,ma vmv.v.x v1,a2 vadd.vv v3,v3,v1 bne a3,zero,.L2 li a0,0 ret Unexpected: Use scalable vector and LMUL = MF2 which is wasting computation resources. Ideally, we should use LMUL = M8 VLS modes. The root cause is the dynamic LMUL heuristic dominates the VLS heuristic. Adapt the cost model heuristic. After this patch: foo: lui a4,%hi(.LANCHOR0) addia4,a4,%lo(.LANCHOR0) li a3,4096 li a5,32 li a1,2016 addia2,a4,128 addiw a3,a3,-32 vsetvli zero,a5,e32,m8,ta,ma li a0,0 vid.v v8 vsll.vi v8,v8,6 vadd.vx v16,v8,a1 vadd.vx v8,v8,a3 vse32.v v16,0(a4) vse32.v v8,0(a2) ret Tested on both RV32/RV64 no regression. Ok for trunk ? gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (costs::better_main_loop_than_p): Minior tweak. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c: Fix test. * gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c: Ditto. 
--- gcc/config/riscv/riscv-vector-costs.cc | 3 ++- .../gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c| 5 ++--- .../gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c| 5 ++--- .../gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c| 7 +++ 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index f4a1a789f23..e53f4a186f3 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -994,7 +994,8 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const vect_vf_for_cost (other_loop_vinfo)); /* Apply the unrolling heuristic described above m_unrolled_vls_niters. */ - if (bool (m_unrolled_vls_stmts) != bool (other->m_unrolled_vls_stmts)) + if (bool (m_unrolled_vls_stmts) != bool (other->m_unrolled_vls_stmts) + && m_cost_type != other->m_cost_type) { bool this_prefer_unrolled = this->prefer_unrolled_loop (); bool other_prefer_unrolled = other->prefer_unrolled_loop (); diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c index 3ddffa37fe4..89a6c678960 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c @@ -3,7 +3,7 @@ #include -#define N 40 +#define N 48 int a[N]; @@ -22,7 +22,6 @@ foo (){ return 0; } -/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */ /* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*16,\s*e32,\s*m4,\s*t[au],\s*m[au]} 1 } } */ -/* { dg-final { scan-assembler-times {vsetivli} 2 } } */ +/* { dg-final { scan-assembler-times {vsetivli} 1 } } */ /* { dg-final { scan-assembler-not {vsetvli} } } */ diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c index 7625ec5c4b1..86732ef2ce5 100644 --- 
a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c @@ -3,7 +3,7 @@ #include -#define N 40 +#define N 64 int a[N]; @@ -22,7 +22,6 @@ foo (){ return 0; } -/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */ /* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m8,\s*t[au],\s*m[au]} 1 } } */ -/* { dg-final { scan-assembler-times {vsetivli} 1 } } */ +/* { dg-final { scan-assembler-not {vsetivli} } } */ /* { dg-final { scan-assembler-times {vsetvli} 1 } } */ diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c index 7625ec5c4b1..a1fcb3f3443 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c @@ -1,9 +1,9 @@ /* { dg-do compile } */ -/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param
[PATCH] RISC-V: Minor tweak dynamic cost model
While working on cost model, I notice one case that dynamic lmul cost doesn't work well. Before this patch: foo: lui a4,%hi(.LANCHOR0) li a0,1953 li a1,63 addia4,a4,%lo(.LANCHOR0) li a3,64 vsetvli a2,zero,e32,mf2,ta,ma vmv.v.x v5,a0 vmv.v.x v4,a1 vid.v v3 .L2: vsetvli a5,a3,e32,mf2,ta,ma vadd.vi v2,v3,1 vadd.vv v1,v3,v5 mv a2,a5 vmacc.vvv1,v2,v4 sllia1,a5,2 vse32.v v1,0(a4) sub a3,a3,a5 add a4,a4,a1 vsetvli a5,zero,e32,mf2,ta,ma vmv.v.x v1,a2 vadd.vv v3,v3,v1 bne a3,zero,.L2 li a0,0 ret Unexpected: Use scalable vector and LMUL = MF2 which is wasting computation resources. Ideally, we should use LMUL = M8 VLS modes. The root cause is the dynamic LMUL heuristic dominates the VLS heuristic. Adapt the cost model heuristic. After this patch: foo: lui a4,%hi(.LANCHOR0) addia4,a4,%lo(.LANCHOR0) li a3,4096 li a5,32 li a1,2016 addia2,a4,128 addiw a3,a3,-32 vsetvli zero,a5,e32,m8,ta,ma li a0,0 vid.v v8 vsll.vi v8,v8,6 vadd.vx v16,v8,a1 vadd.vx v8,v8,a3 vse32.v v16,0(a4) vse32.v v8,0(a2) ret Tested on both RV32/RV64 no regression. Ok for trunk ? gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (costs::better_main_loop_than_p): Minior tweak. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c: Fix test. * gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c: Ditto. 
--- gcc/config/riscv/riscv-vector-costs.cc | 3 ++- .../gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c | 5 ++--- .../gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c | 5 ++--- .../gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index f4a1a789f23..e53f4a186f3 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -994,7 +994,8 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const vect_vf_for_cost (other_loop_vinfo)); /* Apply the unrolling heuristic described above m_unrolled_vls_niters. */ - if (bool (m_unrolled_vls_stmts) != bool (other->m_unrolled_vls_stmts)) + if (bool (m_unrolled_vls_stmts) != bool (other->m_unrolled_vls_stmts) + && m_cost_type != other->m_cost_type) { bool this_prefer_unrolled = this->prefer_unrolled_loop (); bool other_prefer_unrolled = other->prefer_unrolled_loop (); diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c index 3ddffa37fe4..89a6c678960 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c @@ -3,7 +3,7 @@ #include -#define N 40 +#define N 48 int a[N]; @@ -22,7 +22,6 @@ foo (){ return 0; } -/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */ /* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*16,\s*e32,\s*m4,\s*t[au],\s*m[au]} 1 } } */ -/* { dg-final { scan-assembler-times {vsetivli} 2 } } */ +/* { dg-final { scan-assembler-times {vsetivli} 1 } } */ /* { dg-final { scan-assembler-not {vsetvli} } } */ diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c index 7625ec5c4b1..86732ef2ce5 100644 --- 
a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c @@ -3,7 +3,7 @@ #include -#define N 40 +#define N 64 int a[N]; @@ -22,7 +22,6 @@ foo (){ return 0; } -/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */ /* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m8,\s*t[au],\s*m[au]} 1 } } */ -/* { dg-final { scan-assembler-times {vsetivli} 1 } } */ +/* { dg-final { scan-assembler-not {vsetivli} } } */ /* { dg-final { scan-assembler-times {vsetvli} 1 } } */ diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c index 7625ec5c4b1..505c4cd2c40 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -fn
[Committed] RISC-V: Robustify dynamic lmul test
While working on refining the cost model, I notice this test will generate unexpected scalar xor instructions if we don't tune cost model carefully. Add more assembler to avoid future regression. Committed. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c: Add assembler-not check. --- gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c | 1 + 1 file changed, 1 insertion(+) diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c index 87e963edc47..38cbefbe625 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c @@ -22,3 +22,4 @@ x264_pixel_8x8 (unsigned char *pix1, unsigned char *pix2, int i_stride_pix2) } /* { dg-final { scan-assembler {e32,m2} } } */ +/* { dg-final { scan-assembler-not {xor} } } */ -- 2.36.3
[Committed] RISC-V: Fix comments of segment load/store intrinsic
We have supported segment load/store intrinsics. Committed as it is obvious. gcc/ChangeLog: * config/riscv/riscv-vector-builtins-functions.def (vleff): Move comments. (vundefined): Ditto. --- gcc/config/riscv/riscv-vector-builtins-functions.def | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gcc/config/riscv/riscv-vector-builtins-functions.def b/gcc/config/riscv/riscv-vector-builtins-functions.def index 96dd0d95dec..f742c98be8a 100644 --- a/gcc/config/riscv/riscv-vector-builtins-functions.def +++ b/gcc/config/riscv/riscv-vector-builtins-functions.def @@ -79,8 +79,6 @@ DEF_RVV_FUNCTION (vsoxei64, indexed_loadstore, none_m_preds, all_v_scalar_ptr_ee // 7.7. Unit-stride Fault-Only-First Loads DEF_RVV_FUNCTION (vleff, fault_load, full_preds, all_v_scalar_const_ptr_size_ptr_ops) -// TODO: 7.8. Vector Load/Store Segment Instructions - /* 11. Vector Integer Arithmetic Instructions. */ // 11.1. Vector Single-Width Integer Add and Subtract @@ -630,6 +628,8 @@ DEF_RVV_FUNCTION (vset, vset, none_preds, all_v_vset_tuple_ops) DEF_RVV_FUNCTION (vget, vget, none_preds, all_v_vget_tuple_ops) DEF_RVV_FUNCTION (vcreate, vcreate, none_preds, all_v_vcreate_tuple_ops) DEF_RVV_FUNCTION (vundefined, vundefined, none_preds, all_none_void_tuple_ops) + +// 7.8. Vector Load/Store Segment Instructions DEF_RVV_FUNCTION (vlseg, seg_loadstore, full_preds, tuple_v_scalar_const_ptr_ops) DEF_RVV_FUNCTION (vsseg, seg_loadstore, none_m_preds, tuple_v_scalar_ptr_ops) DEF_RVV_FUNCTION (vlsseg, seg_loadstore, full_preds, tuple_v_scalar_const_ptr_ptrdiff_ops) -- 2.36.3
[Committed] RISC-V: Fix comments of segment load/store intrinsic[NFC]
We have supported segment load/store intrinsics. Committed as it is obvious. gcc/ChangeLog: * config/riscv/riscv-vector-builtins-functions.def (vleff): Move comments to real place. (vcreate): Ditto. --- gcc/config/riscv/riscv-vector-builtins-functions.def | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/gcc/config/riscv/riscv-vector-builtins-functions.def b/gcc/config/riscv/riscv-vector-builtins-functions.def index 96dd0d95dec..14560923d11 100644 --- a/gcc/config/riscv/riscv-vector-builtins-functions.def +++ b/gcc/config/riscv/riscv-vector-builtins-functions.def @@ -79,8 +79,6 @@ DEF_RVV_FUNCTION (vsoxei64, indexed_loadstore, none_m_preds, all_v_scalar_ptr_ee // 7.7. Unit-stride Fault-Only-First Loads DEF_RVV_FUNCTION (vleff, fault_load, full_preds, all_v_scalar_const_ptr_size_ptr_ops) -// TODO: 7.8. Vector Load/Store Segment Instructions - /* 11. Vector Integer Arithmetic Instructions. */ // 11.1. Vector Single-Width Integer Add and Subtract @@ -625,7 +623,7 @@ DEF_RVV_FUNCTION (vcreate, vcreate, none_preds, all_v_vcreate_lmul2_x2_ops) DEF_RVV_FUNCTION (vcreate, vcreate, none_preds, all_v_vcreate_lmul2_x4_ops) DEF_RVV_FUNCTION (vcreate, vcreate, none_preds, all_v_vcreate_lmul4_x2_ops) -// Tuple types +// 7.8. Vector Load/Store Segment Instructions DEF_RVV_FUNCTION (vset, vset, none_preds, all_v_vset_tuple_ops) DEF_RVV_FUNCTION (vget, vget, none_preds, all_v_vget_tuple_ops) DEF_RVV_FUNCTION (vcreate, vcreate, none_preds, all_v_vcreate_tuple_ops) -- 2.36.3
[PATCH] RISC-V: Fix loop invariant check
As Robin suggested, remove gimple_uid check which is sufficient for our need. Tested on both RV32/RV64 no regression, ok for trunk ? gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (loop_invariant_op_p): Fix loop invariant check. --- gcc/config/riscv/riscv-vector-costs.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 3bae581d6fd..f4a1a789f23 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -241,7 +241,7 @@ loop_invariant_op_p (class loop *loop, if (SSA_NAME_IS_DEFAULT_DEF (op) || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (op return true; - return gimple_uid (SSA_NAME_DEF_STMT (op)) & 1; + return false; } /* Return true if the variable should be counted into liveness. */ -- 2.36.3
[Committed] RISC-V: Use MAX instead of std::max [VSETVL PASS]
Obvious fix, Committed. gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc: replace std::max by MAX. --- gcc/config/riscv/riscv-vsetvl.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 7d748edc0ef..df7ed149388 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -1668,7 +1668,7 @@ private: } inline void use_max_sew (vsetvl_info &prev, const vsetvl_info &next) { -auto max_sew = std::max (prev.get_sew (), next.get_sew ()); +int max_sew = MAX (prev.get_sew (), next.get_sew ()); prev.set_sew (max_sew); use_min_of_max_sew (prev, next); } @@ -1702,7 +1702,7 @@ private: inline void use_max_sew_and_lmul_with_prev_ratio (vsetvl_info &prev, const vsetvl_info &next) { -auto max_sew = std::max (prev.get_sew (), next.get_sew ()); +int max_sew = MAX (prev.get_sew (), next.get_sew ()); prev.set_vlmul (calculate_vlmul (max_sew, prev.get_ratio ())); prev.set_sew (max_sew); } -- 2.36.3
[Committed] RISC-V: Update MAX_SEW for available vsetvl info [VSETVL PASS]
This patch fixes a bug of VSETVL PASS in this following situation: Ignore curr info since prev info available with it: prev_info: VALID (insn 8, bb 2) Demand fields: demand_ratio_and_ge_sew demand_avl SEW=16, VLMUL=mf4, RATIO=64, MAX_SEW=64 TAIL_POLICY=agnostic, MASK_POLICY=agnostic AVL=(const_int 1 [0x1]) VL=(nil) curr_info: VALID (insn 12, bb 2) Demand fields: demand_ge_sew demand_non_zero_avl SEW=16, VLMUL=m1, RATIO=16, MAX_SEW=32 TAIL_POLICY=agnostic, MASK_POLICY=agnostic AVL=(const_int 1 [0x1]) VL=(nil) We should update prev_info MAX_SEW from 64 into 32. Before this patch: foo: vsetivlizero,1,e64,m1,ta,ma vle64.v v1,0(a1) vmv.s.x v3,a0 vfmv.s.fv2,fa0 vadd.vv v1,v1,v1 ret After this patch: foo: vsetivlizero,1,e16,mf4,ta,ma vle64.v v1,0(a1) vmv.s.x v3,a0 vfmv.s.fv2,fa0 vsetvli zero,zero,e64,m1,ta,ma vadd.vv v1,v1,v1 ret Tested on both RV32 and RV64 no regression. Committed. PR target/113248 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pre_vsetvl::fuse_local_vsetvl_info): gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/pr113248.c: New test. --- gcc/config/riscv/riscv-vsetvl.cc| 17 + .../gcc.target/riscv/rvv/vsetvl/pr113248.c | 15 +++ 2 files changed, 32 insertions(+) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113248.c diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 3a2ea9ad44a..7d748edc0ef 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -2876,6 +2876,23 @@ pre_vsetvl::fuse_local_vsetvl_info () curr_info.dump (dump_file, ""); fprintf (dump_file, "\n"); } + /* Even though prev_info is available with curr_info, +we need to update the MAX_SEW of prev_info since +we don't check MAX_SEW in available_p check. 
+ +prev_info: +Demand fields: demand_ratio_and_ge_sew demand_avl +SEW=16, VLMUL=mf4, RATIO=64, MAX_SEW=64 + +curr_info: +Demand fields: demand_ge_sew demand_non_zero_avl +SEW=16, VLMUL=m1, RATIO=16, MAX_SEW=32 + +In the example above, prev_info is available with +curr_info, we need to update prev_info MAX_SEW from +64 into 32. */ + prev_info.set_max_sew ( + MIN (prev_info.get_max_sew (), curr_info.get_max_sew ())); if (!curr_info.vl_used_by_non_rvv_insn_p () && vsetvl_insn_p (curr_info.get_insn ()->rtl ())) m_delete_list.safe_push (curr_info); diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113248.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113248.c new file mode 100644 index 000..b3b506177df --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113248.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-mtune=generic-ooo --param=riscv-autovec-preference=scalable -march=rv32gc_zve64f_zvfh -mabi=ilp32d -O3" } */ + +#include "riscv_vector.h" + +void foo(_Float16 y, int64_t *i64p) +{ + vint64m1_t vx =__riscv_vle64_v_i64m1 (i64p, 1); + vx = __riscv_vadd_vv_i64m1 (vx, vx, 1); + vfloat16m1_t vy =__riscv_vfmv_s_f_f16m1 (y, 1); + asm volatile ("# use %0 %1" : : "vr"(vx), "vr" (vy)); +} + +/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*1,\s*e16,\s*mf4,\s*t[au],\s*m[au]} 1 } } */ +/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*zero,\s*e64,\s*m1,\s*t[au],\s*m[au]} 1 } } */ -- 2.36.3
[Committed V2] RISC-V: Teach liveness computation loop invariant shift amount
1). We not only have vashl_optab,vashr_optab,vlshr_optab which vectorize shift with vector shift amount, that is, vectorization of 'a[i] >> x[i]', the shift amount is loop variant. 2). But also, we have ashl_optab, ashr_optab, lshr_optab which can vectorize shift with scalar shift amount, that is, vectorization of 'a[i] >> x', the shift amount is loop invariant. For the 2) case, we don't need to allocate a vector register group for shift amount. So consider this following case: void f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int x, int n) { for (int i = 0; i < n; i++) { int tmp = b[i] >> x; int tmp2 = tmp * b[i]; c[i] = tmp2 * b[i]; d[i] = tmp * tmp2 * b[i] >> x; } } Before this patch, we choose LMUL = 4, now after this patch, we can choose LMUL = 8: f: ble a5,zero,.L5 .L3: vsetvli a0,a5,e32,m8,ta,ma sllia6,a0,2 vle32.v v16,0(a1) vsra.vx v24,v16,a4 vmul.vv v8,v24,v16 vmul.vv v0,v8,v16 vse32.v v0,0(a2) vmul.vv v8,v8,v24 vmul.vv v8,v8,v16 vsra.vx v8,v8,a4 vse32.v v8,0(a3) add a1,a1,a6 add a2,a2,a6 add a3,a3,a6 sub a5,a5,a0 bne a5,zero,.L3 .L5: ret Tested on both RV32/RV64 no regression. Ok for trunk ? Note that we will apply same heuristic for vadd.vx, ... etc when the late-combine pass from Richard Sandiford is committed (Since we need late combine pass to do vv->vx transformation for vadd). gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (loop_invariant_op_p): New function. (variable_vectorized_p): Teach loop invariant. (has_unexpected_spills_p): Ditto. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-12.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-14.c: New test. 
--- gcc/config/riscv/riscv-vector-costs.cc| 31 +++-- .../costmodel/riscv/rvv/dynamic-lmul4-12.c| 40 .../costmodel/riscv/rvv/dynamic-lmul8-14.c| 64 +++ 3 files changed, 131 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-12.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-14.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index ec8156fbaf8..3bae581d6fd 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -230,9 +230,24 @@ get_biggest_mode (machine_mode mode1, machine_mode mode2) return mode1_size >= mode2_size ? mode1 : mode2; } +/* Return true if OP is invariant. */ + +static bool +loop_invariant_op_p (class loop *loop, +tree op) +{ + if (is_gimple_constant (op)) +return true; + if (SSA_NAME_IS_DEFAULT_DEF (op) + || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (op +return true; + return gimple_uid (SSA_NAME_DEF_STMT (op)) & 1; +} + /* Return true if the variable should be counted into liveness. */ static bool -variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p) +variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info, tree var, + bool lhs_p) { if (!var) return false; @@ -275,6 +290,10 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p) || !tree_fits_shwi_p (var) || !IN_RANGE (tree_to_shwi (var), -16, 15) || gimple_assign_rhs1 (stmt) != var; + case LSHIFT_EXPR: + case RSHIFT_EXPR: + return gimple_assign_rhs2 (stmt) != var +|| !loop_invariant_op_p (loop, var); default: break; } @@ -312,10 +331,12 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p) The live range of SSA 2 is [0, 4] in bb 3. 
*/ static machine_mode compute_local_live_ranges ( + loop_vec_info loop_vinfo, const hash_map> &program_points_per_bb, hash_map> &live_ranges_per_bb) { machine_mode biggest_mode = QImode; + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); if (!program_points_per_bb.is_empty ()) { auto_vec visited_vars; @@ -339,7 +360,8 @@ compute_local_live_ranges ( unsigned int point = program_point.point; gimple *stmt = program_point.stmt; tree lhs = gimple_get_lhs (stmt); - if (variable_vectorized_p (program_point.stmt_info, lhs, true)) + if (variable_vectorized_p (loop, program_point.stmt_info, lhs, +true)) { biggest_mode = get_biggest_mode (biggest_mode, TYPE_MODE (TREE_TYPE (lhs))); @@ -356,7 +378,7 @@ compute_local_live_ranges ( for (i = 0; i < gimple_num_args (stmt); i++) {
[Committed V2] RISC-V: Allow simplification of non-vlmax with len = NUNITS reg to reg move
V2: Address comments from Robin. While working on fixing a bug, I notice this following code has redundant move: #include "riscv_vector.h" void f (float x, float y, void *out) { float f[4] = { x, x, x, y }; vfloat32m1_t v = __riscv_vle32_v_f32m1 (f, 4); __riscv_vse32_v_f32m1 (out, v, 4); } Before this patch: f: vsetivlizero,4,e32,m1,ta,ma addisp,sp,-16 vfmv.v.fv1,fa0 vfslide1down.vf v1,v1,fa1 vmv.v.v v1,v1 > redundant move. vse32.v v1,0(a0) addisp,sp,16 jr ra The rootcause is that the complicate vmv.v.v pattern doesn't simplify it into simple (set (reg) (reg)) reg-to-reg move pattern. Currently, we support such simplification for VLMAX. However, the case I found is non-VLMAX but with LEN = NUNITS which should be considered as equivalent to VLMAX. Add a simple fix for such situation. Tested on both RV32/RV64 no regressions. gcc/ChangeLog: * config/riscv/riscv-protos.h (whole_reg_to_reg_move_p): New function. * config/riscv/riscv-v.cc (whole_reg_to_reg_move_p): Ditto. * config/riscv/vector.md: Allow non-vlmax with len = NUNITS simplification. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/base/vf_avl-4.c: New test. 
--- gcc/config/riscv/riscv-protos.h | 1 + gcc/config/riscv/riscv-v.cc | 23 +++ gcc/config/riscv/vector.md| 9 ++-- .../gcc.target/riscv/rvv/base/vf_avl-4.c | 13 +++ 4 files changed, 39 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-4.c diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 0f0337cfb38..00a5b645abe 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -687,6 +687,7 @@ bool imm_avl_p (machine_mode); bool can_be_broadcasted_p (rtx); bool gather_scatter_valid_offset_p (machine_mode); HOST_WIDE_INT estimated_poly_value (poly_int64, unsigned int); +bool whole_reg_to_reg_move_p (rtx *, machine_mode, int); } /* We classify builtin types into two classes: diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index ec859645415..2491522191a 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -5117,4 +5117,27 @@ estimated_poly_value (poly_int64 val, unsigned int kind) return val.coeffs[0] + val.coeffs[1] * over_min_vlen / TARGET_MIN_VLEN; } +/* Return true it is whole register-register move. */ +bool +whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, int avl_type_index) +{ + /* An operation is a whole-register move if either + (1) Its vlmax operand equals VLMAX + (2) Its vl operand equals the number of units of its mode. */ + if (register_operand (ops[0], mode) + && register_operand (ops[3], mode) + && satisfies_constraint_vu (ops[2]) + && satisfies_constraint_Wc1 (ops[1])) +{ + if (INTVAL (ops[avl_type_index]) == VLMAX) + return true; + /* AVL propagation PASS will transform FIXED-VLMAX with NUNITS < 32 +into NON-VLMAX with LEN = NUNITS. 
*/ + else if (CONST_INT_P (ops[4]) + && known_eq (INTVAL (ops[4]), GET_MODE_NUNITS (mode))) + return true; +} + return false; +} + } // namespace riscv_vector diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md index 3d2c1c3ce8f..be5beb5ab64 100644 --- a/gcc/config/riscv/vector.md +++ b/gcc/config/riscv/vector.md @@ -1724,10 +1724,7 @@ vse.v\t%3,%0%p1 vmv.v.v\t%0,%3 vmv.v.v\t%0,%3" - "&& register_operand (operands[0], mode) - && register_operand (operands[3], mode) - && satisfies_constraint_vu (operands[2]) - && INTVAL (operands[7]) == riscv_vector::VLMAX" + "&& riscv_vector::whole_reg_to_reg_move_p (operands, mode, 7)" [(set (match_dup 0) (match_dup 3))] "" [(set_attr "type" "vlde,vlde,vlde,vste,vimov,vimov") @@ -1776,9 +1773,7 @@ vmmv.m\t%0,%3 vmclr.m\t%0 vmset.m\t%0" - "&& register_operand (operands[0], mode) - && register_operand (operands[3], mode) - && INTVAL (operands[5]) == riscv_vector::VLMAX" + "&& riscv_vector::whole_reg_to_reg_move_p (operands, mode, 5)" [(set (match_dup 0) (match_dup 3))] "" [(set_attr "type" "vldm,vstm,vmalu,vmalu,vmalu") diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-4.c b/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-4.c new file mode 100644 index 000..1b4bfd96481 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-4.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=fixed-vlmax" } */ + +#include "riscv_vector.h" +void +f (float x, float y, void *out) +{ + float f[4] = { x, x, x, y }; + vfloat32m1_t v = __riscv_vle32_v_f32m1 (f, 4); + __riscv_vse32_v_f32m1 (out, v, 4); +} + +/* { dg-final { scan-assembler-not {vmv} } } */ --
[PATCH] RISC-V: Allow simplification of non-vlmax with len = NUNITS reg to reg move
While working on fixing a bug, I notice this following code has redundant move: #include "riscv_vector.h" void f (float x, float y, void *out) { float f[4] = { x, x, x, y }; vfloat32m1_t v = __riscv_vle32_v_f32m1 (f, 4); __riscv_vse32_v_f32m1 (out, v, 4); } Before this patch: f: vsetivlizero,4,e32,m1,ta,ma addisp,sp,-16 vfmv.v.fv1,fa0 vfslide1down.vf v1,v1,fa1 vmv.v.v v1,v1 > redundant move. vse32.v v1,0(a0) addisp,sp,16 jr ra The rootcause is that the complicate vmv.v.v pattern doesn't simplify it into simple (set (reg) (reg)) reg-to-reg move pattern. Currently, we support such simplification for VLMAX. However, the case I found is non-VLMAX but with LEN = NUNITS which should be considered as equivalent to VLMAX. Add a simple fix for such situation. Tested on both RV32/RV64 no regressions. Ok for trunk ? gcc/ChangeLog: * config/riscv/riscv-protos.h (whole_reg_to_reg_move_p): New function. * config/riscv/riscv-v.cc (whole_reg_to_reg_move_p): Ditto. * config/riscv/vector.md: Allow non-vlmax with len = NUNITS simplification. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/base/vf_avl-4.c: New test. 
--- gcc/config/riscv/riscv-protos.h | 1 + gcc/config/riscv/riscv-v.cc | 21 +++ gcc/config/riscv/vector.md| 9 ++-- .../gcc.target/riscv/rvv/base/vf_avl-4.c | 13 4 files changed, 37 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-4.c diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 0f0337cfb38..064e8f443f3 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -687,6 +687,7 @@ bool imm_avl_p (machine_mode); bool can_be_broadcasted_p (rtx); bool gather_scatter_valid_offset_p (machine_mode); HOST_WIDE_INT estimated_poly_value (poly_int64, unsigned int); +bool whole_reg_to_reg_move_p (rtx *, machine_mode); } /* We classify builtin types into two classes: diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index b7727b2b3e6..e5ba28d9078 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -5122,4 +5122,25 @@ estimated_poly_value (poly_int64 val, unsigned int kind) return val.coeffs[0] + val.coeffs[1] * over_min_vlen / TARGET_MIN_VLEN; } +/* Return true it is whole register-register move. */ +bool +whole_reg_to_reg_move_p (rtx *ops, machine_mode mode) +{ + if (register_operand (ops[0], mode) + && register_operand (ops[3], mode) + && satisfies_constraint_vu (ops[2]) + && satisfies_constraint_Wc1 (ops[1])) +{ + int vlmax_index = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL ? 5 : 7; + if (INTVAL (ops[vlmax_index]) == VLMAX) + return true; + /* AVL propagation PASS will transform FIXED-VLMAX with NUNITS < 32 +into NON-VLMAX with LEN = NUNITS. 
*/ + else if (CONST_INT_P (ops[4]) + && known_eq (INTVAL (ops[4]), GET_MODE_NUNITS (mode))) + return true; +} + return false; +} + } // namespace riscv_vector diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md index 3d2c1c3ce8f..abd293f310c 100644 --- a/gcc/config/riscv/vector.md +++ b/gcc/config/riscv/vector.md @@ -1724,10 +1724,7 @@ vse.v\t%3,%0%p1 vmv.v.v\t%0,%3 vmv.v.v\t%0,%3" - "&& register_operand (operands[0], mode) - && register_operand (operands[3], mode) - && satisfies_constraint_vu (operands[2]) - && INTVAL (operands[7]) == riscv_vector::VLMAX" + "&& riscv_vector::whole_reg_to_reg_move_p (operands, mode)" [(set (match_dup 0) (match_dup 3))] "" [(set_attr "type" "vlde,vlde,vlde,vste,vimov,vimov") @@ -1776,9 +1773,7 @@ vmmv.m\t%0,%3 vmclr.m\t%0 vmset.m\t%0" - "&& register_operand (operands[0], mode) - && register_operand (operands[3], mode) - && INTVAL (operands[5]) == riscv_vector::VLMAX" + "&& riscv_vector::whole_reg_to_reg_move_p (operands, mode)" [(set (match_dup 0) (match_dup 3))] "" [(set_attr "type" "vldm,vstm,vmalu,vmalu,vmalu") diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-4.c b/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-4.c new file mode 100644 index 000..1b4bfd96481 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-4.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=fixed-vlmax" } */ + +#include "riscv_vector.h" +void +f (float x, float y, void *out) +{ + float f[4] = { x, x, x, y }; + vfloat32m1_t v = __riscv_vle32_v_f32m1 (f, 4); + __riscv_vse32_v_f32m1 (out, v, 4); +} + +/* { dg-final { scan-assembler-not {vmv} } } */ -- 2.36.3
[PATCH] RISC-V: Teach liveness computation about loop-invariant shift amounts [Dynamic LMUL]
1). We not only have vashl_optab,vashr_optab,vlshr_optab which vectorize shift with vector shift amount, that is, vectorization of 'a[i] >> x[i]', the shift amount is loop variant. 2). But also, we have ashl_optab, ashr_optab, lshr_optab which can vectorize shift with scalar shift amount, that is, vectorization of 'a[i] >> x', the shift amount is loop invariant. For the 2) case, we don't need to allocate a vector register group for shift amount. So consider this following case: void f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int x, int n) { for (int i = 0; i < n; i++) { int tmp = b[i] >> x; int tmp2 = tmp * b[i]; c[i] = tmp2 * b[i]; d[i] = tmp * tmp2 * b[i] >> x; } } Before this patch, we choose LMUL = 4, now after this patch, we can choose LMUL = 8: f: ble a5,zero,.L5 .L3: vsetvli a0,a5,e32,m8,ta,ma sllia6,a0,2 vle32.v v16,0(a1) vsra.vx v24,v16,a4 vmul.vv v8,v24,v16 vmul.vv v0,v8,v16 vse32.v v0,0(a2) vmul.vv v8,v8,v24 vmul.vv v8,v8,v16 vsra.vx v8,v8,a4 vse32.v v8,0(a3) add a1,a1,a6 add a2,a2,a6 add a3,a3,a6 sub a5,a5,a0 bne a5,zero,.L3 .L5: ret Tested on both RV32/RV64 no regression. Ok for trunk ? Note that we will apply same heuristic for vadd.vx, ... etc when the late-combine pass from Richard Sandiford is committed (Since we need late combine pass to do vv->vx transformation for vadd). gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (loop_invariant_op_p): New function. (variable_vectorized_p): Teach loop invariant. (has_unexpected_spills_p): Ditto. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-12.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-14.c: New test. 
--- gcc/config/riscv/riscv-vector-costs.cc| 31 +++-- .../costmodel/riscv/rvv/dynamic-lmul4-12.c| 40 .../costmodel/riscv/rvv/dynamic-lmul8-14.c| 64 +++ 3 files changed, 131 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-12.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-14.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index ec8156fbaf8..00b0b4d64b9 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -230,9 +230,24 @@ get_biggest_mode (machine_mode mode1, machine_mode mode2) return mode1_size >= mode2_size ? mode1 : mode2; } +/* Return true if OP is invariant. */ + +static bool +loop_invariant_op_p (class loop *loop, +tree op) +{ + if (is_gimple_min_invariant (op)) +return true; + if (SSA_NAME_IS_DEFAULT_DEF (op) + || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (op +return true; + return gimple_uid (SSA_NAME_DEF_STMT (op)) & 1; +} + /* Return true if the variable should be counted into liveness. */ static bool -variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p) +variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info, tree var, + bool lhs_p) { if (!var) return false; @@ -275,6 +290,10 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p) || !tree_fits_shwi_p (var) || !IN_RANGE (tree_to_shwi (var), -16, 15) || gimple_assign_rhs1 (stmt) != var; + case LSHIFT_EXPR: + case RSHIFT_EXPR: + return gimple_assign_rhs2 (stmt) != var +|| !loop_invariant_op_p (loop, var); default: break; } @@ -312,10 +331,12 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p) The live range of SSA 2 is [0, 4] in bb 3. 
*/ static machine_mode compute_local_live_ranges ( + loop_vec_info loop_vinfo, const hash_map> &program_points_per_bb, hash_map> &live_ranges_per_bb) { machine_mode biggest_mode = QImode; + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); if (!program_points_per_bb.is_empty ()) { auto_vec visited_vars; @@ -339,7 +360,8 @@ compute_local_live_ranges ( unsigned int point = program_point.point; gimple *stmt = program_point.stmt; tree lhs = gimple_get_lhs (stmt); - if (variable_vectorized_p (program_point.stmt_info, lhs, true)) + if (variable_vectorized_p (loop, program_point.stmt_info, lhs, +true)) { biggest_mode = get_biggest_mode (biggest_mode, TYPE_MODE (TREE_TYPE (lhs))); @@ -356,7 +378,7 @@ compute_local_live_ranges ( for (i = 0; i < gimple_num_args (stmt); i++)
[Committed V3] RISC-V: Make liveness estimation aware of the .vi variant
Consider this following case: void f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) { for (int i = 0; i < n; i++) { int tmp = b[i] + 15; int tmp2 = tmp + b[i]; c[i] = tmp2 + b[i]; d[i] = tmp + tmp2 + b[i]; } } Current dynamic LMUL cost model choose LMUL = 4 because we count the "15" as consuming 1 vector register group which is not accurate. We teach the dynamic LMUL cost model be aware of the potential vi variant instructions transformation, so that we can choose LMUL = 8 according to more accurate cost model. After this patch: f: ble a4,zero,.L5 .L3: vsetvli a5,a4,e32,m8,ta,ma sllia0,a5,2 vle32.v v16,0(a1) vadd.vi v24,v16,15 vadd.vv v8,v24,v16 vadd.vv v0,v8,v16 vse32.v v0,0(a2) vadd.vv v8,v8,v24 vadd.vv v8,v8,v16 vse32.v v8,0(a3) add a1,a1,a0 add a2,a2,a0 add a3,a3,a0 sub a4,a4,a5 bne a4,zero,.L3 .L5: ret Tested on both RV32 and RV64 no regression. gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (variable_vectorized_p): Teach vi variant. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c: New test. --- gcc/config/riscv/riscv-vector-costs.cc| 32 ++-- .../costmodel/riscv/rvv/dynamic-lmul8-13.c| 74 +++ 2 files changed, 99 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 21f8a81c89c..ec8156fbaf8 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -255,6 +255,31 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p) return false; } } + else if (is_gimple_assign (stmt)) +{ + tree_code tcode = gimple_assign_rhs_code (stmt); + /* vi variant doesn't need to allocate such statement. +E.g. tmp_15 = _4 + 1; will be transformed into vadd.vi +so the INTEGER_CST '1' doesn't need a vector register. 
*/ + switch (tcode) + { + case PLUS_EXPR: + case BIT_IOR_EXPR: + case BIT_XOR_EXPR: + case BIT_AND_EXPR: + return TREE_CODE (var) != INTEGER_CST +|| !tree_fits_shwi_p (var) +|| !IN_RANGE (tree_to_shwi (var), -16, 15); + case MINUS_EXPR: + return TREE_CODE (var) != INTEGER_CST +|| !tree_fits_shwi_p (var) +|| !IN_RANGE (tree_to_shwi (var), -16, 15) +|| gimple_assign_rhs1 (stmt) != var; + default: + break; + } +} + if (lhs_p) return is_gimple_reg (var) && (!POINTER_TYPE_P (TREE_TYPE (var)) @@ -331,13 +356,6 @@ compute_local_live_ranges ( for (i = 0; i < gimple_num_args (stmt); i++) { tree var = gimple_arg (stmt, i); - /* Both IMM and REG are included since a VECTOR_CST may be -potentially held in a vector register. However, it's not -accurate, since a PLUS_EXPR can be vectorized into vadd.vi -if IMM is -16 ~ 15. - -TODO: We may elide the cases that the unnecessary IMM in -the future. */ if (variable_vectorized_p (program_point.stmt_info, var, false)) { diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c new file mode 100644 index 000..baef4e39014 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c @@ -0,0 +1,74 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param riscv-autovec-lmul=dynamic -fdump-tree-vect-details" } */ + +void +f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = b[i] + 15; + int tmp2 = tmp + b[i]; + c[i] = tmp2 + b[i]; + d[i] = tmp + tmp2 + b[i]; +} +} + +void +f2 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = 15 - b[i]; + int tmp2 = tmp * b[i]; + c[i] = tmp2 * b[i]; + d[i] = tmp * tmp2 * b[i]; +} +} + +void +f3 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int 
tmp = b[i] & 15; + int tmp2 = tmp * b[i]; + c[i] = tmp2 * b[i]; + d[i] = tmp * tmp2 * b[i]; +} +} + +void +f4 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = b[i]
[Committed V2] RISC-V: Make liveness estimation aware of the .vi variant
Consider this following case: void f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) { for (int i = 0; i < n; i++) { int tmp = b[i] + 15; int tmp2 = tmp + b[i]; c[i] = tmp2 + b[i]; d[i] = tmp + tmp2 + b[i]; } } Current dynamic LMUL cost model choose LMUL = 4 because we count the "15" as consuming 1 vector register group which is not accurate. We teach the dynamic LMUL cost model be aware of the potential vi variant instructions transformation, so that we can choose LMUL = 8 according to more accurate cost model. After this patch: f: ble a4,zero,.L5 .L3: vsetvli a5,a4,e32,m8,ta,ma sllia0,a5,2 vle32.v v16,0(a1) vadd.vi v24,v16,15 vadd.vv v8,v24,v16 vadd.vv v0,v8,v16 vse32.v v0,0(a2) vadd.vv v8,v8,v24 vadd.vv v8,v8,v16 vse32.v v8,0(a3) add a1,a1,a0 add a2,a2,a0 add a3,a3,a0 sub a4,a4,a5 bne a4,zero,.L3 .L5: ret Tested on both RV32 and RV64 no regression. Ok for trunk ? gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (variable_vectorized_p): Teach vi variant. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c: New test. --- gcc/config/riscv/riscv-vector-costs.cc| 30 ++-- .../costmodel/riscv/rvv/dynamic-lmul8-13.c| 74 +++ 2 files changed, 97 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 21f8a81c89c..e4435032035 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -255,6 +255,29 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p) return false; } } + else if (is_gimple_assign (stmt)) +{ + tree_code tcode = gimple_assign_rhs_code (stmt); + /* vi variant doesn't need to allocate such statement. +E.g. tmp_15 = _4 + 1; will be transformed into vadd.vi +so the INTEGER_CST '1' doesn't need a vector register. 
*/ + switch (tcode) + { + case PLUS_EXPR: + case BIT_IOR_EXPR: + case BIT_XOR_EXPR: + case BIT_AND_EXPR: + return TREE_CODE (var) != INTEGER_CST +|| !IN_RANGE (tree_to_shwi (var), -16, 15); + case MINUS_EXPR: + return TREE_CODE (var) != INTEGER_CST +|| !IN_RANGE (tree_to_shwi (var), -16, 15) +|| gimple_assign_rhs1 (stmt) != var; + default: + break; + } +} + if (lhs_p) return is_gimple_reg (var) && (!POINTER_TYPE_P (TREE_TYPE (var)) @@ -331,13 +354,6 @@ compute_local_live_ranges ( for (i = 0; i < gimple_num_args (stmt); i++) { tree var = gimple_arg (stmt, i); - /* Both IMM and REG are included since a VECTOR_CST may be -potentially held in a vector register. However, it's not -accurate, since a PLUS_EXPR can be vectorized into vadd.vi -if IMM is -16 ~ 15. - -TODO: We may elide the cases that the unnecessary IMM in -the future. */ if (variable_vectorized_p (program_point.stmt_info, var, false)) { diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c new file mode 100644 index 000..baef4e39014 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c @@ -0,0 +1,74 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param riscv-autovec-lmul=dynamic -fdump-tree-vect-details" } */ + +void +f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = b[i] + 15; + int tmp2 = tmp + b[i]; + c[i] = tmp2 + b[i]; + d[i] = tmp + tmp2 + b[i]; +} +} + +void +f2 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = 15 - b[i]; + int tmp2 = tmp * b[i]; + c[i] = tmp2 * b[i]; + d[i] = tmp * tmp2 * b[i]; +} +} + +void +f3 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = b[i] & 15; + int tmp2 = tmp * b[i]; + c[i] = tmp2 * 
b[i]; + d[i] = tmp * tmp2 * b[i]; +} +} + +void +f4 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = b[i] | 15; + int tmp2 = tmp * b[i]; + c[i] = tmp2 * b[i]; + d[i
[PATCH] RISC-V: Teach liveness estimation to be aware of the .vi variant
Consider this following case: void f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) { for (int i = 0; i < n; i++) { int tmp = b[i] + 15; int tmp2 = tmp + b[i]; c[i] = tmp2 + b[i]; d[i] = tmp + tmp2 + b[i]; } } Current dynamic LMUL cost model choose LMUL = 4 because we count the "15" as consuming 1 vector register group which is not accurate. We teach the dynamic LMUL cost model be aware of the potential vi variant instructions transformation, so that we can choose LMUL = 8 according to more accurate cost model. After this patch: f: ble a4,zero,.L5 .L3: vsetvli a5,a4,e32,m8,ta,ma sllia0,a5,2 vle32.v v16,0(a1) vadd.vi v24,v16,15 vadd.vv v8,v24,v16 vadd.vv v0,v8,v16 vse32.v v0,0(a2) vadd.vv v8,v8,v24 vadd.vv v8,v8,v16 vse32.v v8,0(a3) add a1,a1,a0 add a2,a2,a0 add a3,a3,a0 sub a4,a4,a5 bne a4,zero,.L3 .L5: ret Tested on both RV32 and RV64 no regression. Ok for trunk ? gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (variable_vectorized_p): Teach vi variant. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c: New test. --- gcc/config/riscv/riscv-vector-costs.cc| 30 ++-- .../costmodel/riscv/rvv/dynamic-lmul8-13.c| 74 +++ 2 files changed, 97 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 21f8a81c89c..7f083b04edd 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -255,6 +255,29 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p) return false; } } + else if (is_gimple_assign (stmt)) +{ + tree_code tcode = gimple_assign_rhs_code (stmt); + /* vi variant doesn't need to allocate such statement. +E.g. tmp_15 = _4 + 1; will be transformed into vadd.vi +so the INTEGER_CST '1' doesn't need vector a register. 
*/ + switch (tcode) + { + case PLUS_EXPR: + case BIT_IOR_EXPR: + case BIT_XOR_EXPR: + case BIT_AND_EXPR: + return TREE_CODE (var) != INTEGER_CST +|| !IN_RANGE (tree_to_shwi (var), -16, 15); + case MINUS_EXPR: + return TREE_CODE (var) != INTEGER_CST +|| !IN_RANGE (tree_to_shwi (var), -16, 15) +|| gimple_assign_rhs1 (stmt) != var; + default: + break; + } +} + if (lhs_p) return is_gimple_reg (var) && (!POINTER_TYPE_P (TREE_TYPE (var)) @@ -331,13 +354,6 @@ compute_local_live_ranges ( for (i = 0; i < gimple_num_args (stmt); i++) { tree var = gimple_arg (stmt, i); - /* Both IMM and REG are included since a VECTOR_CST may be -potentially held in a vector register. However, it's not -accurate, since a PLUS_EXPR can be vectorized into vadd.vi -if IMM is -16 ~ 15. - -TODO: We may elide the cases that the unnecessary IMM in -the future. */ if (variable_vectorized_p (program_point.stmt_info, var, false)) { diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c new file mode 100644 index 000..baef4e39014 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c @@ -0,0 +1,74 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param riscv-autovec-lmul=dynamic -fdump-tree-vect-details" } */ + +void +f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = b[i] + 15; + int tmp2 = tmp + b[i]; + c[i] = tmp2 + b[i]; + d[i] = tmp + tmp2 + b[i]; +} +} + +void +f2 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = 15 - b[i]; + int tmp2 = tmp * b[i]; + c[i] = tmp2 * b[i]; + d[i] = tmp * tmp2 * b[i]; +} +} + +void +f3 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = b[i] & 15; + int tmp2 = tmp * b[i]; + c[i] = tmp2 * 
b[i]; + d[i] = tmp * tmp2 * b[i]; +} +} + +void +f4 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = b[i] | 15; + int tmp2 = tmp * b[i]; + c[i] = tmp2 * b[i]; + d[i
[Committed] RISC-V: Refine LMUL computation for MASK_LEN_LOAD/MASK_LEN_STORE IFN
Notice a case has "Maximum lmul = 16" which is incorrect. Correct LMUL estimation for MASK_LEN_LOAD/MASK_LEN_STORE. Committed. gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (variable_vectorized_p): New function. (compute_nregs_for_mode): Refine LMUL. (max_number_of_live_regs): Ditto. (compute_estimated_lmul): Ditto. (has_unexpected_spills_p): Ditto. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-11.c: New test. --- gcc/config/riscv/riscv-vector-costs.cc| 66 +++ .../costmodel/riscv/rvv/dynamic-lmul4-11.c| 16 + 2 files changed, 68 insertions(+), 14 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-11.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index b9fdfdc5e3a..21f8a81c89c 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -230,6 +230,42 @@ get_biggest_mode (machine_mode mode1, machine_mode mode2) return mode1_size >= mode2_size ? mode1 : mode2; } +/* Return true if the variable should be counted into liveness. */ +static bool +variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p) +{ + if (!var) +return false; + gimple *stmt = STMT_VINFO_STMT (stmt_info); + enum stmt_vec_info_type type += STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info)); + if (is_gimple_call (stmt) && gimple_call_internal_p (stmt)) +{ + if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE + || gimple_call_internal_fn (stmt) == IFN_MASK_LOAD) + { + /* .MASK_LOAD (_5, 32B, _33) + ^^^ +Only the 3rd argument will be vectorized and consume +a vector register. 
*/ + if (TREE_CODE (TREE_TYPE (var)) == BOOLEAN_TYPE + || (is_gimple_reg (var) && !POINTER_TYPE_P (TREE_TYPE (var + return true; + else + return false; + } +} + if (lhs_p) +return is_gimple_reg (var) + && (!POINTER_TYPE_P (TREE_TYPE (var)) + || type != store_vec_info_type); + else +return poly_int_tree_p (var) + || (is_gimple_val (var) + && (!POINTER_TYPE_P (TREE_TYPE (var)) + || type != load_vec_info_type)); +} + /* Compute local live ranges of each vectorized variable. Note that we only compute local live ranges (within a block) since local live ranges information is accurate enough for us to determine @@ -277,13 +313,8 @@ compute_local_live_ranges ( { unsigned int point = program_point.point; gimple *stmt = program_point.stmt; - stmt_vec_info stmt_info = program_point.stmt_info; tree lhs = gimple_get_lhs (stmt); - enum stmt_vec_info_type type - = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info)); - if (lhs != NULL_TREE && is_gimple_reg (lhs) - && (!POINTER_TYPE_P (TREE_TYPE (lhs)) - || type != store_vec_info_type)) + if (variable_vectorized_p (program_point.stmt_info, lhs, true)) { biggest_mode = get_biggest_mode (biggest_mode, TYPE_MODE (TREE_TYPE (lhs))); @@ -307,10 +338,8 @@ compute_local_live_ranges ( TODO: We may elide the cases that the unnecessary IMM in the future. */ - if (poly_int_tree_p (var) - || (is_gimple_val (var) - && (!POINTER_TYPE_P (TREE_TYPE (var)) - || type != load_vec_info_type))) + if (variable_vectorized_p (program_point.stmt_info, var, +false)) { biggest_mode = get_biggest_mode (biggest_mode, @@ -383,7 +412,9 @@ compute_nregs_for_mode (loop_vec_info loop_vinfo, machine_mode mode, unsigned int biggest_size = GET_MODE_SIZE (biggest_mode).to_constant (); gcc_assert (biggest_size >= mode_size); unsigned int ratio = biggest_size / mode_size; - return MAX (lmul / ratio, 1) * rgroup_size; + /* RVV mask bool modes always consume 1 vector register regardless LMUL. */ + unsigned int nregs = mode == BImode ? 
1 : lmul / ratio; + return MAX (nregs, 1) * rgroup_size; } /* This function helps to determine whether current LMUL will cause @@ -414,7 +445,9 @@ max_number_of_live_regs (loop_vec_info loop_vinfo, const basic_block bb, pair live_range = (*iter).second; for (i = live_range.first + 1; i <= live_range.second; i++) { - machine_mode mode = TYPE_MODE (TREE_TYPE (var)); + machine_mode mode = TREE_CODE (TREE_TYPE (var)) == BOOLEAN_TYPE + ? BImode +
[Committed] RISC-V: Fix indent
Fix indent of some codes to make them 8 spaces align. Committed. gcc/ChangeLog: * config/riscv/vector.md: Fix indent. --- gcc/config/riscv/vector.md | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md index 24f91f058ef..1de656a8ced 100644 --- a/gcc/config/riscv/vector.md +++ b/gcc/config/riscv/vector.md @@ -53,8 +53,8 @@ vslideup,vslidedown,vislide1up,vislide1down,vfslide1up,vfslide1down,\ vgather,vcompress,vlsegde,vssegte,vlsegds,vssegts,vlsegdux,vlsegdox,\ vssegtux,vssegtox,vlsegdff,vandn,vbrev,vbrev8,vrev8,vclz,vctz,vrol,\ - vror,vwsll,vclmul,vclmulh,vghsh,vgmul,vaesef,vaesem,vaesdf,vaesdm,\ - vaeskf1,vaeskf2,vaesz,vsha2ms,vsha2ch,vsha2cl,vsm4k,vsm4r,vsm3me,vsm3c") + vror,vwsll,vclmul,vclmulh,vghsh,vgmul,vaesef,vaesem,vaesdf,vaesdm,\ + vaeskf1,vaeskf2,vaesz,vsha2ms,vsha2ch,vsha2cl,vsm4k,vsm4r,vsm3me,vsm3c") (const_string "true")] (const_string "false"))) @@ -77,8 +77,8 @@ vslideup,vslidedown,vislide1up,vislide1down,vfslide1up,vfslide1down,\ vgather,vcompress,vlsegde,vssegte,vlsegds,vssegts,vlsegdux,vlsegdox,\ vssegtux,vssegtox,vlsegdff,vandn,vbrev,vbrev8,vrev8,vclz,vctz,vrol,\ - vror,vwsll,vclmul,vclmulh,vghsh,vgmul,vaesef,vaesem,vaesdf,vaesdm,\ - vaeskf1,vaeskf2,vaesz,vsha2ms,vsha2ch,vsha2cl,vsm4k,vsm4r,vsm3me,vsm3c") + vror,vwsll,vclmul,vclmulh,vghsh,vgmul,vaesef,vaesem,vaesdf,vaesdm,\ + vaeskf1,vaeskf2,vaesz,vsha2ms,vsha2ch,vsha2cl,vsm4k,vsm4r,vsm3me,vsm3c") (const_string "true")] (const_string "false"))) @@ -707,7 +707,7 @@ vired,viwred,vfredu,vfredo,vfwredu,vfwredo,vimovxv,vfmovfv,\ vslideup,vslidedown,vislide1up,vislide1down,vfslide1up,vfslide1down,\ vgather,vldff,viwmuladd,vfwmuladd,vlsegde,vlsegds,vlsegdux,vlsegdox,vlsegdff,\ - vandn,vbrev,vbrev8,vrev8,vrol,vror,vwsll,vclmul,vclmulh") + vandn,vbrev,vbrev8,vrev8,vrol,vror,vwsll,vclmul,vclmulh") (const_int 2) (eq_attr "type" "vimerge,vfmerge,vcompress,vghsh,vgmul,vaesef,vaesem,vaesdf,vaesdm,\ -- 2.36.3
[Committed V3] RISC-V: Fix bug of earliest fusion for infinite loops [VSETVL PASS]
As PR113206 and PR113209, the bugs happens on the following situation: li a4,32 ... vsetvli zero,a4,e8,m8,ta,ma ... slliw a4,a3,24 sraiw a4,a4,24 bge a3,a1,.L8 sb a4,%lo(e)(a0) vsetvli zero,a4,e8,m8,ta,ma --> a4 is polluted value not the expected "32". ... .L7: j .L7 ---> infinite loop. The root cause is that infinite loop confuse earliest computation and let earliest fusion happens on unexpected place. Disable blocks that belong to infinite loop to fix this bug since applying ealiest LCM fusion on infinite loop seems quite complicated and we don't see any benefits. Note that disabling earliest fusion on infinite loops doesn't hurt the vsetvli performance, instead, it does improve codegen of some cases. Tested on both RV32 and RV64 no regression. PR target/113206 PR target/113209 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (invalid_opt_bb_p): New function. (pre_vsetvl::compute_lcm_local_properties): Disable earliest fusion on blocks belong to infinite loop. (pre_vsetvl::emit_vsetvl): Remove fake edges. * config/riscv/t-riscv: Add a new include file. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/avl_single-23.c: Adapt test. * gcc.target/riscv/rvv/vsetvl/vlmax_call-1.c: Robostify test. * gcc.target/riscv/rvv/vsetvl/vlmax_call-2.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_call-3.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_conflict-5.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-1.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-2.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-3.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-4.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-5.c: Ditto. * gcc.target/riscv/rvv/autovec/pr113206-1.c: New test. * gcc.target/riscv/rvv/autovec/pr113206-2.c: New test. * gcc.target/riscv/rvv/autovec/pr113209.c: New test. 
--- gcc/config/riscv/riscv-vsetvl.cc | 43 +++ gcc/config/riscv/t-riscv | 2 +- .../gcc.target/riscv/rvv/autovec/pr113206-1.c | 29 + .../gcc.target/riscv/rvv/autovec/pr113206-2.c | 29 + .../gcc.target/riscv/rvv/autovec/pr113209.c | 34 +++ .../riscv/rvv/vsetvl/avl_single-23.c | 1 - .../riscv/rvv/vsetvl/vlmax_call-1.c | 15 +++ .../riscv/rvv/vsetvl/vlmax_call-2.c | 12 +++--- .../riscv/rvv/vsetvl/vlmax_call-3.c | 12 +++--- .../riscv/rvv/vsetvl/vlmax_conflict-5.c | 5 +-- .../riscv/rvv/vsetvl/vlmax_single_vtype-1.c | 14 +++--- .../riscv/rvv/vsetvl/vlmax_single_vtype-2.c | 6 +-- .../riscv/rvv/vsetvl/vlmax_single_vtype-3.c | 6 +-- .../riscv/rvv/vsetvl/vlmax_single_vtype-4.c | 4 +- .../riscv/rvv/vsetvl/vlmax_single_vtype-5.c | 4 +- 15 files changed, 166 insertions(+), 50 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113206-1.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113206-2.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113209.c diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index eabaef80f89..d44922feafd 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -85,6 +85,7 @@ along with GCC; see the file COPYING3. If not see #include "predict.h" #include "profile-count.h" #include "gcse.h" +#include "cfgloop.h" using namespace rtl_ssa; using namespace riscv_vector; @@ -648,6 +649,27 @@ has_no_uses (basic_block cfg_bb, rtx_insn *rinsn, int regno) return true; } +/* Return true for the special block that we can't apply LCM optimization. */ +static bool +invalid_opt_bb_p (basic_block cfg_bb) +{ + edge e; + edge_iterator ei; + + /* We don't do LCM optimizations on complex edges. */ + FOR_EACH_EDGE (e, ei, cfg_bb->preds) +if (e->flags & EDGE_COMPLEX) + return true; + + /* We only do LCM optimizations on blocks that are post dominated by + EXIT block, that is, we don't do LCM optimizations on infinite loop. 
*/ + FOR_EACH_EDGE (e, ei, cfg_bb->succs) +if (e->flags & EDGE_FAKE) + return true; + + return false; +} + /* This flags indicates the minimum demand of the vl and vtype values by the RVV instruction. For example, DEMAND_RATIO_P indicates that this RVV instruction only needs the SEW/LMUL ratio to remain the same, and does not @@ -2261,6 +2283,9 @@ public: { /* Initialization of RTL_SSA. */ calculate_dominance_info (CDI_DOMINATORS); +loop_optimizer_init (LOOPS_NORMAL); +/* Create FAKE edges for infinite loops. */ +connect_infinite_loops_to_exit (); df_analyze (); crtl->ssa = new function_info (cfun); m_vector_block_infos.safe_grow_c
[PATCH V2] RISC-V: Fix bug of earliest fusion for infinite loops [VSETVL PASS]
As PR113206 and PR113209, the bugs happens on the following situation: li a4,32 ... vsetvli zero,a4,e8,m8,ta,ma ... slliw a4,a3,24 sraiw a4,a4,24 bge a3,a1,.L8 sb a4,%lo(e)(a0) vsetvli zero,a4,e8,m8,ta,ma --> a4 is polluted value not the expected "32". ... .L7: j .L7 ---> infinite loop. The root cause is that infinite loop confuse earliest computation and let earliest fusion happens on unexpected place. Disable blocks that belong to infinite loop to fix this bug since applying ealiest LCM fusion on infinite loop seems quite complicated and we don't see any benefits. Note that disabling earliest fusion on infinite loops doesn't hurt the vsetvli performance, instead, it does improve codegen of some cases. Tested on both RV32 and RV64 no regression. Ok for trunk ? PR target/113206 PR target/113209 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (invalid_opt_bb_p): New function. (pre_vsetvl::compute_lcm_local_properties): Disable earliest fusion on blocks belong to infinite loop. (pre_vsetvl::emit_vsetvl): Remove fake edges. * config/riscv/t-riscv: Add a new include file. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/avl_single-23.c: Adapt test. * gcc.target/riscv/rvv/vsetvl/vlmax_call-1.c: Robostify test. * gcc.target/riscv/rvv/vsetvl/vlmax_call-2.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_call-3.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_conflict-5.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-1.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-2.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-3.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-4.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-5.c: Ditto. * gcc.target/riscv/rvv/autovec/pr113206-1.c: New test. * gcc.target/riscv/rvv/autovec/pr113206-2.c: New test. * gcc.target/riscv/rvv/autovec/pr113209.c: New test. 
--- gcc/config/riscv/riscv-vsetvl.cc | 41 +++ gcc/config/riscv/t-riscv | 2 +- .../gcc.target/riscv/rvv/autovec/pr113206-1.c | 29 + .../gcc.target/riscv/rvv/autovec/pr113206-2.c | 29 + .../gcc.target/riscv/rvv/autovec/pr113209.c | 34 +++ .../riscv/rvv/vsetvl/avl_single-23.c | 1 - .../riscv/rvv/vsetvl/vlmax_call-1.c | 15 --- .../riscv/rvv/vsetvl/vlmax_call-2.c | 12 +++--- .../riscv/rvv/vsetvl/vlmax_call-3.c | 12 +++--- .../riscv/rvv/vsetvl/vlmax_conflict-5.c | 5 +-- .../riscv/rvv/vsetvl/vlmax_single_vtype-1.c | 14 +++ .../riscv/rvv/vsetvl/vlmax_single_vtype-2.c | 6 +-- .../riscv/rvv/vsetvl/vlmax_single_vtype-3.c | 6 +-- .../riscv/rvv/vsetvl/vlmax_single_vtype-4.c | 4 +- .../riscv/rvv/vsetvl/vlmax_single_vtype-5.c | 4 +- 15 files changed, 164 insertions(+), 50 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113206-1.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113206-2.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113209.c diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index eabaef80f89..7b1d8376e41 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -85,6 +85,7 @@ along with GCC; see the file COPYING3. If not see #include "predict.h" #include "profile-count.h" #include "gcse.h" +#include "cfgloop.h" using namespace rtl_ssa; using namespace riscv_vector; @@ -648,6 +649,27 @@ has_no_uses (basic_block cfg_bb, rtx_insn *rinsn, int regno) return true; } +/* Return true for the special block that we can't apply LCM optimization. */ +static bool +invalid_opt_bb_p (basic_block cfg_bb) +{ + edge e; + edge_iterator ei; + + /* We don't do LCM optimizations on complex edges. */ + FOR_EACH_EDGE (e, ei, cfg_bb->preds) +if (e->flags & EDGE_COMPLEX) + return true; + + /* We only do LCM optimizations on blocks that are post dominated by + EXIT block, that is, we don't do LCM optimizations on infinite loop. 
*/ + FOR_EACH_EDGE (e, ei, cfg_bb->succs) +if (e->flags & EDGE_FAKE) + return true; + + return false; +} + /* This flags indicates the minimum demand of the vl and vtype values by the RVV instruction. For example, DEMAND_RATIO_P indicates that this RVV instruction only needs the SEW/LMUL ratio to remain the same, and does not @@ -2261,6 +2283,8 @@ public: { /* Initialization of RTL_SSA. */ calculate_dominance_info (CDI_DOMINATORS); +loop_optimizer_init (LOOPS_NORMAL); +connect_infinite_loops_to_exit (); df_analyze (); crtl->ssa = new function_info (cfun); m_vector_block_infos.safe_grow_cleared (last_basic_block_for_fn (
[PATCH] RISC-V: Fix bug of earliest fusion for infinite loop [VSETVL PASS]
As PR113206, the bugs happens on the following situation: li a4,32 ... vsetvli zero,a4,e8,m8,ta,ma ... slliw a4,a3,24 sraiw a4,a4,24 bge a3,a1,.L8 sb a4,%lo(e)(a0) vsetvli zero,a4,e8,m8,ta,ma --> a4 is polluted value not the expected "32". ... .L7: j .L7 ---> infinite loop. The root cause is that infinite loop confuse earliest computation and let earliest fusion happens on unexpected place. Disable blocks that belong to infinite loop to fix this bug since applying ealiest LCM fusion on infinite loop seems quite complicated and we don't see any benefits. Note that disabling earliest fusion on infinite loops doesn't hurt the vsetvli performance, instead, it does improve codegen of some cases. Tested on both RV32 and RV64 no regression. Ok for trunk ? PR target/113206 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (invalid_opt_bb_p): New function. (pre_vsetvl::compute_lcm_local_properties): Disable earliest fusion on blocks belong to infinite loop. * config/riscv/t-riscv: Add a new include file. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/avl_single-23.c: Adapt test. * gcc.target/riscv/rvv/vsetvl/vlmax_call-1.c: Robostify test. * gcc.target/riscv/rvv/vsetvl/vlmax_call-2.c: Robostify test. * gcc.target/riscv/rvv/vsetvl/vlmax_call-3.c: Robostify test. * gcc.target/riscv/rvv/vsetvl/vlmax_conflict-5.c: Robostify test. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-1.c: Robostify test. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-2.c: Robostify test. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-3.c: Robostify test. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-4.c: Robostify test. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-5.c: Robostify test. * gcc.target/riscv/rvv/autovec/pr113206-1.c: New test. * gcc.target/riscv/rvv/autovec/pr113206-2.c: New test. 
--- gcc/config/riscv/riscv-vsetvl.cc | 39 +++ gcc/config/riscv/t-riscv | 2 +- .../gcc.target/riscv/rvv/autovec/pr113206-1.c | 29 ++ .../gcc.target/riscv/rvv/autovec/pr113206-2.c | 29 ++ .../riscv/rvv/vsetvl/avl_single-23.c | 1 - .../riscv/rvv/vsetvl/vlmax_call-1.c | 15 --- .../riscv/rvv/vsetvl/vlmax_call-2.c | 12 +++--- .../riscv/rvv/vsetvl/vlmax_call-3.c | 12 +++--- .../riscv/rvv/vsetvl/vlmax_conflict-5.c | 5 +-- .../riscv/rvv/vsetvl/vlmax_single_vtype-1.c | 14 +++ .../riscv/rvv/vsetvl/vlmax_single_vtype-2.c | 6 +-- .../riscv/rvv/vsetvl/vlmax_single_vtype-3.c | 6 +-- .../riscv/rvv/vsetvl/vlmax_single_vtype-4.c | 4 +- .../riscv/rvv/vsetvl/vlmax_single_vtype-5.c | 4 +- 14 files changed, 128 insertions(+), 50 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113206-1.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113206-2.c diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index eabaef80f89..07dcdfd217e 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -85,6 +85,7 @@ along with GCC; see the file COPYING3. If not see #include "predict.h" #include "profile-count.h" #include "gcse.h" +#include "cfgloop.h" using namespace rtl_ssa; using namespace riscv_vector; @@ -648,6 +649,27 @@ has_no_uses (basic_block cfg_bb, rtx_insn *rinsn, int regno) return true; } +/* Return true for the special block that we can't apply LCM optimization. */ +static bool +invalid_opt_bb_p (basic_block cfg_bb) +{ + edge e; + edge_iterator ei; + + /* We don't do LCM optimizations on complex edges. */ + FOR_EACH_EDGE (e, ei, cfg_bb->preds) +if (e->flags & EDGE_COMPLEX) + return true; + + /* We only do LCM optimizations on blocks that are post dominated by + EXIT block, that is, we don't do LCM optimizations on infinite loop. 
*/ + FOR_EACH_EDGE (e, ei, cfg_bb->succs) +if (e->flags & EDGE_FAKE) + return true; + + return false; +} + /* This flags indicates the minimum demand of the vl and vtype values by the RVV instruction. For example, DEMAND_RATIO_P indicates that this RVV instruction only needs the SEW/LMUL ratio to remain the same, and does not @@ -2261,6 +2283,8 @@ public: { /* Initialization of RTL_SSA. */ calculate_dominance_info (CDI_DOMINATORS); +loop_optimizer_init (LOOPS_NORMAL); +connect_infinite_loops_to_exit (); df_analyze (); crtl->ssa = new function_info (cfun); m_vector_block_infos.safe_grow_cleared (last_basic_block_for_fn (cfun)); @@ -2271,6 +2295,8 @@ public: void finish () { free_dominance_info (CDI_DOMINATORS); +remove_fake_exit_edges (); +loop_optimizer_finalize (); if (crtl->ssa->perform_pending_updates ())
[Committed] RISC-V: Add simplification of dummy len and dummy mask COND_LEN_xxx pattern
In https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=d1eacedc6d9ba9f5522f2c8d49ccfdf7939ad72d I optimize COND_LEN_xxx pattern with dummy len and dummy mask with too simply solution which causes redundant vsetvli in the following case: vsetvli a5,a2,e8,m1,ta,ma vle32.v v8,0(a0) vsetivlizero,16,e32,m4,tu,mu > We should apply VLMAX instead of a CONST_INT AVL sllia4,a5,2 vand.vv v0,v8,v16 vand.vv v4,v8,v12 vmseq.viv0,v0,0 sub a2,a2,a5 vneg.v v4,v8,v0.t vsetvli zero,a5,e32,m4,ta,ma The root cause above is the following codes: is_vlmax_len_p (...) return poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode)) && !satisfies_constraint_K (len);---> incorrect check. Actually, we should not elide the VLMAX situation that has AVL in range of [0,31]. After removing the the check above, we will have this following issue: vsetivlizero,4,e32,m1,ta,ma vlseg4e32.v v4,(a5) vlseg4e32.v v12,(a3) vsetvli a5,zero,e32,m1,tu,ma ---> This is redundant since VLMAX AVL = 4 when it is fixed-vlmax vfadd.vfv3,v13,fa0 vfadd.vfv1,v12,fa1 vfmul.vvv17,v3,v5 vfmul.vvv16,v1,v5 Since all the following operations (vfadd.vf ... etc) are COND_LEN_xxx with dummy len and dummy mask, we add the simplification operations dummy len and dummy mask into VLMAX TA and MA policy. So, after this patch. Both cases are optimal codegen now: case 1: vsetvli a5,a2,e32,m1,ta,mu vle32.v v2,0(a0) sllia4,a5,2 vand.vv v1,v2,v3 vand.vv v0,v2,v4 sub a2,a2,a5 vmseq.viv0,v0,0 vneg.v v1,v2,v0.t vse32.v v1,0(a1) case 2: vsetivli zero,4,e32,m1,tu,ma addi a4,a5,400 vlseg4e32.v v12,(a3) vfadd.vf v3,v13,fa0 vfadd.vf v1,v12,fa1 vlseg4e32.v v4,(a4) vfadd.vf v2,v14,fa1 vfmul.vv v17,v3,v5 vfmul.vv v16,v1,v5 This patch is just additional fix of previous approved patch. Tested on both RV32 and RV64 newlib no regression. Committed. gcc/ChangeLog: * config/riscv/riscv-v.cc (is_vlmax_len_p): Remove satisfies_constraint_K. (expand_cond_len_op): Add simplification of dummy len and dummy mask. 
gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/base/vf_avl-3.c: New test. --- gcc/config/riscv/riscv-v.cc| 11 --- gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-3.c | 11 +++ 2 files changed, 19 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-3.c diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index b4c7e0f0126..3c83be35715 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -74,8 +74,7 @@ is_vlmax_len_p (machine_mode mode, rtx len) { poly_int64 value; return poly_int_rtx_p (len, &value) -&& known_eq (value, GET_MODE_NUNITS (mode)) -&& !satisfies_constraint_K (len); +&& known_eq (value, GET_MODE_NUNITS (mode)); } /* Helper functions for insn_flags && insn_types */ @@ -3855,7 +3854,13 @@ expand_cond_len_op (unsigned icode, insn_flags op_type, rtx *ops, rtx len) bool is_vlmax_len = is_vlmax_len_p (mode, len); unsigned insn_flags = HAS_DEST_P | HAS_MASK_P | HAS_MERGE_P | op_type; - if (is_dummy_mask) + /* FIXME: We don't support simplification of COND_LEN_NEG (..., dummy len, + dummy mask) into NEG_EXPR in GIMPLE FOLD yet. So, we do such + simplification in RISC-V backend and may do that in middle-end in the + future. */ + if (is_dummy_mask && is_vlmax_len) +insn_flags |= TDEFAULT_POLICY_P | MDEFAULT_POLICY_P; + else if (is_dummy_mask) insn_flags |= TU_POLICY_P | MDEFAULT_POLICY_P; else if (is_vlmax_len) insn_flags |= TDEFAULT_POLICY_P | MU_POLICY_P; diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-3.c b/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-3.c new file mode 100644 index 000..116b5b538cc --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-3.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=fixed-vlmax" } */ + +void foo (int *src, int *dst, int size) { + int i; + for (i = 0; i < size; i++) + *dst++ = *src & 0x80 ? 
(*src++ & 0x7f) : -*src++; +} + +/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*[a-x0-9]+,\s*e32,\s*m1,\s*t[au],\s*mu} 1 } } */ +/* { dg-final { scan-assembler-times {vsetvli} 1 } } */ -- 2.36.3
[PATCH] RISC-V: Make liveness be aware of rgroup number of LENS [dynamic LMUL]
This patch fixes the following situation: vl4re16.v v12,0(a5) ... vl4re16.v v16,0(a3) vs4r.v v12,0(a5) ... vl4re16.v v4,0(a0) vs4r.v v16,0(a3) ... vsetvli a3,zero,e16,m4,ta,ma ... vmv.v.x v8,t6 vmsgeu.vv v2,v16,v8 vsub.vv v16,v16,v8 vs4r.v v16,0(a5) ... vs4r.v v4,0(a0) vmsgeu.vv v1,v4,v8 ... vsub.vv v4,v4,v8 sllia6,a4,2 vs4r.v v4,0(a5) ... vsub.vv v4,v12,v8 vmsgeu.vv v3,v12,v8 vs4r.v v4,0(a5) ... There are many spills which are 'vs4r.v'. The root cause is that we don't count vector REG liveness referencing the rgroup controls. _29 = _25->iatom[0]; is transformed into the following vect statement with 4 different loop_len (loop_len_74, loop_len_75, loop_len_76, loop_len_77). vect__29.11_78 = .MASK_LEN_LOAD (vectp_sb.9_72, 32B, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, loop_len_74, 0); vect__29.12_80 = .MASK_LEN_LOAD (vectp_sb.9_79, 32B, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, loop_len_75, 0); vect__29.13_82 = .MASK_LEN_LOAD (vectp_sb.9_81, 32B, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, loop_len_76, 0); vect__29.14_84 = .MASK_LEN_LOAD (vectp_sb.9_83, 32B, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, loop_len_77, 0); which are the LENS number (LOOP_VINFO_LENS (loop_vinfo).length ()). Count liveness according to LOOP_VINFO_LENS (loop_vinfo).length () to compute liveness more accurately: vsetivlizero,8,e16,m1,ta,ma vmsgeu.vi v19,v14,8 vadd.vi v18,v14,-8 vmsgeu.vi v17,v1,8 vadd.vi v16,v1,-8 vlm.v v15,0(a5) ... Tested no regression, ok for trunk ? PR target/113112 gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (compute_nregs_for_mode): Add rgroup info. (max_number_of_live_regs): Ditto. (has_unexpected_spills_p): Ditto. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/pr113112-5.c: New test. 
--- gcc/config/riscv/riscv-vector-costs.cc| 34 +++ .../vect/costmodel/riscv/rvv/pr113112-5.c | 24 + 2 files changed, 52 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-5.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 1199b3af067..12d3b57aff6 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -373,13 +373,17 @@ compute_local_live_ranges ( E.g. If mode = SImode, biggest_mode = DImode, LMUL = M4. Then return RVVM4SImode (LMUL = 4, element mode = SImode). */ static unsigned int -compute_nregs_for_mode (machine_mode mode, machine_mode biggest_mode, int lmul) +compute_nregs_for_mode (loop_vec_info loop_vinfo, machine_mode mode, + machine_mode biggest_mode, int lmul) { + unsigned int rgroup_size = LOOP_VINFO_LENS (loop_vinfo).is_empty () + ? 1 + : LOOP_VINFO_LENS (loop_vinfo).length (); unsigned int mode_size = GET_MODE_SIZE (mode).to_constant (); unsigned int biggest_size = GET_MODE_SIZE (biggest_mode).to_constant (); gcc_assert (biggest_size >= mode_size); unsigned int ratio = biggest_size / mode_size; - return MAX (lmul / ratio, 1); + return MAX (lmul / ratio, 1) * rgroup_size; } /* This function helps to determine whether current LMUL will cause @@ -393,7 +397,7 @@ compute_nregs_for_mode (machine_mode mode, machine_mode biggest_mode, int lmul) mode. - Third, Return the maximum V_REGs are alive of the loop. 
*/ static unsigned int -max_number_of_live_regs (const basic_block bb, +max_number_of_live_regs (loop_vec_info loop_vinfo, const basic_block bb, const hash_map &live_ranges, unsigned int max_point, machine_mode biggest_mode, int lmul) @@ -412,7 +416,7 @@ max_number_of_live_regs (const basic_block bb, { machine_mode mode = TYPE_MODE (TREE_TYPE (var)); unsigned int nregs - = compute_nregs_for_mode (mode, biggest_mode, lmul); + = compute_nregs_for_mode (loop_vinfo, mode, biggest_mode, lmul); live_vars_vec[i] += nregs; if (live_vars_vec[i] > max_nregs) { @@ -687,6 +691,24 @@ update_local_live_ranges ( dump_printf_loc (MSG_NOTE, vect_location, "Add perm indice %T, start = 0, end = %d\n", sel, max_point); + if (!LOOP_VINFO_LENS (loop_vinfo).is_empty () + && LOOP_VINFO_LENS (loop_vinfo).length () > 1) + { + /* If we are vectorizing a permutation when the rgroup number +> 1, we will need additional mask to shuffle the second +vector. */ + tree mask = build_decl (UNKNOWN_LOCATION, VAR_DECL, +
[Committed] RISC-V: Declare STMT_VINFO_TYPE (...) as local variable
Committed. gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc: Move STMT_VINFO_TYPE (...) to local. --- gcc/config/riscv/riscv-vector-costs.cc | 9 - 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index b41a79429d4..1199b3af067 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -279,10 +279,11 @@ compute_local_live_ranges ( gimple *stmt = program_point.stmt; stmt_vec_info stmt_info = program_point.stmt_info; tree lhs = gimple_get_lhs (stmt); + enum stmt_vec_info_type type + = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info)); if (lhs != NULL_TREE && is_gimple_reg (lhs) && (!POINTER_TYPE_P (TREE_TYPE (lhs)) - || STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info)) - != store_vec_info_type)) + || type != store_vec_info_type)) { biggest_mode = get_biggest_mode (biggest_mode, TYPE_MODE (TREE_TYPE (lhs))); @@ -309,9 +310,7 @@ compute_local_live_ranges ( if (poly_int_tree_p (var) || (is_gimple_val (var) && (!POINTER_TYPE_P (TREE_TYPE (var)) - || STMT_VINFO_TYPE ( - vect_stmt_to_vectorize (stmt_info)) - != load_vec_info_type))) + || type != load_vec_info_type))) { biggest_mode = get_biggest_mode (biggest_mode, -- 2.36.3
[Committed] RISC-V: Robustify testcase pr113112-1.c
The redudant dump check is fragile and easily changed, not necessary. Tested on both RV32/RV64 no regression. Remove it and committed. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c: Remove redundant checks. --- gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c index 95df7809d49..2dc39ad8e8b 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c @@ -24,6 +24,3 @@ foo (int n){ /* { dg-final { scan-assembler-not {jr} } } */ /* { dg-final { scan-assembler-times {ret} 1 } } */ /* { dg-final { scan-tree-dump-times "Preferring smaller LMUL loop because it has unexpected spills" 1 "vect" } } */ -/* { dg-final { scan-tree-dump "At most 8 number of live V_REG at program point 1 for bb 4" "vect" } } */ -/* { dg-final { scan-tree-dump "At most 40 number of live V_REG at program point 1 for bb 3" "vect" } } */ -/* { dg-final { scan-tree-dump "At most 8 number of live V_REG at program point 1 for bb 5" "vect" } } */ -- 2.36.3
[PATCH] RISC-V: Count pointer type SSA into RVV regs liveness for dynamic LMUL cost model
This patch fixes the following choosing unexpected big LMUL which cause register spillings. Before this patch, choosing LMUL = 4: addisp,sp,-160 addiw t1,a2,-1 li a5,7 bleut1,a5,.L16 vsetivlizero,8,e64,m4,ta,ma vmv.v.x v4,a0 vs4r.v v4,0(sp)---> spill to the stack. vmv.v.x v4,a1 addia5,sp,64 vs4r.v v4,0(a5)---> spill to the stack. The root cause is the following codes: if (poly_int_tree_p (var) || (is_gimple_val (var) && !POINTER_TYPE_P (TREE_TYPE (var We count the variable as consuming a RVV reg group when it is not POINTER_TYPE. It is right for load/store STMT for example: _1 = (MEM)*addr --> addr won't be allocated an RVV vector group. However, we find it is not right for non-load/store STMT: _3 = _1 == x_8(D); _1 is pointer type too but we does allocate a RVV register group for it. So after this patch, we are choosing the perfect LMUL for the testcase in this patch: ble a2,zero,.L17 addiw a7,a2,-1 li a5,3 bleua7,a5,.L15 srliw a5,a7,2 sllia6,a5,1 add a6,a6,a5 lui a5,%hi(replacements) addit1,a5,%lo(replacements) sllia6,a6,5 lui t4,%hi(.LANCHOR0) lui t3,%hi(.LANCHOR0+8) lui a3,%hi(.LANCHOR0+16) lui a4,%hi(.LC1) vsetivlizero,4,e16,mf2,ta,ma addit4,t4,%lo(.LANCHOR0) addit3,t3,%lo(.LANCHOR0+8) addia3,a3,%lo(.LANCHOR0+16) addia4,a4,%lo(.LC1) add a6,t1,a6 addia5,a5,%lo(replacements) vle16.v v18,0(t4) vle16.v v17,0(t3) vle16.v v16,0(a3) vmsgeu.vi v25,v18,4 vadd.vi v24,v18,-4 vmsgeu.vi v23,v17,4 vadd.vi v22,v17,-4 vlm.v v21,0(a4) vmsgeu.vi v20,v16,4 vadd.vi v19,v16,-4 vsetvli zero,zero,e64,m2,ta,mu vmv.v.x v12,a0 vmv.v.x v14,a1 .L4: vlseg3e64.v v6,(a5) vmseq.vvv2,v6,v12 vmseq.vvv0,v8,v12 vmsne.vvv1,v8,v12 vmand.mmv1,v1,v2 vmerge.vvm v2,v8,v14,v0 vmv1r.v v0,v1 addia4,a5,24 vmerge.vvm v6,v6,v14,v0 vmerge.vim v2,v2,0,v0 vrgatherei16.vv v4,v6,v18 vmv1r.v v0,v25 vrgatherei16.vv v4,v2,v24,v0.t vs1r.v v4,0(a5) addia3,a5,48 vmv1r.v v0,v21 vmv2r.v v4,v2 vcompress.vmv4,v6,v0 vs1r.v v4,0(a4) vmv1r.v v0,v23 addia4,a5,72 vrgatherei16.vv v4,v6,v17 vrgatherei16.vv v4,v2,v22,v0.t vs1r.v 
v4,0(a3) vmv1r.v v0,v20 vrgatherei16.vv v4,v6,v16 addia5,a5,96 vrgatherei16.vv v4,v2,v19,v0.t vs1r.v v4,0(a4) bne a6,a5,.L4 No spillings, no "sp" register used. Tested on both RV32 and RV64, no regression. Ok for trunk ? PR target/113112 gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (compute_nregs_for_mode): Fix pointer type liveness count. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c: New test. --- gcc/config/riscv/riscv-vector-costs.cc| 12 ++-- .../vect/costmodel/riscv/rvv/pr113112-4.c | 28 +++ 2 files changed, 37 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 0c485dc4f29..b41a79429d4 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -277,9 +277,12 @@ compute_local_live_ranges ( { unsigned int point = program_point.point; gimple *stmt = program_point.stmt; + stmt_vec_info stmt_info = program_point.stmt_info; tree lhs = gimple_get_lhs (stmt); if (lhs != NULL_TREE && is_gimple_reg (lhs) - && !POINTER_TYPE_P (TREE_TYPE (lhs))) + && (!POINTER_TYPE_P (TREE_TYPE (lhs)) + || STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info)) + != store_vec_info_type)) { biggest_mode = get_biggest_mode (biggest_mode, TYPE_MODE (TREE_TYPE (lhs))); @@ -305,7 +308,10 @@ compute_local_live_ranges ( the future. */ if (poly_int_tree_p (var) || (is_gimple_val (var) - && !POINTER_TYPE_P (TREE_TYPE (var + && (!POINTER_TYPE_P (TREE_TYPE (var)) + || STMT_VINFO_TYPE ( +
[Committed] RISC-V: Make dynamic LMUL cost model more accurate for conversion codes
Notice current dynamic LMUL is not accurate for conversion codes. Refine for it, there is current case is changed from choosing LMUL = 4 into LMUL = 8. Tested no regression, committed. Before this patch (LMUL = 4): After this patch (LMUL = 8): lw a7,56(sp) lwa7,56(sp) ld t5,0(sp) ldt5,0(sp) ld t1,8(sp) ldt1,8(sp) ld t6,16(sp) ldt6,16(sp) ld t0,24(sp) ldt0,24(sp) ld t3,32(sp) ldt3,32(sp) ld t4,40(sp) ldt4,40(sp) ble a7,zero,.L5 ble a7,zero,.L5 .L3: .L3: vsetvli a4,a7,e32,m2,ta,mavsetvli a4,a7,e32,m4,ta vle8.v v1,0(a2) vle8.vv3,0(a2) vle8.v v4,0(a1) vle8.vv16,0(t0) vsext.vf4 v8,v1 vle8.vv7,0(a1) vsext.vf4 v2,v4 vle8.vv12,0(t6) vsetvli zero,zero,e8,mf2,ta,mavle8.vv2,0(a5) vadd.vv v4,v4,v1 vle8.vv1,0(t5) vsetvli zero,zero,e32,m2,ta,mavsext.vf4 v20,v3 vle8.v v5,0(t0) vsext.vf4 v8,v7 vle8.v v6,0(t6) vadd.vv v8,v8,v20 vadd.vv v2,v2,v8 vadd.vv v8,v8,v8 vadd.vv v2,v2,v2 vadd.vv v8,v8,v20 vadd.vv v2,v2,v8 vsetvli zero,zero,e8,m1 vsetvli zero,zero,e8,mf2,ta,mavadd.vv v15,v12,v16 vadd.vv v6,v6,v5 vsetvli zero,zero,e32,m4 vsetvli zero,zero,e32,m2,ta,mavsext.vf4 v12,v15 vle8.v v8,0(t5) vadd.vv v8,v8,v12 vle8.v v9,0(a5) vsetvli zero,zero,e8,m1 vsext.vf4 v10,v4vadd.vv v7,v7,v3 vsext.vf4 v12,v6vsetvli zero,zero,e32,m4 vadd.vv v2,v2,v12 vsext.vf4 v4,v7 vadd.vv v2,v2,v10 vadd.vv v8,v8,v4 vsetvli zero,zero,e16,m1,ta,mavsetvli zero,zero,e16,m2 vncvt.x.x.w v4,v2 vncvt.x.x.w v4,v8 vsetvli zero,zero,e32,m2,ta,mavsetvli zero,zero,e8,m1 vadd.vv v6,v2,v2 vncvt.x.x.w v4,v4 vsetvli zero,zero,e8,mf2,ta,mavadd.vv v15,v3,v4 vncvt.x.x.w v4,v4 vadd.vv v2,v2,v4 vadd.vv v5,v5,v4 vse8.vv15,0(t4) vadd.vv v9,v9,v4 vadd.vv v3,v16,v4 vadd.vv v1,v1,v4 vse8.vv2,0(a3) vadd.vv v4,v8,v4 vadd.vv v1,v1,v4 vse8.v v1,0(t4) vse8.vv1,0(a6) vse8.v v9,0(a3) vse8.vv3,0(t1) vsetvli zero,zero,e32,m2,ta,mavsetvli zero,zero,e32,m4 vse8.v v4,0(a6) vsext.vf4 v4,v3 vsext.vf4 v8,v5 vadd.vv v4,v4,v8 vse8.v v5,0(t1) vsetvli zero,zero,e64,m8 vadd.vv v2,v8,v2 vsext.vf2 v16,v4 vsetvli zero,zero,e64,m4,ta,mavse64.v v16,0(t3) 
vsext.vf2 v8,v2 vsetvli zero,zero,e32,m4 vsetvli zero,zero,e32,m2,ta,mavadd.vv v8,v8,v8 sllit2,a4,3 vsext.vf4 v4,v15 vse64.v v8,0(t3) slli t2,a4,3 vsext.vf4 v2,v1 vadd.vv v4,v8,v4 sub a7,a7,a4 sub a7,a7,a4 vadd.vv v2,v6,v2 vsetvli zero,zero,e64,m8 vsetvli zero,zero,e64,m4,ta,mavsext.vf2 v8,v4 vsext.vf2 v4,v2 vse64.v v8,0(a0) vse64.v v4,0(a0) add a1,a1,a4 add a2,a2,a4 add a2,a2,a4 add a1,a1,a4 add a5,a5,a4 add t6,t6,a4 add t5,t5,a4 add t0,t0,a4 add t6,t6,a4 add a5,a5,a4 add
[Committed] RISC-V: Make known NITERS loop be aware of dynamic LMUL cost model liveness information
Consider this following case: int f[12][100]; void bad1(int v1, int v2) { for (int r = 0; r < 100; r += 4) { int i = r + 1; f[0][r] = f[1][r] * (f[2][r]) - f[1][i] * (f[2][i]); f[0][i] = f[1][r] * (f[2][i]) + f[1][i] * (f[2][r]); f[0][r+2] = f[1][r+2] * (f[2][r+2]) - f[1][i+2] * (f[2][i+2]); f[0][i+2] = f[1][r+2] * (f[2][i+2]) + f[1][i+2] * (f[2][r+2]); } } Pick up LMUL = 8 VLS blindly: lui a4,%hi(f) addia4,a4,%lo(f) addisp,sp,-592 addia3,a4,800 lui a5,%hi(.LANCHOR0) vl8re32.v v24,0(a3) addia5,a5,%lo(.LANCHOR0) addia1,a4,400 addia3,sp,140 vl8re32.v v16,0(a1) vl4re16.v v4,0(a5) addia7,a5,192 vs4r.v v4,0(a3) addit0,a5,64 addia3,sp,336 li t2,32 addia2,a5,128 vsetvli a5,zero,e32,m8,ta,ma vrgatherei16.vv v8,v16,v4 vmul.vv v8,v8,v24 vl8re32.v v0,0(a7) vs8r.v v8,0(a3) vmsltu.vx v8,v0,t2 addia3,sp,12 addit2,sp,204 vsm.v v8,0(t2) vl4re16.v v4,0(t0) vl4re16.v v0,0(a2) vs4r.v v4,0(a3) addit0,sp,336 vrgatherei16.vv v8,v24,v4 addia3,sp,208 vrgatherei16.vv v24,v16,v0 vs4r.v v0,0(a3) vmul.vv v8,v8,v24 vlm.v v0,0(t2) vl8re32.v v24,0(t0) addia3,sp,208 vsub.vv v16,v24,v8 addit6,a4,528 vadd.vv v8,v24,v8 addit5,a4,928 vmerge.vvm v8,v8,v16,v0 addit3,a4,128 vs8r.v v8,0(a4) addit4,a4,1056 addit1,a4,656 addia0,a4,256 addia6,a4,1184 addia1,a4,784 addia7,a4,384 addia4,sp,140 vl4re16.v v0,0(a3) vl8re32.v v24,0(t6) vl4re16.v v4,0(a4) vrgatherei16.vv v16,v24,v0 addia3,sp,12 vs8r.v v16,0(t0) vl8re32.v v8,0(t5) vrgatherei16.vv v16,v24,v4 vl4re16.v v4,0(a3) vrgatherei16.vv v24,v8,v4 vmul.vv v16,v16,v8 vl8re32.v v8,0(t0) vmul.vv v8,v8,v24 vsub.vv v24,v16,v8 vlm.v v0,0(t2) addia3,sp,208 vadd.vv v8,v8,v16 vl8re32.v v16,0(t4) vmerge.vvm v8,v8,v24,v0 vrgatherei16.vv v24,v16,v4 vs8r.v v24,0(t0) vl4re16.v v28,0(a3) addia3,sp,464 vs8r.v v8,0(t3) vl8re32.v v8,0(t1) vrgatherei16.vv v0,v8,v28 vs8r.v v0,0(a3) addia3,sp,140 vl4re16.v v24,0(a3) addia3,sp,464 vrgatherei16.vv v0,v8,v24 vl8re32.v v24,0(t0) vmv8r.v v8,v0 vl8re32.v v0,0(a3) vmul.vv v8,v8,v16 vmul.vv v24,v24,v0 vsub.vv v16,v8,v24 vadd.vv v8,v8,v24 
vsetivlizero,4,e32,m8,ta,ma vle32.v v24,0(a6) vsetvli a4,zero,e32,m8,ta,ma addia4,sp,12 vlm.v v0,0(t2) vmerge.vvm v8,v8,v16,v0 vl4re16.v v16,0(a4) vrgatherei16.vv v0,v24,v16 vsetivlizero,4,e32,m8,ta,ma vs8r.v v0,0(a4) addia4,sp,208 vl4re16.v v0,0(a4) vs8r.v v8,0(a0) vle32.v v16,0(a1) vsetvli a5,zero,e32,m8,ta,ma vrgatherei16.vv v8,v16,v0 vs8r.v v8,0(a4) addia4,sp,140 vl4re16.v v4,0(a4) addia5,sp,12 vrgatherei16.vv v8,v16,v4 vl8re32.v v0,0(a5) vsetivlizero,4,e32,m8,ta,ma addia5,sp,208 vmv8r.v v16,v8 vl8re32.v v8,0(a5) vmul.vv v24,v24,v16 vmul.vv v8,v0,v8 vsub.vv v16,v24,v8 vadd.vv v8,v8,v24 vsetvli a5,zero,e8,m2,ta,ma vlm.v v0,0(t2) vsetivlizero,4,e32,m8,ta,ma vmerge.vvm v8,v8,v16,v0 vse32.v v8,0(a7) addisp,sp,592 jr ra This patch makes loop with known NITERS be aware of liveness estimation, after this patch, choosing LMUL = 4: lui a5,%hi(f) addia5,a5,%lo(f) addia3,a5,400 addia4,a5,800 vsetivlizero,8,e32,m2,ta,ma vlseg4e32.v v16,(a3) vlseg4e32.v v8,(a4) vmul.vv v2,v8,v16 addia3,a5,528 vmv.v.v v24,v10 vnmsub.vv v24,v18,v2 addia4,a5,928 vmul.vv v2,v12,v22 vmul.vv v6,v8,v18 vmv.v.v v30,v2 vmacc.vvv30,v14,v20 vmv.v.v v26,v6 vmacc.vvv26,v10,v16 vmul.vv v4,v12,v20 vmv.v.v v28,v14 vnmsub.vv v28,v22,v4 vsseg4e32.v v24,(a5) vlseg4e32.v v16,(a3) vlseg4e32.v v8,(a4)
[PATCH V2] RISC-V: Disallow transformation into VLMAX AVL for cond_len_xxx when length is in range [0, 31]
Notice we have this following situation: vsetivlizero,4,e32,m1,ta,ma vlseg4e32.v v4,(a5) vlseg4e32.v v12,(a3) vsetvli a5,zero,e32,m1,tu,ma ---> This is redundant since VLMAX AVL = 4 when it is fixed-vlmax vfadd.vfv3,v13,fa0 vfadd.vfv1,v12,fa1 vfmul.vvv17,v3,v5 vfmul.vvv16,v1,v5 The rootcause is that we transform COND_LEN_xxx into VLMAX AVL when len == NUNITS blindly. However, we don't need to transform all of them since when len is range of [0,31], we don't need to consume scalar registers. After this patch: vsetivlizero,4,e32,m1,tu,ma addia4,a5,400 vlseg4e32.v v12,(a3) vfadd.vfv3,v13,fa0 vfadd.vfv1,v12,fa1 vlseg4e32.v v4,(a4) vfadd.vfv2,v14,fa1 vfmul.vvv17,v3,v5 vfmul.vvv16,v1,v5 Tested on both RV32 and RV64 no regression. Ok for trunk ? gcc/ChangeLog: * config/riscv/riscv-v.cc (is_vlmax_len_p): New function. (expand_load_store): Disallow transformation into VLMAX when len is in range of [0,31] (expand_cond_len_op): Ditto. (expand_gather_scatter): Ditto. (expand_lanes_load_store): Ditto. (expand_fold_extract_last): Ditto. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/post-ra-avl.c: Adapt test. * gcc.target/riscv/rvv/base/vf_avl-2.c: New test. --- gcc/config/riscv/riscv-v.cc | 21 +-- .../riscv/rvv/autovec/post-ra-avl.c | 2 +- .../gcc.target/riscv/rvv/base/vf_avl-2.c | 21 +++ 3 files changed, 37 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-2.c diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index 038ab084a37..0cc7af58da6 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -68,6 +68,16 @@ imm_avl_p (machine_mode mode) : false; } +/* Return true if LEN is equal to NUNITS that outbounds range of [0, 31]. 
*/ +static bool +is_vlmax_len_p (machine_mode mode, rtx len) +{ + poly_int64 value; + return poly_int_rtx_p (len, &value) +&& known_eq (value, GET_MODE_NUNITS (mode)) +&& !satisfies_constraint_K (len); +} + /* Helper functions for insn_flags && insn_types */ /* Return true if caller need pass mask operand for insn pattern with @@ -3776,7 +3786,7 @@ expand_load_store (rtx *ops, bool is_load) rtx len = ops[3]; machine_mode mode = GET_MODE (ops[0]); - if (poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode))) + if (is_vlmax_len_p (mode, len)) { /* If the length operand is equal to VF, it is VLMAX load/store. */ if (is_load) @@ -3842,8 +3852,7 @@ expand_cond_len_op (unsigned icode, insn_flags op_type, rtx *ops, rtx len) machine_mode mask_mode = GET_MODE (mask); poly_int64 value; bool is_dummy_mask = rtx_equal_p (mask, CONSTM1_RTX (mask_mode)); - bool is_vlmax_len -= poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode)); + bool is_vlmax_len = is_vlmax_len_p (mode, len); unsigned insn_flags = HAS_DEST_P | HAS_MASK_P | HAS_MERGE_P | op_type; if (is_dummy_mask) @@ -4012,7 +4021,7 @@ expand_gather_scatter (rtx *ops, bool is_load) unsigned inner_offsize = GET_MODE_BITSIZE (inner_idx_mode); poly_int64 nunits = GET_MODE_NUNITS (vec_mode); poly_int64 value; - bool is_vlmax = poly_int_rtx_p (len, &value) && known_eq (value, nunits); + bool is_vlmax = is_vlmax_len_p (vec_mode, len); /* Extend the offset element to address width. */ if (inner_offsize < BITS_PER_WORD) @@ -4199,7 +4208,7 @@ expand_lanes_load_store (rtx *ops, bool is_load) rtx reg = is_load ? ops[0] : ops[1]; machine_mode mode = GET_MODE (ops[0]); - if (poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode))) + if (is_vlmax_len_p (mode, len)) { /* If the length operand is equal to VF, it is VLMAX load/store. 
*/ if (is_load) @@ -4252,7 +4261,7 @@ expand_fold_extract_last (rtx *ops) rtx slide_vect = gen_reg_rtx (mode); insn_code icode; - if (poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode))) + if (is_vlmax_len_p (mode, len)) len = NULL_RTX; /* Calculate the number of 1-bit in mask. */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/post-ra-avl.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/post-ra-avl.c index f3d12bac7cd..bff6dcb1c38 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/post-ra-avl.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/post-ra-avl.c @@ -13,4 +13,4 @@ int foo() { return a; } -/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero} 1 } } */ +/* { dg-final { scan-assembler-not {vsetvli\s+[a-x0-9]+,\s*zero} } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/vf_
[PATCH] RISC-V: Disallow transformation into VLMAX AVL for cond_len_xxx when length is in range [0, 31]
Notice we have this following situation: vsetivlizero,4,e32,m1,ta,ma vlseg4e32.v v4,(a5) vlseg4e32.v v12,(a3) vsetvli a5,zero,e32,m1,tu,ma ---> This is redundant since VLMAX AVL = 4 when it is fixed-vlmax vfadd.vfv3,v13,fa0 vfadd.vfv1,v12,fa1 vfmul.vvv17,v3,v5 vfmul.vvv16,v1,v5 The rootcause is that we transform COND_LEN_xxx into VLMAX AVL when len == NUNITS blindly. However, we don't need to transform all of them since when len is range of [0,31], we don't need to consume scalar registers. After this patch: vsetivlizero,4,e32,m1,tu,ma addia4,a5,400 vlseg4e32.v v12,(a3) vfadd.vfv3,v13,fa0 vfadd.vfv1,v12,fa1 vlseg4e32.v v4,(a4) vfadd.vfv2,v14,fa1 vfmul.vvv17,v3,v5 vfmul.vvv16,v1,v5 Tested on both RV32 and RV64 no regression. Ok for trunk ? gcc/ChangeLog: * config/riscv/riscv-v.cc (is_vlmax_len_p): New function. (expand_load_store): Disallow transformation into VLMAX when len is in range of [0,31] (expand_cond_len_op): Ditto. (expand_gather_scatter): Ditto. (expand_lanes_load_store): Ditto. (expand_fold_extract_last): Ditto. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/post-ra-avl.c: Adapt test. * gcc.target/riscv/rvv/base/vf_avl-2.c: New test. --- gcc/config/riscv/riscv-v.cc | 21 +-- .../riscv/rvv/autovec/post-ra-avl.c | 2 +- .../gcc.target/riscv/rvv/base/vf_avl-2.c | 21 +++ 3 files changed, 37 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-2.c diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index 038ab084a37..0cc7af58da6 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -68,6 +68,16 @@ imm_avl_p (machine_mode mode) : false; } +/* Return true if LEN is equal to NUNITS that outbounds range of [0, 31]. 
*/ +static bool +is_vlmax_len_p (machine_mode mode, rtx len) +{ + poly_int64 value; + return poly_int_rtx_p (len, &value) +&& known_eq (value, GET_MODE_NUNITS (mode)) +&& !satisfies_constraint_K (len); +} + /* Helper functions for insn_flags && insn_types */ /* Return true if caller need pass mask operand for insn pattern with @@ -3776,7 +3786,7 @@ expand_load_store (rtx *ops, bool is_load) rtx len = ops[3]; machine_mode mode = GET_MODE (ops[0]); - if (poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode))) + if (is_vlmax_len_p (mode, len)) { /* If the length operand is equal to VF, it is VLMAX load/store. */ if (is_load) @@ -3842,8 +3852,7 @@ expand_cond_len_op (unsigned icode, insn_flags op_type, rtx *ops, rtx len) machine_mode mask_mode = GET_MODE (mask); poly_int64 value; bool is_dummy_mask = rtx_equal_p (mask, CONSTM1_RTX (mask_mode)); - bool is_vlmax_len -= poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode)); + bool is_vlmax_len = is_vlmax_len_p (mode, len); unsigned insn_flags = HAS_DEST_P | HAS_MASK_P | HAS_MERGE_P | op_type; if (is_dummy_mask) @@ -4012,7 +4021,7 @@ expand_gather_scatter (rtx *ops, bool is_load) unsigned inner_offsize = GET_MODE_BITSIZE (inner_idx_mode); poly_int64 nunits = GET_MODE_NUNITS (vec_mode); poly_int64 value; - bool is_vlmax = poly_int_rtx_p (len, &value) && known_eq (value, nunits); + bool is_vlmax = is_vlmax_len_p (vec_mode, len); /* Extend the offset element to address width. */ if (inner_offsize < BITS_PER_WORD) @@ -4199,7 +4208,7 @@ expand_lanes_load_store (rtx *ops, bool is_load) rtx reg = is_load ? ops[0] : ops[1]; machine_mode mode = GET_MODE (ops[0]); - if (poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode))) + if (is_vlmax_len_p (mode, len)) { /* If the length operand is equal to VF, it is VLMAX load/store. 
*/ if (is_load) @@ -4252,7 +4261,7 @@ expand_fold_extract_last (rtx *ops) rtx slide_vect = gen_reg_rtx (mode); insn_code icode; - if (poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode))) + if (is_vlmax_len_p (mode, len)) len = NULL_RTX; /* Calculate the number of 1-bit in mask. */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/post-ra-avl.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/post-ra-avl.c index f3d12bac7cd..c77b2d187fe 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/post-ra-avl.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/post-ra-avl.c @@ -13,4 +13,4 @@ int foo() { return a; } -/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero} 1 } } */ +/* { dg-final { scan-assembler-not {vsetvli} } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-2.c b/gcc/tests
[Committed] RISC-V: Fix typo
gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-10.c: Fix typo. --- .../gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-10.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-10.c index f3c2315c2c5..e47af25aa9b 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-10.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-10.c @@ -19,5 +19,5 @@ bar (int *x, int a, int b, int n) /* { dg-final { scan-assembler {e32,m4} } } */ /* { dg-final { scan-assembler-not {jr} } } */ -/* { dg-final { scan-assembler-times {ret} 2 } } * +/* { dg-final { scan-assembler-times {ret} 2 } } */ /* { dg-final { scan-tree-dump-times "Preferring smaller LMUL loop because it has unexpected spills" 1 "vect" } } */ -- 2.36.3
[Committed] RISC-V: Some minor tweaks on dynamic LMUL cost model
Tweak some codes of dynamic LMUL cost model to make computation more predictable and accurate. Tested on both RV32 and RV64 no regression. Committed. PR target/113112 gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (compute_estimated_lmul): Tweak LMUL estimation. (has_unexpected_spills_p): Ditto. (costs::record_potential_unexpected_spills): Ditto. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-1.c: Add more checks. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-2.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-3.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-4.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-5.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-6.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-7.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-1.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-2.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-3.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-4.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-5.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-1.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-2.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-3.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-5.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-6.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-7.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-8.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-1.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-10.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-11.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-2.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-3.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-4.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-5.c: Ditto. 
* gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-6.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-7.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-8.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-9.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-12.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113112-2.c: New test. --- gcc/config/riscv/riscv-vector-costs.cc| 42 +-- .../costmodel/riscv/rvv/dynamic-lmul1-1.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul1-2.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul1-3.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul1-4.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul1-5.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul1-6.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul1-7.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul2-1.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul2-2.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul2-3.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul2-4.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul2-5.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul4-1.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul4-2.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul4-3.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul4-5.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul4-6.c | 5 ++- .../costmodel/riscv/rvv/dynamic-lmul4-7.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul4-8.c | 5 ++- .../costmodel/riscv/rvv/dynamic-lmul8-1.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul8-10.c| 3 ++ .../costmodel/riscv/rvv/dynamic-lmul8-11.c| 3 ++ .../costmodel/riscv/rvv/dynamic-lmul8-12.c| 25 +++ .../costmodel/riscv/rvv/dynamic-lmul8-2.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul8-3.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul8-4.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul8-5.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul8-6.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul8-7.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul8-8.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul8-9.c | 3 ++ .../vect/costmodel/riscv/rvv/pr113112-2.c | 20 + 33 files 
changed, 166 insertions(+), 15 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-12.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-2.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 7b837b08f9e..74b8e86a5e1 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -394,21 +394,32 @@ compute_estimated_lmul (loop_vec_info loop_vinfo, machine_mode mode)
[PATCH] RISC-V: Move RVV V_REGS liveness computation into analyze_loop_vinfo
Currently, we compute RVV V_REGS liveness during better_main_loop_than_p which is not appropriate time to do that since we for example, when have the codes will finally pick LMUL = 8 vectorization factor, we compute liveness for LMUL = 8 multiple times which are redundant. Since we have leverage the current ARM SVE COST model: /* Do one-time initialization based on the vinfo. */ loop_vec_info loop_vinfo = dyn_cast (m_vinfo); if (!m_analyzed_vinfo) { if (loop_vinfo) analyze_loop_vinfo (loop_vinfo); m_analyzed_vinfo = true; } Analyze COST model only once for each cost model. So here we move dynamic LMUL liveness information into analyze_loop_vinfo. /* Do one-time initialization of the costs given that we're costing the loop vectorization described by LOOP_VINFO. */ void costs::analyze_loop_vinfo (loop_vec_info loop_vinfo) { ... /* Detect whether the LOOP has unexpected spills. */ record_potential_unexpected_spills (loop_vinfo); } So that we can avoid redundant computations and the current dynamic LMUL cost model flow is much more reasonable and consistent with others. Tested on RV32 and RV64 no regressions. gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (compute_estimated_lmul): Allow fractional vecrtor. (preferred_new_lmul_p): Move RVV V_REGS liveness computation into analyze_loop_vinfo. (has_unexpected_spills_p): New function. (costs::record_potential_unexpected_spills): Ditto. (costs::better_main_loop_than_p): Move RVV V_REGS liveness computation into analyze_loop_vinfo. * config/riscv/riscv-vector-costs.h: New functions and variables. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul-mixed-1.c: Robostify test. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-1.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-2.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-3.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-4.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-5.c: Ditto. 
* gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-6.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-7.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-1.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-2.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-3.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-4.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-5.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-6.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-1.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-10.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-2.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-3.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-5.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-6.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-7.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-8.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-1.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-10.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-11.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-2.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-3.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-4.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-5.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-6.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-7.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-8.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-9.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/no-dynamic-lmul-1.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/pr111848.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c: Ditto. 
--- gcc/config/riscv/riscv-vector-costs.cc| 110 +++--- gcc/config/riscv/riscv-vector-costs.h | 8 ++ .../riscv/rvv/dynamic-lmul-mixed-1.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul1-1.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul1-2.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul1-3.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul1-4.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul1-5.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul1-6.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul1-7.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul2-1.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul2-2.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul2-3.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul2-4.c | 5 +- .../costmodel/riscv/rvv/dynamic
[Committed] RISC-V: Add one more ASM check in PR113112-1.c
gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c: Add one more ASM check. --- gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c | 1 + 1 file changed, 1 insertion(+) diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c index a44a1c041af..31b41ba707e 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c @@ -20,6 +20,7 @@ foo (int n){ return 0; } +/* { dg-final { scan-assembler {e32,m4} } } */ /* { dg-final { scan-assembler-not {jr} } } */ /* { dg-final { scan-assembler-times {ret} 1 } } */ /* { dg-final { scan-tree-dump "Maximum lmul = 8" "vect" } } */ -- 2.36.3
[Committed] RISC-V: Make PHI initial value occupy live V_REG in dynamic LMUL cost model analysis
Consider this following case: foo: ble a0,zero,.L11 lui a2,%hi(.LANCHOR0) addisp,sp,-128 addia2,a2,%lo(.LANCHOR0) mv a1,a0 vsetvli a6,zero,e32,m8,ta,ma vid.v v8 vs8r.v v8,0(sp) ---> spill .L3: vl8re32.v v16,0(sp)---> reload vsetvli a4,a1,e8,m2,ta,ma li a3,0 vsetvli a5,zero,e32,m8,ta,ma vmv8r.v v0,v16 vmv.v.x v8,a4 vmv.v.i v24,0 vadd.vv v8,v16,v8 vmv8r.v v16,v24 vs8r.v v8,0(sp)---> spill .L4: addiw a3,a3,1 vadd.vv v8,v0,v16 vadd.vi v16,v16,1 vadd.vv v24,v24,v8 bne a0,a3,.L4 vsetvli zero,a4,e32,m8,ta,ma sub a1,a1,a4 vse32.v v24,0(a2) sllia4,a4,2 add a2,a2,a4 bne a1,zero,.L3 li a0,0 addisp,sp,128 jr ra .L11: li a0,0 ret Pick unexpected LMUL = 8. The root cause is we didn't involve PHI initial value in the dynamic LMUL calculation: # j_17 = PHI---> # vect_vec_iv_.8_24 = PHI <_25(9), { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }(5)> We didn't count { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } in consuming vector register but it does allocate an vector register group for it. This patch fixes this missing count. Then after this patch we pick up perfect LMUL (LMUL = M4) foo: ble a0,zero,.L9 lui a4,%hi(.LANCHOR0) addia4,a4,%lo(.LANCHOR0) mv a2,a0 vsetivlizero,16,e32,m4,ta,ma vid.v v20 .L3: vsetvli a3,a2,e8,m1,ta,ma li a5,0 vsetivlizero,16,e32,m4,ta,ma vmv4r.v v16,v20 vmv.v.i v12,0 vmv.v.x v4,a3 vmv4r.v v8,v12 vadd.vv v20,v20,v4 .L4: addiw a5,a5,1 vmv4r.v v4,v8 vadd.vi v8,v8,1 vadd.vv v4,v16,v4 vadd.vv v12,v12,v4 bne a0,a5,.L4 sllia5,a3,2 vsetvli zero,a3,e32,m4,ta,ma sub a2,a2,a3 vse32.v v12,0(a4) add a4,a4,a5 bne a2,zero,.L3 .L9: li a0,0 ret Tested on --with-arch=gcv no regression. PR target/113112 gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (max_number_of_live_regs): Refine dump information. (preferred_new_lmul_p): Make PHI initial value into live regs calculation. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c: New test. 
--- gcc/config/riscv/riscv-vector-costs.cc| 45 --- .../vect/costmodel/riscv/rvv/pr113112-1.c | 31 + 2 files changed, 71 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index a316603e207..946eb4a9fc6 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -355,10 +355,11 @@ max_number_of_live_regs (const basic_block bb, } if (dump_enabled_p ()) -dump_printf_loc (MSG_NOTE, vect_location, -"Maximum lmul = %d, %d number of live V_REG at program " -"point %d for bb %d\n", -lmul, max_nregs, live_point, bb->index); +dump_printf_loc ( + MSG_NOTE, vect_location, + "Maximum lmul = %d, At most %d number of live V_REG at program " + "point %d for bb %d\n", + lmul, max_nregs, live_point, bb->index); return max_nregs; } @@ -472,6 +473,41 @@ update_local_live_ranges ( tree def = gimple_phi_arg_def (phi, j); auto *live_ranges = live_ranges_per_bb.get (bb); auto *live_range = live_ranges->get (def); + if (poly_int_tree_p (def)) + { + /* Insert live range of INTEGER_CST or POLY_CST since we will +need to allocate a vector register for it. + +E.g. # j_17 = PHI will be transformed +into # vect_vec_iv_.8_24 = PHI <_25(9), { 0, ... }(5)> + +The live range for such value is short which only lives +from program point 0 to 1. */ + if (live_range) + { + unsigned int start = (*live_range).first; + (*live_range).first = 0; + if (dump_enabled_p ()) + dump_printf_loc ( + MSG_NOTE, vect_location, + "Update %T start point from %d to 0:\n", def, start); + } + else + { + live_range
[PATCH] RISC-V: Make PHI initial value occupy live V_REG in dynamic LMUL cost model analysis
Consider this following case: foo: ble a0,zero,.L11 lui a2,%hi(.LANCHOR0) addisp,sp,-128 addia2,a2,%lo(.LANCHOR0) mv a1,a0 vsetvli a6,zero,e32,m8,ta,ma vid.v v8 vs8r.v v8,0(sp) ---> spill .L3: vl8re32.v v16,0(sp)---> reload vsetvli a4,a1,e8,m2,ta,ma li a3,0 vsetvli a5,zero,e32,m8,ta,ma vmv8r.v v0,v16 vmv.v.x v8,a4 vmv.v.i v24,0 vadd.vv v8,v16,v8 vmv8r.v v16,v24 vs8r.v v8,0(sp)---> spill .L4: addiw a3,a3,1 vadd.vv v8,v0,v16 vadd.vi v16,v16,1 vadd.vv v24,v24,v8 bne a0,a3,.L4 vsetvli zero,a4,e32,m8,ta,ma sub a1,a1,a4 vse32.v v24,0(a2) sllia4,a4,2 add a2,a2,a4 bne a1,zero,.L3 li a0,0 addisp,sp,128 jr ra .L11: li a0,0 ret Pick unexpected LMUL = 8. The root cause is we didn't involve PHI initial value in the dynamic LMUL calculation: # j_17 = PHI---> # vect_vec_iv_.8_24 = PHI <_25(9), { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }(5)> We didn't count { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } in consuming vector register but it does allocate an vector register group for it. This patch fixes this missing count. Then after this patch we pick up perfect LMUL (LMUL = M4) foo: ble a0,zero,.L9 lui a4,%hi(.LANCHOR0) addia4,a4,%lo(.LANCHOR0) mv a2,a0 vsetivlizero,16,e32,m4,ta,ma vid.v v20 .L3: vsetvli a3,a2,e8,m1,ta,ma li a5,0 vsetivlizero,16,e32,m4,ta,ma vmv4r.v v16,v20 vmv.v.i v12,0 vmv.v.x v4,a3 vmv4r.v v8,v12 vadd.vv v20,v20,v4 .L4: addiw a5,a5,1 vmv4r.v v4,v8 vadd.vi v8,v8,1 vadd.vv v4,v16,v4 vadd.vv v12,v12,v4 bne a0,a5,.L4 sllia5,a3,2 vsetvli zero,a3,e32,m4,ta,ma sub a2,a2,a3 vse32.v v12,0(a4) add a4,a4,a5 bne a2,zero,.L3 .L9: li a0,0 ret Tested on --with-arch=gcv no regression. Ok for trunk ? PR target/113112 gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (max_number_of_live_regs): Refine dump information. (preferred_new_lmul_p): Make PHI initial value into live regs calculation. 
gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c: New test. --- gcc/config/riscv/riscv-vector-costs.cc| 45 --- .../vect/costmodel/riscv/rvv/pr113112-1.c | 31 + 2 files changed, 71 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index a316603e207..2d4b82a643a 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -355,10 +355,11 @@ max_number_of_live_regs (const basic_block bb, } if (dump_enabled_p ()) -dump_printf_loc (MSG_NOTE, vect_location, -"Maximum lmul = %d, %d number of live V_REG at program " -"point %d for bb %d\n", -lmul, max_nregs, live_point, bb->index); +dump_printf_loc ( + MSG_NOTE, vect_location, + "Maximum lmul = %d, At most %d number of live V_REG at program " + "point %d for bb %d\n", + lmul, max_nregs, live_point, bb->index); return max_nregs; } @@ -472,6 +473,41 @@ update_local_live_ranges ( tree def = gimple_phi_arg_def (phi, j); auto *live_ranges = live_ranges_per_bb.get (bb); auto *live_range = live_ranges->get (def); + if (poly_int_tree_p (def)) + { + /* Insert live range of INTEGER_CST since we will need to +allocate a vector register for it. + +E.g. # j_17 = PHI will be transformed +into # vect_vec_iv_.8_24 = PHI <_25(9), { 0, ... }(5)> + +The live range for such value is short which only lives +at program point 0. */ + if (live_range) + { + unsigned int start = (*live_range).first; + (*live_range).first = 0; + if (dump_enabled_p ()) + dump_printf_loc ( + MSG_NOTE, vect_location, + "Update %T start point from %d to 0:\n", def, start); + } + else + { + live_ranges->p
[Committed] RISC-V: Add dynamic LMUL test for x264
When working on evaluating x264 performance, I notice the best LMUL for such case with -march=rv64gcv is LMUL = 2 LMUL = 1: x264_pixel_8x8: add a4,a1,a2 addia6,a0,16 vsetivlizero,4,e8,mf4,ta,ma add a5,a4,a2 vle8.v v12,0(a6) vle8.v v2,0(a4) addia6,a0,4 addia4,a4,4 vle8.v v11,0(a6) vle8.v v9,0(a4) addia6,a1,4 addia4,a0,32 vle8.v v13,0(a0) vle8.v v1,0(a1) vle8.v v4,0(a6) vle8.v v8,0(a4) vle8.v v7,0(a5) vwsubu.vv v3,v13,v1 add a3,a5,a2 addia6,a0,20 addia4,a0,36 vle8.v v10,0(a6) vle8.v v6,0(a4) addia5,a5,4 vle8.v v5,0(a5) vsetvli zero,zero,e16,mf2,ta,mu vmslt.viv0,v3,0 vneg.v v3,v3,v0.t vsetvli zero,zero,e8,mf4,ta,ma vwsubu.vv v1,v12,v2 vsetvli zero,zero,e16,mf2,ta,mu vmslt.viv0,v1,0 vneg.v v1,v1,v0.t vmv1r.v v2,v1 vwadd.vvv1,v3,v2 vsetvli zero,zero,e8,mf4,ta,ma vwsubu.vv v2,v11,v4 vsetvli zero,zero,e16,mf2,ta,mu vmslt.viv0,v2,0 vneg.v v2,v2,v0.t vsetvli zero,zero,e8,mf4,ta,ma vwsubu.vv v3,v10,v9 vsetvli zero,zero,e16,mf2,ta,mu vmv1r.v v4,v2 vmslt.viv0,v3,0 vneg.v v3,v3,v0.t vwadd.vvv2,v4,v3 vsetvli zero,zero,e8,mf4,ta,ma vwsubu.vv v3,v8,v7 vsetvli zero,zero,e16,mf2,ta,mu add a4,a3,a2 vmslt.viv0,v3,0 vneg.v v3,v3,v0.t vwadd.wvv1,v1,v3 vsetvli zero,zero,e8,mf4,ta,ma add a5,a4,a2 vwsubu.vv v3,v6,v5 addia6,a0,48 vsetvli zero,zero,e16,mf2,ta,mu vle8.v v16,0(a3) vle8.v v12,0(a4) addia3,a3,4 addia4,a4,4 vle8.v v17,0(a6) vle8.v v14,0(a3) vle8.v v10,0(a4) vle8.v v8,0(a5) add a6,a5,a2 addia3,a0,64 addia4,a0,80 addia5,a5,4 vle8.v v13,0(a3) vle8.v v4,0(a5) vle8.v v9,0(a4) vle8.v v6,0(a6) vmslt.viv0,v3,0 addia7,a0,52 vneg.v v3,v3,v0.t vle8.v v15,0(a7) vwadd.wvv2,v2,v3 addia3,a0,68 addia4,a0,84 vle8.v v11,0(a3) vle8.v v5,0(a4) addia5,a0,96 vle8.v v7,0(a5) vsetvli zero,zero,e8,mf4,ta,ma vwsubu.vv v3,v17,v16 vsetvli zero,zero,e16,mf2,ta,mu vmslt.viv0,v3,0 vneg.v v3,v3,v0.t vwadd.wvv1,v1,v3 vsetvli zero,zero,e8,mf4,ta,ma vwsubu.vv v3,v15,v14 vsetvli zero,zero,e16,mf2,ta,mu vmslt.viv0,v3,0 vneg.v v3,v3,v0.t vwadd.wvv2,v2,v3 vsetvli zero,zero,e8,mf4,ta,ma vwsubu.vv v3,v13,v12 vsetvli 
zero,zero,e16,mf2,ta,mu sllia4,a2,3 vmslt.viv0,v3,0 vneg.v v3,v3,v0.t vwadd.wvv1,v1,v3 vsetvli zero,zero,e8,mf4,ta,ma sub a4,a4,a2 vwsubu.vv v3,v11,v10 vsetvli zero,zero,e16,mf2,ta,mu add a1,a1,a4 vmslt.viv0,v3,0 vneg.v v3,v3,v0.t vwadd.wvv2,v2,v3 vsetvli zero,zero,e8,mf4,ta,ma lbu a7,0(a1) vwsubu.vv v3,v9,v8 lbu a5,112(a0) vsetvli zero,zero,e16,mf2,ta,mu subwa5,a5,a7 vmslt.viv0,v3,0 lbu a3,113(a0) vneg.v v3,v3,v0.t lbu a4,1(a1) vwadd.wvv1,v1,v3 addia6,a6,4 vsetvli zero,zero,e8,mf4,ta,ma subwa3,a3,a4 vwsubu.vv v3,v5,v4 addia2,a0,100 vsetvli zero,zero,e16,mf2,ta,mu vle8.v v4,0(a6) sraiw a6,a5,31 vle8.v v5,0(a2) sraiw a7,a3,31 vmslt.viv0,v3,0 xor a2,a5,a6 vneg.v v3,v3,v0.t vwadd.wvv2,v2,v3 vsetvli zero,zero,e8,mf4,ta,ma lbu a4,114(a0) vwsubu.vv v3,v7,v6 lbu t1,2(a1) vsetvli zero,zero,e16,mf2,ta,mu subwa2,a2,a6 xor a6,a3,a7 vmslt.viv0,v3,0 subwa4,a4,t1 vneg.v v3,v3,v0.t lbu t1,3(a1) vwadd.wvv1,v1,v3 lbu a5,115(a0) subwa6,a6,a7 vsetvli zero,zero,e8,mf4,ta,ma li a7,0 vwsubu.vv v3,v5,v4 sraiw t3,a4,31 vsetvli zero,zero,e16,mf2,ta,mu subwa5,a5,t1 vmslt.viv0,v3,0 vneg.v v3,v3,v0.t vwadd.wvv2,v2,v3 sraiw t1,a5,31 vsetvli zero,zero,e32,m1,ta,ma
[Committed] RISC-V: Fix ICE when moving a SUBREG of vector mode to a DImode scalar register on RV32 systems.
This patch fixes following ICE on full coverage testing of RV32. Running target riscv-sim/-march=rv32gc_zve32f/-mabi=ilp32d/-mcmodel=medlow/--param=riscv-autovec-lmul=dynamic FAIL: gcc.c-torture/compile/930120-1.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/930120-1.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/930120-1.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/pr42196-1.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/pr42196-1.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -O1 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -Os (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-3.c -O1 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-3.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-3.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-3.c -Os (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/bswap-1.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/bswap-1.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) Running target 
riscv-sim/-march=rv32gc_zve32f/-mabi=ilp32d/-mcmodel=medlow/--param=riscv-autovec-lmul=dynamic/--param=riscv-autovec-preference=fixed-vlmax FAIL: gcc.c-torture/compile/930120-1.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/930120-1.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/930120-1.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/pr42196-1.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/pr42196-1.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -O1 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -Os (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-3.c -O1 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-3.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-3.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-3.c -Os (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/bswap-1.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/bswap-1.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) Running target riscv-sim/-march=rv32gc_zve32f/-mabi=ilp32d/-mcmodel=medlow/--param=riscv-autovec-lmul=m2 FAIL: gcc.c-torture/compile/930120-1.c -O2 (internal compiler error: in 
emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/930120-1.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/930120-1.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/pr42196-1.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/pr42196-1.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -O1 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -Os (internal compiler error: in emit_move_insn, at expr.cc:
[PATCH] RISC-V: Optimize SELECT_VL codegen when the length is known to be smaller than VF
While trying to fix bugs of PR113097, notice this following situation we generate redundant vsetvli _255 = SELECT_VL (3, POLY_INT_CST [4, 4]); COND_LEN (..., _255) Before this patch: vsetivli a5, 3... ... vadd.vv (use a5) After this patch: ... vadd.vv (use AVL = 3) The reason we can do this is because known_ge (3, [4,4]) is true. It's safe to apply such optimization Tested on both RV32 and RV64 full coverage testing, no regression. PR target/113087 gcc/ChangeLog: * config/riscv/riscv-v.cc (expand_select_vl): Optimize SELECT_VL. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/pr113087-2.c: New test. --- gcc/config/riscv/riscv-v.cc | 10 +++ .../gcc.target/riscv/rvv/autovec/pr113087-2.c | 61 +++ 2 files changed, 71 insertions(+) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113087-2.c diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index 486f5deb296..fc9825f168a 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -3671,6 +3671,16 @@ void expand_select_vl (rtx *ops) { poly_int64 nunits = rtx_to_poly_int64 (ops[2]); + if (CONST_INT_P (ops[1]) && known_le (INTVAL (ops[1]), nunits)) +{ + /* If length is known <= VF, we just use the length directly instead +of using vsetvli. + +E.g. _255 = .SELECT_VL (3, POLY_INT_CST [4, 4]); +We move 3 into _255 intead of using explicit vsetvl. */ + emit_move_insn (ops[0], ops[1]); + return; +} /* We arbitrary picked QImode as inner scalar mode to get vector mode. since vsetvl only demand ratio. We let VSETVL PASS to optimize it. 
*/ scalar_int_mode mode = QImode; diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113087-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113087-2.c new file mode 100644 index 000..836260fe911 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113087-2.c @@ -0,0 +1,61 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */ + +#include +int (e) (int g, int h) { return h > 0x10 || g > 0x >> h ? g : g << h; } +struct i +{ + int j; + int l : 1; +}; +struct m +{ + char k; + int n; +}; +char o; +char p; +short s; +int q; +struct m r; +int v; +int t; +short z; +long ac; +int ad; +int ae; + +static void +ai (struct i bf) +{ + for (; v; v++) +r.k = 0; + do +ac ^= bf.j; + while (bf.j < 0); + s = 0; + if (bf.l) +q |= 0x800; +} + +int +main () +{ + struct i aw = {0xE00, 1}; + o = 4; + s = p; + ai (aw); + t = 1; + ++p; + for (; t <= 7; t++) +{ + ad &= 1; + (o &= 1 - e (0x4012, ++ae)) & (z |= 1); +} + for (; r.n;) +; + assert (o == 4); + return 0; +} + +/* { dg-final { scan-assembler-not {vsetivli\s+[a-x0-9]+,\s*3} } } */ -- 2.36.3
[PATCH] RISC-V: Fix bug of VSETVL fusion
This patch fixes bugs in the fusion of this following case: li a5,-1 vmv.s.x v0,a5 -> demand any non-zero AVL vsetvli a5, ... Incorrect fusion after VSETVL PASS: li a5,-1 vsetvli a5... vmv.s.x v0, a5 --> a5 is modified as incorrect value. We disallow this incorrect fusion above. Full coverage testing of RV64 and RV32 no regression. PR target/113087 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc: Disallow fusion when VL modification pollutes non AVL use. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/pr113087-1.c: New test. --- gcc/config/riscv/riscv-vsetvl.cc | 41 - .../gcc.target/riscv/rvv/autovec/pr113087-1.c | 60 +++ 2 files changed, 99 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113087-1.c diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 6af8d8429ab..eabaef80f89 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -1128,6 +1128,27 @@ public: return gen_vsetvl_discard_result (Pmode, avl, sew, vlmul, ta, ma); } + /* Return true that the non-AVL operands of THIS will be modified + if we fuse the VL modification from OTHER into THIS. */ + bool vl_modify_non_avl_op_p (const vsetvl_info &other) const + { +/* We don't need to worry about any operands from THIS be + modified by OTHER vsetvl since we OTHER vsetvl doesn't + modify any operand. */ +if (!other.has_vl ()) + return false; + +/* THIS VL operand always preempt OTHER VL operand. */ +if (this->has_vl ()) + return false; + +/* If THIS has non IMM AVL and THIS is AVL compatible with + OTHER, the AVL value of THIS is same as VL value of OTHER. 
*/ +if (!this->has_imm_avl ()) + return false; +return find_access (this->get_insn ()->uses (), REGNO (other.get_vl ())); + } + bool operator== (const vsetvl_info &other) const { gcc_assert (!uninit_p () && !other.uninit_p () @@ -1896,6 +1917,20 @@ public: gcc_unreachable (); } + bool vl_not_in_conflict_p (const vsetvl_info &prev, const vsetvl_info &next) + { +/* We don't fuse this following case: + + li a5, -1 + vmv.s.x v0, a5 -- PREV + vsetvli a5, ...-- NEXT + + Don't fuse NEXT into PREV. +*/ +return !prev.vl_modify_non_avl_op_p (next) + && !next.vl_modify_non_avl_op_p (prev); + } + bool avl_compatible_p (const vsetvl_info &prev, const vsetvl_info &next) { gcc_assert (prev.valid_p () && next.valid_p ()); @@ -1953,7 +1988,8 @@ public: { bool compatible_p = sew_lmul_compatible_p (prev, next) && policy_compatible_p (prev, next) - && avl_compatible_p (prev, next); + && avl_compatible_p (prev, next) + && vl_not_in_conflict_p (prev, next); return compatible_p; } @@ -1961,7 +1997,8 @@ public: { bool available_p = sew_lmul_available_p (prev, next) && policy_available_p (prev, next) - && avl_available_p (prev, next); + && avl_available_p (prev, next) + && vl_not_in_conflict_p (prev, next); gcc_assert (!available_p || compatible_p (prev, next)); return available_p; } diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113087-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113087-1.c new file mode 100644 index 000..7b743effc79 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113087-1.c @@ -0,0 +1,60 @@ +/* { dg-do run } */ +/* { dg-options "-O3" } */ +/* { dg-require-effective-target riscv_v } */ + +#include +int (e) (int g, int h) { return h > 0x10 || g > 0x >> h ? 
g : g << h; } +struct i +{ + int j; + int l : 1; +}; +struct m +{ + char k; + int n; +}; +char o; +char p; +short s; +int q; +struct m r; +int v; +int t; +short z; +long ac; +int ad; +int ae; + +static void +ai (struct i bf) +{ + for (; v; v++) +r.k = 0; + do +ac ^= bf.j; + while (bf.j < 0); + s = 0; + if (bf.l) +q |= 0x800; +} + +int +main () +{ + struct i aw = {0xE00, 1}; + o = 4; + s = p; + ai (aw); + t = 1; + ++p; + for (; t <= 7; t++) +{ + ad &= 1; + (o &= 1 - e (0x4012, ++ae)) & (z |= 1); +} + for (; r.n;) +; + assert (o == 4); + return 0; +} -- 2.36.3
[PATCH] RISC-V: Fix FAIL of bb-slp-cond-1.c for RVV
Due to recent VLSmode changes (the change fixing an ICE and a run-time FAIL), the dump check is now the same as for ARM SVE, so adapt the test for RISC-V. gcc/testsuite/ChangeLog: * gcc.dg/vect/bb-slp-cond-1.c: Adapt for RISC-V. --- gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c b/gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c index 4089eb51b2e..8faf6b6e3ac 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c @@ -47,6 +47,6 @@ int main () } /* { dg-final { scan-tree-dump {(no need for alias check [^\n]* when VF is 1|no alias between [^\n]* when [^\n]* is outside \(-16, 16\))} "vect" { target vect_element_align } } } */ -/* { dg-final { scan-tree-dump-times "loop vectorized" 1 "vect" { target { vect_element_align && { ! { amdgcn-*-* riscv*-*-* } } } } } } */ -/* { dg-final { scan-tree-dump-times "loop vectorized" 2 "vect" { target { amdgcn-*-* riscv*-*-* } } } } */ +/* { dg-final { scan-tree-dump-times "loop vectorized" 1 "vect" { target { vect_element_align && { ! { amdgcn-*-* } } } } } } */ +/* { dg-final { scan-tree-dump-times "loop vectorized" 2 "vect" { target { amdgcn-*-* } } } } */ -- 2.36.3
[PATCH] Regression FIX: Remove vect_variable_length XFAIL from some tests
Hi, this patch fixes these following regression FAILs on RVV: XPASS: gcc.dg/tree-ssa/pr84512.c scan-tree-dump optimized "return 285;" XPASS: gcc.dg/vect/bb-slp-43.c -flto -ffat-lto-objects scan-tree-dump-not slp2 "vector operands from scalars" XPASS: gcc.dg/vect/bb-slp-43.c scan-tree-dump-not slp2 "vector operands from scalars" XPASS: gcc.dg/vect/bb-slp-subgroups-3.c -flto -ffat-lto-objects scan-tree-dump-times slp2 "optimized: basic block" 2 XPASS: gcc.dg/vect/bb-slp-subgroups-3.c scan-tree-dump-times slp2 "optimized: basic block" 2 Since vect_variable_length are available for ARM SVE and RVV, I just use compiler explorer to confirm ARM SVE same as RVV. Hi, @Tamar. Could you double check whether this patch fix is reasonable to you ? And. Hi, @Richard. Is this patch Ok for trunk if this patch fixes regression for both RVV and ARM SVE. gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/pr84512.c: Remove vect_variable_length XFAIL. * gcc.dg/vect/bb-slp-43.c: Ditto. * gcc.dg/vect/bb-slp-subgroups-3.c: Ditto. --- gcc/testsuite/gcc.dg/tree-ssa/pr84512.c| 2 +- gcc/testsuite/gcc.dg/vect/bb-slp-43.c | 2 +- gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr84512.c b/gcc/testsuite/gcc.dg/tree-ssa/pr84512.c index 496c78b28dc..3c027012670 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/pr84512.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr84512.c @@ -13,4 +13,4 @@ int foo() } /* Listed targets xfailed due to PR84958. 
*/ -/* { dg-final { scan-tree-dump "return 285;" "optimized" { xfail { amdgcn*-*-* || vect_variable_length } } } } */ +/* { dg-final { scan-tree-dump "return 285;" "optimized" { xfail { amdgcn*-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-43.c b/gcc/testsuite/gcc.dg/vect/bb-slp-43.c index dad2d24262d..40bd2e0dfbf 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-43.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-43.c @@ -14,4 +14,4 @@ f (int *restrict x, short *restrict y) } /* { dg-final { scan-tree-dump-not "mixed mask and nonmask" "slp2" } } */ -/* { dg-final { scan-tree-dump-not "vector operands from scalars" "slp2" { target { { vect_int && vect_bool_cmp } && { vect_unpack && vect_hw_misalign } } xfail { vect_variable_length && { ! vect256 } } } } } */ +/* { dg-final { scan-tree-dump-not "vector operands from scalars" "slp2" { target { { vect_int && vect_bool_cmp } && { vect_unpack && vect_hw_misalign } } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c index fb719915db7..3f0d45ce4a1 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c @@ -42,7 +42,7 @@ main (int argc, char **argv) /* Because we disable the cost model, targets with variable-length vectors can end up vectorizing the store to a[0..7] on its own. With the cost model we do something sensible. */ -/* { dg-final { scan-tree-dump-times "optimized: basic block" 2 "slp2" { target { ! amdgcn-*-* } xfail vect_variable_length } } } */ +/* { dg-final { scan-tree-dump-times "optimized: basic block" 2 "slp2" { target { ! amdgcn-*-* } } } } */ /* amdgcn can do this in one vector. */ /* { dg-final { scan-tree-dump-times "optimized: basic block" 1 "slp2" { target amdgcn-*-* } } } */ -- 2.36.3
[Committed] RISC-V: Refine some codes of expand_const_vector [NFC]
gcc/ChangeLog: * config/riscv/riscv-v.cc (expand_const_vector): Use builder.inner_mode (). --- gcc/config/riscv/riscv-v.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index d1eb7a0a9a5..486f5deb296 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -1380,15 +1380,15 @@ expand_const_vector (rtx target, rtx src) rtx base1 = builder.elt (1); rtx base2 = builder.elt (2); - scalar_mode elem_mode = GET_MODE_INNER (mode); - rtx step = simplify_binary_operation (MINUS, elem_mode, base2, base1); + rtx step = simplify_binary_operation (MINUS, builder.inner_mode (), + base2, base1); /* Step 1 - { base1, base1 + step, base1 + step * 2, ... } */ rtx tmp = gen_reg_rtx (mode); expand_vec_series (tmp, base1, step); /* Step 2 - { base0, base1, base1 + step, base1 + step * 2, ... } */ if (!rtx_equal_p (base0, const0_rtx)) - base0 = force_reg (elem_mode, base0); + base0 = force_reg (builder.inner_mode (), base0); insn_code icode = optab_handler (vec_shl_insert_optab, mode); gcc_assert (icode != CODE_FOR_nothing); -- 2.36.3
[Committed] RISC-V: Fix FAIL of dynamic-lmul2-7.c
Fix this FAIL: FAIL: gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c scan-tree-dump-times vect "Maximum lmul = 2" 1 gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c: Adapt test. --- gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c index 0e6d5fe5f62..636332dbb62 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c @@ -22,5 +22,5 @@ bar (int *x, int a, int b, int n) /* { dg-final { scan-assembler-times {ret} 2 } } */ /* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 1 "vect" } } */ /* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 1 "vect" } } */ -/* { dg-final { scan-tree-dump-times "Maximum lmul = 2" 1 "vect" } } */ +/* { dg-final { scan-tree-dump "Maximum lmul = 2" "vect" } } */ /* { dg-final { scan-tree-dump-not "Maximum lmul = 1" "vect" } } */ -- 2.36.3
[Committed] RISC-V: Remove 256/512/1024 VLS vectors
Since https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=2e7abd09621a4401d44f4513adf126bce4b4828b we only allow VLSmodes with size <= TARGET_MIN_VLEN * TARGET_MAX_LMUL. So when -march=rv64gcv default LMUL = 1, we don't have VLS modes of 256/512/1024 vectors. Disable them in vect test which fixes the following FAILs in the regression: FAIL: gcc.dg/vect/bb-slp-pr65935.c -flto -ffat-lto-objects scan-tree-dump-times slp1 "optimized: basic block" 11 FAIL: gcc.dg/vect/bb-slp-pr65935.c scan-tree-dump-times slp1 "optimized: basic block" 11 FAIL: gcc.dg/vect/bb-slp-subgroups-2.c -flto -ffat-lto-objects scan-tree-dump-times slp2 "optimized: basic block" 1 FAIL: gcc.dg/vect/bb-slp-subgroups-2.c scan-tree-dump-times slp2 "optimized: basic block" 1 gcc/testsuite/ChangeLog: * lib/target-supports.exp: Remove 256/512/1024 vectors. --- gcc/testsuite/lib/target-supports.exp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 370df10978d..7f13ff0ca56 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -9033,7 +9033,7 @@ proc available_vector_sizes { } { lappend result 4096 2048 1024 512 256 128 64 32 16 8 4 2 } elseif { [istarget riscv*-*-*] } { if { [check_effective_target_riscv_v] } { - lappend result 0 32 64 128 256 512 1024 + lappend result 0 32 64 128 } lappend result 128 } else { -- 2.36.3