[SUBREG V4 3/4] IRA: Apply DF_LIVE_SUBREG data
--- gcc/ira-build.cc | 7 --- gcc/ira-color.cc | 8 gcc/ira-emit.cc | 12 ++-- gcc/ira-lives.cc | 7 --- gcc/ira.cc | 19 --- 5 files changed, 30 insertions(+), 23 deletions(-) diff --git a/gcc/ira-build.cc b/gcc/ira-build.cc index ea593d5a087..283ff36d3dd 100644 --- a/gcc/ira-build.cc +++ b/gcc/ira-build.cc @@ -1921,7 +1921,8 @@ create_bb_allocnos (ira_loop_tree_node_t bb_node) create_insn_allocnos (PATTERN (insn), NULL, false); /* It might be a allocno living through from one subloop to another. */ - EXECUTE_IF_SET_IN_REG_SET (df_get_live_in (bb), FIRST_PSEUDO_REGISTER, i, bi) + EXECUTE_IF_SET_IN_REG_SET (df_get_subreg_live_in (bb), FIRST_PSEUDO_REGISTER, +i, bi) if (ira_curr_regno_allocno_map[i] == NULL) ira_create_allocno (i, false, ira_curr_loop_tree_node); } @@ -1937,9 +1938,9 @@ create_loop_allocnos (edge e) bitmap_iterator bi; ira_loop_tree_node_t parent; - live_in_regs = df_get_live_in (e->dest); + live_in_regs = df_get_subreg_live_in (e->dest); border_allocnos = ira_curr_loop_tree_node->border_allocnos; - EXECUTE_IF_SET_IN_REG_SET (df_get_live_out (e->src), + EXECUTE_IF_SET_IN_REG_SET (df_get_subreg_live_out (e->src), FIRST_PSEUDO_REGISTER, i, bi) if (bitmap_bit_p (live_in_regs, i)) { diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc index b9ae32d1b4d..bfebc48ef83 100644 --- a/gcc/ira-color.cc +++ b/gcc/ira-color.cc @@ -2786,8 +2786,8 @@ ira_loop_edge_freq (ira_loop_tree_node_t loop_node, int regno, bool exit_p) FOR_EACH_EDGE (e, ei, loop_node->loop->header->preds) if (e->src != loop_node->loop->latch && (regno < 0 - || (bitmap_bit_p (df_get_live_out (e->src), regno) - && bitmap_bit_p (df_get_live_in (e->dest), regno + || (bitmap_bit_p (df_get_subreg_live_out (e->src), regno) + && bitmap_bit_p (df_get_subreg_live_in (e->dest), regno freq += EDGE_FREQUENCY (e); } else @@ -2795,8 +2795,8 @@ ira_loop_edge_freq (ira_loop_tree_node_t loop_node, int regno, bool exit_p) auto_vec edges = get_loop_exit_edges (loop_node->loop); FOR_EACH_VEC_ELT (edges, i, e) if (regno < 
0 - || (bitmap_bit_p (df_get_live_out (e->src), regno) - && bitmap_bit_p (df_get_live_in (e->dest), regno))) + || (bitmap_bit_p (df_get_subreg_live_out (e->src), regno) + && bitmap_bit_p (df_get_subreg_live_in (e->dest), regno))) freq += EDGE_FREQUENCY (e); } diff --git a/gcc/ira-emit.cc b/gcc/ira-emit.cc index d347f11fa02..8075b082e36 100644 --- a/gcc/ira-emit.cc +++ b/gcc/ira-emit.cc @@ -510,8 +510,8 @@ generate_edge_moves (edge e) return; src_map = src_loop_node->regno_allocno_map; dest_map = dest_loop_node->regno_allocno_map; - regs_live_in_dest = df_get_live_in (e->dest); - regs_live_out_src = df_get_live_out (e->src); + regs_live_in_dest = df_get_subreg_live_in (e->dest); + regs_live_out_src = df_get_subreg_live_out (e->src); EXECUTE_IF_SET_IN_REG_SET (regs_live_in_dest, FIRST_PSEUDO_REGISTER, regno, bi) if (bitmap_bit_p (regs_live_out_src, regno)) @@ -1229,16 +1229,16 @@ add_ranges_and_copies (void) destination block) to use for searching allocnos by their regnos because of subsequent IR flattening. 
*/ node = IRA_BB_NODE (bb)->parent; - bitmap_copy (live_through, df_get_live_in (bb)); + bitmap_copy (live_through, df_get_subreg_live_in (bb)); add_range_and_copies_from_move_list (at_bb_start[bb->index], node, live_through, REG_FREQ_FROM_BB (bb)); - bitmap_copy (live_through, df_get_live_out (bb)); + bitmap_copy (live_through, df_get_subreg_live_out (bb)); add_range_and_copies_from_move_list (at_bb_end[bb->index], node, live_through, REG_FREQ_FROM_BB (bb)); FOR_EACH_EDGE (e, ei, bb->succs) { - bitmap_and (live_through, - df_get_live_in (e->dest), df_get_live_out (bb)); + bitmap_and (live_through, df_get_subreg_live_in (e->dest), + df_get_subreg_live_out (bb)); add_range_and_copies_from_move_list ((move_t) e->aux, node, live_through, REG_FREQ_FROM_EDGE_FREQ (EDGE_FREQUENCY (e))); diff --git a/gcc/ira-lives.cc b/gcc/ira-lives.cc index e07d3dc3e89..7641184069d 100644 --- a/gcc/ira-lives.cc +++ b/gcc/ira-lives.cc @@ -1254,7 +1254,8 @@ process_out_of_region_eh_regs (basic_block bb) if (! eh_p) return; - EXECUTE_IF_SET_IN_BITMAP (df_get_live_out (bb), FIRST_PSEUDO_REGISTER, i, bi) + EXECUTE_IF_SET_IN_BITMAP (df_get_subreg_live_out (bb), FIRST_PSEUDO_REGISTER, + i, bi) { ira_allocno_t a = ira_curr_regno_allocno_map[i]; for (int n = ALLOCNO_NUM_OBJECTS (a) - 1; n >= 0; n--) @@ -1288,7 +1289,7
[SUBREG V4 2/4] DF: Add DF_LIVE_SUBREG problem
--- gcc/Makefile.in | 1 + gcc/df-problems.cc | 886 ++- gcc/df.h | 159 +++ gcc/regs.h | 5 + gcc/sbitmap.cc | 98 + gcc/sbitmap.h| 2 + gcc/subreg-live-range.cc | 233 ++ gcc/subreg-live-range.h | 60 +++ gcc/timevar.def | 1 + 9 files changed, 1444 insertions(+), 1 deletion(-) create mode 100644 gcc/subreg-live-range.cc create mode 100644 gcc/subreg-live-range.h diff --git a/gcc/Makefile.in b/gcc/Makefile.in index a7f15694c34..67d2e3ca1bc 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1684,6 +1684,7 @@ OBJS = \ store-motion.o \ streamer-hooks.o \ stringpool.o \ + subreg-live-range.o \ substring-locations.o \ target-globals.o \ targhooks.o \ diff --git a/gcc/df-problems.cc b/gcc/df-problems.cc index 88ee0dd67fc..01f1f850925 100644 --- a/gcc/df-problems.cc +++ b/gcc/df-problems.cc @@ -28,6 +28,7 @@ along with GCC; see the file COPYING3. If not see #include "target.h" #include "rtl.h" #include "df.h" +#include "subreg-live-range.h" #include "memmodel.h" #include "tm_p.h" #include "insn-config.h" @@ -1344,8 +1345,891 @@ df_lr_verify_transfer_functions (void) bitmap_clear (&all_blocks); } +/* + REGISTER AND SUBREGS LIVES + Like DF_LR, but include tracking subreg liveness. Currently used to provide + subreg liveness related information to the register allocator. The subreg + information is currently tracked for registers that satisfy the following + conditions: + 1. REG is a pseudo register + 2. MODE_SIZE > UNIT_SIZE + 3. MODE_SIZE is a multiple of UNIT_SIZE + 4. REG is used via subreg pattern + Assuming: MODE = the machine mode of the REG +MODE_SIZE = GET_MODE_SIZE (MODE) +UNIT_SIZE = REGMODE_NATURAL_SIZE (MODE) + Condition 3 is currently strict, maybe it can be removed in the future, but + for now it is sufficient. +*/ + +/* These two empty data are used as default data in case the user does not turn + * on the track-subreg-liveness feature. */ +bitmap_head df_subreg_empty_bitmap; +subregs_live df_subreg_empty_live; + +/* Private data for live_subreg problem. 
*/ +struct df_live_subreg_problem_data +{ + /* Record registers that need to track subreg liveness. */ + bitmap_head tracked_regs; + /* An obstack for the bitmaps we need for this problem. */ + bitmap_obstack live_subreg_bitmaps; +}; + +/* Helper functions. */ + +static df_live_subreg_bb_info * +df_live_subreg_get_bb_info (unsigned int index) +{ + if (index < df_live_subreg->block_info_size) +return &static_cast ( + df_live_subreg->block_info)[index]; + else +return nullptr; +} + +static df_live_subreg_local_bb_info * +get_live_subreg_local_bb_info (unsigned int bb_index) +{ + return df_live_subreg_get_bb_info (bb_index); +} + +/* Return true if regno is a multireg. */ +bool +multireg_p (int regno) +{ + if (regno < FIRST_PSEUDO_REGISTER) +return false; + rtx regno_rtx = regno_reg_rtx[regno]; + machine_mode reg_mode = GET_MODE (regno_rtx); + poly_int64 total_size = GET_MODE_SIZE (reg_mode); + poly_int64 natural_size = REGMODE_NATURAL_SIZE (reg_mode); + return maybe_gt (total_size, natural_size) +&& multiple_p (total_size, natural_size); +} + +/* Return true if the REGNO need be track with subreg liveness. */ + +static bool +need_track_subreg_p (unsigned regno) +{ + auto problem_data += (struct df_live_subreg_problem_data *) df_live_subreg->problem_data; + return bitmap_bit_p (&problem_data->tracked_regs, regno); +} + +/* Fill RANGE with the subreg range for OP in REGMODE_NATURAL_SIZE granularity. + */ +void +init_range (rtx op, sbitmap range) +{ + rtx reg = SUBREG_P (op) ? 
SUBREG_REG (op) : op; + machine_mode reg_mode = GET_MODE (reg); + + if (!read_modify_subreg_p (op)) +{ + bitmap_set_range (range, 0, get_nblocks (reg_mode)); + return; +} + + rtx subreg = op; + machine_mode subreg_mode = GET_MODE (subreg); + poly_int64 offset = SUBREG_BYTE (subreg); + int nblocks = get_nblocks (reg_mode); + poly_int64 unit_size = REGMODE_NATURAL_SIZE (reg_mode); + poly_int64 subreg_size = GET_MODE_SIZE (subreg_mode); + poly_int64 left = offset + subreg_size; + + int subreg_start = -1; + int subreg_nblocks = -1; + for (int i = 0; i < nblocks; i += 1) +{ + poly_int64 right = unit_size * (i + 1); + if (subreg_start < 0 && maybe_lt (offset, right)) + subreg_start = i; + if (subreg_nblocks < 0 && maybe_le (left, right)) + { + subreg_nblocks = i + 1 - subreg_start; + break; + } +} + gcc_assert (subreg_start >= 0 && subreg_nblocks > 0); + + bitmap_set_range (range, subreg_start, subreg_nblocks); +} + +/* Remove R
[SUBREG V4 1/4] DF: Add -ftrack-subreg-liveness option
--- gcc/common.opt | 4 gcc/common.opt.urls | 3 +++ gcc/doc/invoke.texi | 8 gcc/opts.cc | 1 + 4 files changed, 16 insertions(+) diff --git a/gcc/common.opt b/gcc/common.opt index 40cab3cb36a..5710e817abe 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -2163,6 +2163,10 @@ fira-share-spill-slots Common Var(flag_ira_share_spill_slots) Init(1) Optimization Share stack slots for spilled pseudo-registers. +ftrack-subreg-liveness +Common Var(flag_track_subreg_liveness) Init(0) Optimization +Track subreg liveness information. + fira-verbose= Common RejectNegative Joined UInteger Var(flag_ira_verbose) Init(5) -fira-verbose= Control IRA's level of diagnostic messages. diff --git a/gcc/common.opt.urls b/gcc/common.opt.urls index f71ed80a34b..59f27a6f7c6 100644 --- a/gcc/common.opt.urls +++ b/gcc/common.opt.urls @@ -880,6 +880,9 @@ UrlSuffix(gcc/Optimize-Options.html#index-fira-share-save-slots) fira-share-spill-slots UrlSuffix(gcc/Optimize-Options.html#index-fira-share-spill-slots) +ftrack-subreg-liveness +UrlSuffix(gcc/Optimize-Options.html#index-ftrack-subreg-liveness) + fira-verbose= UrlSuffix(gcc/Developer-Options.html#index-fira-verbose) diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index ddcd5213f06..fbcde8aa745 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -13188,6 +13188,14 @@ Disable sharing of stack slots allocated for pseudo-registers. Each pseudo-register that does not get a hard register gets a separate stack slot, and as a result function stack frames are larger. +@opindex ftrack-subreg-liveness +@item -ftrack-subreg-liveness +Enable tracking subreg liveness information. This infomation allows IRA +and LRA to support subreg coalesce feature which can improve the quality +of register allocation. + +This option is enabled at level @option{-O3} for all targets. + @opindex flra-remat @item -flra-remat Enable CFG-sensitive rematerialization in LRA. 
Instead of loading diff --git a/gcc/opts.cc b/gcc/opts.cc index 14d1767e48f..8fe3a213807 100644 --- a/gcc/opts.cc +++ b/gcc/opts.cc @@ -698,6 +698,7 @@ static const struct default_options default_options_table[] = { OPT_LEVELS_3_PLUS, OPT_funswitch_loops, NULL, 1 }, { OPT_LEVELS_3_PLUS, OPT_fvect_cost_model_, NULL, VECT_COST_MODEL_DYNAMIC }, { OPT_LEVELS_3_PLUS, OPT_fversion_loops_for_strides, NULL, 1 }, +{ OPT_LEVELS_3_PLUS, OPT_ftrack_subreg_liveness, NULL, 1 }, /* -O3 parameters. */ { OPT_LEVELS_3_PLUS, OPT__param_max_inline_insns_auto_, NULL, 30 }, -- 2.36.3
[SUBREG V4 4/4] LRA: Apply DF_LIVE_SUBREG data
--- gcc/lra-coalesce.cc| 27 +++- gcc/lra-constraints.cc | 109 ++--- gcc/lra-int.h | 4 + gcc/lra-lives.cc | 357 - gcc/lra-remat.cc | 8 +- gcc/lra-spills.cc | 27 +++- gcc/lra.cc | 10 +- 7 files changed, 430 insertions(+), 112 deletions(-) diff --git a/gcc/lra-coalesce.cc b/gcc/lra-coalesce.cc index a9b5b51cb3f..9416775a009 100644 --- a/gcc/lra-coalesce.cc +++ b/gcc/lra-coalesce.cc @@ -186,19 +186,28 @@ static bitmap_head used_pseudos_bitmap; /* Set up USED_PSEUDOS_BITMAP, and update LR_BITMAP (a BB live info bitmap). */ static void -update_live_info (bitmap lr_bitmap) +update_live_info (bitmap all, bitmap full, bitmap partial) { unsigned int j; bitmap_iterator bi; bitmap_clear (&used_pseudos_bitmap); - EXECUTE_IF_AND_IN_BITMAP (&coalesced_pseudos_bitmap, lr_bitmap, + EXECUTE_IF_AND_IN_BITMAP (&coalesced_pseudos_bitmap, all, FIRST_PSEUDO_REGISTER, j, bi) bitmap_set_bit (&used_pseudos_bitmap, first_coalesced_pseudo[j]); - if (! bitmap_empty_p (&used_pseudos_bitmap)) + if (!bitmap_empty_p (&used_pseudos_bitmap)) { - bitmap_and_compl_into (lr_bitmap, &coalesced_pseudos_bitmap); - bitmap_ior_into (lr_bitmap, &used_pseudos_bitmap); + bitmap_and_compl_into (all, &coalesced_pseudos_bitmap); + bitmap_ior_into (all, &used_pseudos_bitmap); + + if (flag_track_subreg_liveness) + { + bitmap_and_compl_into (full, &coalesced_pseudos_bitmap); + bitmap_ior_and_compl_into (full, &used_pseudos_bitmap, partial); + + bitmap_and_compl_into (partial, &coalesced_pseudos_bitmap); + bitmap_ior_and_compl_into (partial, &used_pseudos_bitmap, full); + } } } @@ -301,8 +310,12 @@ lra_coalesce (void) bitmap_initialize (&used_pseudos_bitmap, ®_obstack); FOR_EACH_BB_FN (bb, cfun) { - update_live_info (df_get_live_in (bb)); - update_live_info (df_get_live_out (bb)); + update_live_info (df_get_subreg_live_in (bb), + df_get_subreg_live_full_in (bb), + df_get_subreg_live_partial_in (bb)); + update_live_info (df_get_subreg_live_out (bb), + df_get_subreg_live_full_out (bb), + df_get_subreg_live_partial_out 
(bb)); FOR_BB_INSNS_SAFE (bb, insn, next) if (INSN_P (insn) && bitmap_bit_p (&involved_insns_bitmap, INSN_UID (insn))) diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc index e945a4da451..c9246e6be58 100644 --- a/gcc/lra-constraints.cc +++ b/gcc/lra-constraints.cc @@ -6565,34 +6565,86 @@ update_ebb_live_info (rtx_insn *head, rtx_insn *tail) { if (prev_bb != NULL) { - /* Update df_get_live_in (prev_bb): */ + /* Update subreg live (prev_bb): */ + bitmap subreg_all_in = df_get_subreg_live_in (prev_bb); + bitmap subreg_full_in = df_get_subreg_live_full_in (prev_bb); + bitmap subreg_partial_in = df_get_subreg_live_partial_in (prev_bb); + subregs_live *range_in = df_get_subreg_live_range_in (prev_bb); EXECUTE_IF_SET_IN_BITMAP (&check_only_regs, 0, j, bi) if (bitmap_bit_p (&live_regs, j)) - bitmap_set_bit (df_get_live_in (prev_bb), j); - else - bitmap_clear_bit (df_get_live_in (prev_bb), j); + { + bitmap_set_bit (subreg_all_in, j); + if (flag_track_subreg_liveness) + { + bitmap_set_bit (subreg_full_in, j); + if (bitmap_bit_p (subreg_partial_in, j)) + { + bitmap_clear_bit (subreg_partial_in, j); + range_in->remove_range (j); + } + } + } + else if (bitmap_bit_p (subreg_all_in, j)) + { + bitmap_clear_bit (subreg_all_in, j); + if (flag_track_subreg_liveness) + { + bitmap_clear_bit (subreg_full_in, j); + if (bitmap_bit_p (subreg_partial_in, j)) + { + bitmap_clear_bit (subreg_partial_in, j); + range_in->remove_range (j); + } + } + } } + bitmap subreg_all_out = df_get_subreg_live_out (curr_bb); if (curr_bb != last_bb) { - /* Update df_get_live_out (curr_bb): */ + /* Update subreg live (curr_bb): */ + bitmap subreg_full_out = df_get_subreg_live_full_out (curr_bb); + bitmap subreg_partial_out = df_get_subreg_live_partial_out (curr_bb); + subregs_live *range_out = df_get_subreg_liv
[SUBREG V4 0/4] Add DF_LIVE_SUBREG data and apply to IRA and LRA
V3: Address comments from Dimitar Dimitrov V4: Move detailed functions from subreg-live-range.h to subreg-live-range.cc. These patches add a new data flow problem, DF_LIVE_SUBREG, which tracks subreg liveness and then applies it to the IRA and LRA passes (enabled via -O3 or -ftrack-subreg-liveness). These patches are for GCC 15. This code is also pushed to the devel/subreg-coalesce branch. In addition, my colleague Shuo Chen will also be involved in some of the remaining work; thank you for your support. These patches are separated from the subreg-coalesce patches submitted a few months ago. I refactored the code according to comments. The next patches will support subreg coalescing based on them. Here are some data about build time of SPEC INT 2017 (x86-64 target): baseline baseline(+track-subreg-liveness) specint2017 build time : 1892s 1883s Regarding build times, I've run it a few times, but they all seem to take slightly less time. Since the difference is small, it's possible that it's just a change in environment. But it's theoretically possible, since supporting subreg liveness could have reduced the number of live regs. For memory usage, I tried PR 69609 under valgrind; peak memory size grew from 2003910656 to 2003947520, a very small increase. Note that these patches don't enable register coalescing with subreg liveness in IRA/LRA, so no performance change is expected. We will enable register coalescing with subreg liveness tracking in the follow-up patches. Bootstrapped and regtested on x86-64 with no regression. 
Co-authored-by: Lehua Ding Juzhe-Zhong (4): DF: Add -ftrack-subreg-liveness option DF: Add DF_LIVE_SUBREG problem IRA: Apply DF_LIVE_SUBREG data LRA: Apply DF_LIVE_SUBREG data gcc/Makefile.in | 1 + gcc/common.opt | 4 + gcc/common.opt.urls | 3 + gcc/df-problems.cc | 886 ++- gcc/df.h | 159 +++ gcc/doc/invoke.texi | 8 + gcc/ira-build.cc | 7 +- gcc/ira-color.cc | 8 +- gcc/ira-emit.cc | 12 +- gcc/ira-lives.cc | 7 +- gcc/ira.cc | 19 +- gcc/lra-coalesce.cc | 27 +- gcc/lra-constraints.cc | 109 - gcc/lra-int.h| 4 + gcc/lra-lives.cc | 357 gcc/lra-remat.cc | 8 +- gcc/lra-spills.cc| 27 +- gcc/lra.cc | 10 +- gcc/opts.cc | 1 + gcc/regs.h | 5 + gcc/sbitmap.cc | 98 + gcc/sbitmap.h| 2 + gcc/subreg-live-range.cc | 233 ++ gcc/subreg-live-range.h | 60 +++ gcc/timevar.def | 1 + 25 files changed, 1920 insertions(+), 136 deletions(-) create mode 100644 gcc/subreg-live-range.cc create mode 100644 gcc/subreg-live-range.h -- 2.36.3
[SUBREG V3 2/4] DF: Add DF_LIVE_SUBREG problem
This patch add a new DF problem, named DF_LIVE_SUBREG. This problem is extended from the DF_LR problem and support track the subreg liveness of multireg pseudo if these pseudo satisfy the following conditions: 1. the mode size greater than it's REGMODE_NATURAL_SIZE. 2. the reg is used in insns via subreg pattern. The main methods are as follows: 1. split bitmap in/out/def/use fileds to full_in/out/def/use and partial_in/out/def/use. If a pseudo need to be tracked it's subreg liveness, then it is recorded in partial_in/out/def/use fileds. Meantimes, there are range_in/out/def/use fileds which records the live range of the tracked pseudo. 2. in the df_live_subreg_finalize function, we move the tracked pseudo from the partial_in/out/def/use to full_in/out/def/use if the pseudo's live range is full. Co-authored-by: Lehua Ding gcc/ChangeLog: * Makefile.in: Add subreg-live-range object file. * df-problems.cc (struct df_live_subreg_problem_data): New struct. (df_live_subreg_get_bb_info): New function. (get_live_subreg_local_bb_info): Ditto. (multireg_p): Ditto. (need_track_subreg_p): Ditto. (init_range): Ditto. (remove_subreg_range): Ditto. (add_subreg_range_to_def): Ditto. (add_subreg_range_to_use): Ditto. (df_live_subreg_free_bb_info): Ditto. (df_live_subreg_alloc): Ditto. (df_live_subreg_reset): Ditto. (df_live_subreg_bb_local_compute): Ditto. (df_live_subreg_local_compute): Ditto. (df_live_subreg_init): Ditto. (df_live_subreg_check_result): Ditto. (df_live_subreg_confluence_0): Ditto. (df_live_subreg_confluence_n): Ditto. (df_live_subreg_transfer_function): Ditto. (df_live_subreg_finalize): Ditto. (df_live_subreg_free): Ditto. (df_live_subreg_top_dump): Ditto. (df_live_subreg_bottom_dump): Ditto. (df_live_subreg_add_problem): Ditto. * df.h (enum df_problem_id): New enum. (class subregs_live): New class. (class df_live_subreg_local_bb_info): Ditto. (class df_live_subreg_bb_info): Ditto. (df_live_subreg): New function. (df_live_subreg_add_problem): Ditto. 
(df_live_subreg_finalize): Ditto. (df_live_subreg_check_result): Ditto. (multireg_p): Ditto. (init_range): Ditto. (add_subreg_range_to_def): Ditto. (add_subreg_range_to_use): Ditto. (remove_subreg_range): Ditto. (df_get_subreg_live_in): Ditto. (df_get_subreg_live_out): Ditto. (df_get_subreg_live_full_in): Ditto. (df_get_subreg_live_full_out): Ditto. (df_get_subreg_live_partial_in): Ditto. (df_get_subreg_live_partial_out): Ditto. (df_get_subreg_live_range_in): Ditto. (df_get_subreg_live_range_out): Ditto. * regs.h (get_nblocks): New macro. * sbitmap.cc (bitmap_full_p): New function. (bitmap_same_p): Ditto. (test_full): Ditto. (test_same): Ditto. (sbitmap_cc_tests): Ditto. * sbitmap.h (bitmap_full_p): Ditto. (bitmap_same_p): Ditto. * timevar.def (TV_DF_LIVE_SUBREG): New timer stat. * subreg-live-range.cc: New file. * subreg-live-range.h: New file. --- gcc/Makefile.in | 1 + gcc/df-problems.cc | 886 ++- gcc/df.h | 159 +++ gcc/regs.h | 5 + gcc/sbitmap.cc | 98 + gcc/sbitmap.h| 2 + gcc/subreg-live-range.cc | 53 +++ gcc/subreg-live-range.h | 206 + gcc/timevar.def | 1 + 9 files changed, 1410 insertions(+), 1 deletion(-) create mode 100644 gcc/subreg-live-range.cc create mode 100644 gcc/subreg-live-range.h diff --git a/gcc/Makefile.in b/gcc/Makefile.in index ecd51146357..11722506018 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1683,6 +1683,7 @@ OBJS = \ store-motion.o \ streamer-hooks.o \ stringpool.o \ + subreg-live-range.o \ substring-locations.o \ target-globals.o \ targhooks.o \ diff --git a/gcc/df-problems.cc b/gcc/df-problems.cc index 88ee0dd67fc..01f1f850925 100644 --- a/gcc/df-problems.cc +++ b/gcc/df-problems.cc @@ -28,6 +28,7 @@ along with GCC; see the file COPYING3. 
If not see #include "target.h" #include "rtl.h" #include "df.h" +#include "subreg-live-range.h" #include "memmodel.h" #include "tm_p.h" #include "insn-config.h" @@ -1344,8 +1345,891 @@ df_lr_verify_transfer_functions (void) bitmap_clear (&all_blocks); } +/* + REGISTER AND SUBREGS LIVES + Like DF_LR, but include tracking subreg liveness. Currently used to provide + subreg liveness related information to the register allocator. The subreg + information is currently tracked for registers that satisf
[SUBREG V3 0/4] Add DF_LIVE_SUBREG data and apply to IRA and LRA
V3: Address comments from Dimitar Dimitrov These patches add a new data flow problem, DF_LIVE_SUBREG, which tracks subreg liveness and then applies it to the IRA and LRA passes (enabled via -O3 or -ftrack-subreg-liveness). These patches are for GCC 15. This code is also pushed to the devel/subreg-coalesce branch. In addition, my colleague Shuo Chen will also be involved in some of the remaining work; thank you for your support. These patches are separated from the subreg-coalesce patches submitted a few months ago. I refactored the code according to comments. The next patches will support subreg coalescing based on them. Here are some data about build time of SPEC INT 2017 (x86-64 target): baseline baseline(+track-subreg-liveness) specint2017 build time : 1892s 1883s Regarding build times, I've run it a few times, but they all seem to take slightly less time. Since the difference is small, it's possible that it's just a change in environment. But it's theoretically possible, since supporting subreg liveness could have reduced the number of live regs. For memory usage, I tried PR 69609 under valgrind; peak memory size grew from 2003910656 to 2003947520, a very small increase. Note that these patches don't enable register coalescing with subreg liveness in IRA/LRA, so no performance change is expected. We will enable register coalescing with subreg liveness tracking in the follow-up patches. Bootstrapped and regtested on x86-64 with no regression. 
Co-authored-by: Lehua Ding Juzhe-Zhong (4): DF: Add -ftrack-subreg-liveness option DF: Add DF_LIVE_SUBREG problem IRA: Add DF_LIVE_SUBREG problem LRA: Apply DF_LIVE_SUBREG data gcc/Makefile.in | 1 + gcc/common.opt | 4 + gcc/common.opt.urls | 3 + gcc/df-problems.cc | 886 ++- gcc/df.h | 159 +++ gcc/doc/invoke.texi | 8 + gcc/ira-build.cc | 7 +- gcc/ira-color.cc | 8 +- gcc/ira-emit.cc | 12 +- gcc/ira-lives.cc | 7 +- gcc/ira.cc | 19 +- gcc/lra-coalesce.cc | 27 +- gcc/lra-constraints.cc | 109 - gcc/lra-int.h| 4 + gcc/lra-lives.cc | 357 gcc/lra-remat.cc | 8 +- gcc/lra-spills.cc| 27 +- gcc/lra.cc | 10 +- gcc/opts.cc | 1 + gcc/regs.h | 5 + gcc/sbitmap.cc | 98 + gcc/sbitmap.h| 2 + gcc/subreg-live-range.cc | 53 +++ gcc/subreg-live-range.h | 206 + gcc/timevar.def | 1 + 25 files changed, 1886 insertions(+), 136 deletions(-) create mode 100644 gcc/subreg-live-range.cc create mode 100644 gcc/subreg-live-range.h -- 2.36.3
[SUBREG V3 4/4] LRA: Apply DF_LIVE_SUBREG data
This patch apply the DF_LIVE_SUBREG to LRA pass. More changes were made to the LRA than the IRA since the LRA will modify the DF data directly. The main big changes are centered on the lra-lives.cc file. Co-authored-by: Lehua Ding gcc/ChangeLog: * lra-coalesce.cc (update_live_info): Apply DF_LIVE_SUBREG data. (lra_coalesce): Ditto. * lra-constraints.cc (update_ebb_live_info): Ditto. (get_live_on_other_edges): Ditto. (inherit_in_ebb): Ditto. (lra_inheritance): Ditto. (fix_bb_live_info): Ditto. (remove_inheritance_pseudos): Ditto. * lra-int.h (GCC_LRA_INT_H): Ditto. (struct lra_insn_reg): Ditto. * lra-lives.cc (class bb_data_pseudos): Ditto. (need_track_subreg_p): New function. (make_hard_regno_live): Ditto (make_hard_regno_dead): Ditto. (mark_regno_live): Apply DF_LIVE_SUBREG data. (mark_regno_dead): Ditto. (live_trans_fun): Ditto. (live_con_fun_0): Ditto. (live_con_fun_n): Ditto. (initiate_live_solver): Ditto. (finish_live_solver): Ditto. (process_bb_lives): Ditto. (lra_create_live_ranges_1): Ditto. * lra-remat.cc (dump_candidates_and_remat_bb_data): Ditto. (calculate_livein_cands): Ditto. (do_remat): Ditto. * lra-spills.cc (spill_pseudos): Ditto. * lra.cc (new_insn_reg): Ditto. (add_regs_to_insn_regno_info): Ditto. --- gcc/lra-coalesce.cc| 27 +++- gcc/lra-constraints.cc | 109 ++--- gcc/lra-int.h | 4 + gcc/lra-lives.cc | 357 - gcc/lra-remat.cc | 8 +- gcc/lra-spills.cc | 27 +++- gcc/lra.cc | 10 +- 7 files changed, 430 insertions(+), 112 deletions(-) diff --git a/gcc/lra-coalesce.cc b/gcc/lra-coalesce.cc index a9b5b51cb3f..9416775a009 100644 --- a/gcc/lra-coalesce.cc +++ b/gcc/lra-coalesce.cc @@ -186,19 +186,28 @@ static bitmap_head used_pseudos_bitmap; /* Set up USED_PSEUDOS_BITMAP, and update LR_BITMAP (a BB live info bitmap). 
*/ static void -update_live_info (bitmap lr_bitmap) +update_live_info (bitmap all, bitmap full, bitmap partial) { unsigned int j; bitmap_iterator bi; bitmap_clear (&used_pseudos_bitmap); - EXECUTE_IF_AND_IN_BITMAP (&coalesced_pseudos_bitmap, lr_bitmap, + EXECUTE_IF_AND_IN_BITMAP (&coalesced_pseudos_bitmap, all, FIRST_PSEUDO_REGISTER, j, bi) bitmap_set_bit (&used_pseudos_bitmap, first_coalesced_pseudo[j]); - if (! bitmap_empty_p (&used_pseudos_bitmap)) + if (!bitmap_empty_p (&used_pseudos_bitmap)) { - bitmap_and_compl_into (lr_bitmap, &coalesced_pseudos_bitmap); - bitmap_ior_into (lr_bitmap, &used_pseudos_bitmap); + bitmap_and_compl_into (all, &coalesced_pseudos_bitmap); + bitmap_ior_into (all, &used_pseudos_bitmap); + + if (flag_track_subreg_liveness) + { + bitmap_and_compl_into (full, &coalesced_pseudos_bitmap); + bitmap_ior_and_compl_into (full, &used_pseudos_bitmap, partial); + + bitmap_and_compl_into (partial, &coalesced_pseudos_bitmap); + bitmap_ior_and_compl_into (partial, &used_pseudos_bitmap, full); + } } } @@ -301,8 +310,12 @@ lra_coalesce (void) bitmap_initialize (&used_pseudos_bitmap, ®_obstack); FOR_EACH_BB_FN (bb, cfun) { - update_live_info (df_get_live_in (bb)); - update_live_info (df_get_live_out (bb)); + update_live_info (df_get_subreg_live_in (bb), + df_get_subreg_live_full_in (bb), + df_get_subreg_live_partial_in (bb)); + update_live_info (df_get_subreg_live_out (bb), + df_get_subreg_live_full_out (bb), + df_get_subreg_live_partial_out (bb)); FOR_BB_INSNS_SAFE (bb, insn, next) if (INSN_P (insn) && bitmap_bit_p (&involved_insns_bitmap, INSN_UID (insn))) diff --git a/gcc/lra-constraints.cc b/gcc/lra-constraints.cc index 5b78fd0b7e5..effb5d8484c 100644 --- a/gcc/lra-constraints.cc +++ b/gcc/lra-constraints.cc @@ -6554,34 +6554,86 @@ update_ebb_live_info (rtx_insn *head, rtx_insn *tail) { if (prev_bb != NULL) { - /* Update df_get_live_in (prev_bb): */ + /* Update subreg live (prev_bb): */ + bitmap subreg_all_in = df_get_subreg_live_in (prev_bb); + 
bitmap subreg_full_in = df_get_subreg_live_full_in (prev_bb); + bitmap subreg_partial_in = df_get_subreg_live_partial_in (prev_bb); + subregs_live *range_in = df_get_subreg_live_range_in (prev_bb); EXECUTE_IF_SET_IN_BITMAP (&check_only_regs, 0, j, bi) if (bitmap_bit_p (&live_regs, j)) - bitmap_set_bit (df_get_live_in (prev_bb), j); - else - bitmap_clear_bit (df_get_live_in (prev_bb), j); + { + bitmap_
[SUBREG V3 3/4] IRA: Add DF_LIVE_SUBREG problem
This patch simple replace df_get_live_in to df_get_subreg_live_in and replace df_get_live_out to df_get_subreg_live_out. Co-authored-by: Lehua Ding gcc/ChangeLog: * ira-build.cc (create_bb_allocnos): Apply DF_LIVE_SUBREG data. (create_loop_allocnos): Diito. * ira-color.cc (ira_loop_edge_freq): Diito. * ira-emit.cc (generate_edge_moves): Diito. (add_ranges_and_copies): Diito. * ira-lives.cc (process_out_of_region_eh_regs): Diito. (add_conflict_from_region_landing_pads): Diito. (process_bb_node_lives): Diito. * ira.cc (find_moveable_pseudos): Diito. (interesting_dest_for_shprep_1): Diito. (allocate_initial_values): Diito. (ira): Diito. --- gcc/ira-build.cc | 7 --- gcc/ira-color.cc | 8 gcc/ira-emit.cc | 12 ++-- gcc/ira-lives.cc | 7 --- gcc/ira.cc | 19 --- 5 files changed, 30 insertions(+), 23 deletions(-) diff --git a/gcc/ira-build.cc b/gcc/ira-build.cc index ea593d5a087..283ff36d3dd 100644 --- a/gcc/ira-build.cc +++ b/gcc/ira-build.cc @@ -1921,7 +1921,8 @@ create_bb_allocnos (ira_loop_tree_node_t bb_node) create_insn_allocnos (PATTERN (insn), NULL, false); /* It might be a allocno living through from one subloop to another. 
*/ - EXECUTE_IF_SET_IN_REG_SET (df_get_live_in (bb), FIRST_PSEUDO_REGISTER, i, bi) + EXECUTE_IF_SET_IN_REG_SET (df_get_subreg_live_in (bb), FIRST_PSEUDO_REGISTER, +i, bi) if (ira_curr_regno_allocno_map[i] == NULL) ira_create_allocno (i, false, ira_curr_loop_tree_node); } @@ -1937,9 +1938,9 @@ create_loop_allocnos (edge e) bitmap_iterator bi; ira_loop_tree_node_t parent; - live_in_regs = df_get_live_in (e->dest); + live_in_regs = df_get_subreg_live_in (e->dest); border_allocnos = ira_curr_loop_tree_node->border_allocnos; - EXECUTE_IF_SET_IN_REG_SET (df_get_live_out (e->src), + EXECUTE_IF_SET_IN_REG_SET (df_get_subreg_live_out (e->src), FIRST_PSEUDO_REGISTER, i, bi) if (bitmap_bit_p (live_in_regs, i)) { diff --git a/gcc/ira-color.cc b/gcc/ira-color.cc index b9ae32d1b4d..bfebc48ef83 100644 --- a/gcc/ira-color.cc +++ b/gcc/ira-color.cc @@ -2786,8 +2786,8 @@ ira_loop_edge_freq (ira_loop_tree_node_t loop_node, int regno, bool exit_p) FOR_EACH_EDGE (e, ei, loop_node->loop->header->preds) if (e->src != loop_node->loop->latch && (regno < 0 - || (bitmap_bit_p (df_get_live_out (e->src), regno) - && bitmap_bit_p (df_get_live_in (e->dest), regno + || (bitmap_bit_p (df_get_subreg_live_out (e->src), regno) + && bitmap_bit_p (df_get_subreg_live_in (e->dest), regno freq += EDGE_FREQUENCY (e); } else @@ -2795,8 +2795,8 @@ ira_loop_edge_freq (ira_loop_tree_node_t loop_node, int regno, bool exit_p) auto_vec edges = get_loop_exit_edges (loop_node->loop); FOR_EACH_VEC_ELT (edges, i, e) if (regno < 0 - || (bitmap_bit_p (df_get_live_out (e->src), regno) - && bitmap_bit_p (df_get_live_in (e->dest), regno))) + || (bitmap_bit_p (df_get_subreg_live_out (e->src), regno) + && bitmap_bit_p (df_get_subreg_live_in (e->dest), regno))) freq += EDGE_FREQUENCY (e); } diff --git a/gcc/ira-emit.cc b/gcc/ira-emit.cc index d347f11fa02..8075b082e36 100644 --- a/gcc/ira-emit.cc +++ b/gcc/ira-emit.cc @@ -510,8 +510,8 @@ generate_edge_moves (edge e) return; src_map = src_loop_node->regno_allocno_map; dest_map 
= dest_loop_node->regno_allocno_map; - regs_live_in_dest = df_get_live_in (e->dest); - regs_live_out_src = df_get_live_out (e->src); + regs_live_in_dest = df_get_subreg_live_in (e->dest); + regs_live_out_src = df_get_subreg_live_out (e->src); EXECUTE_IF_SET_IN_REG_SET (regs_live_in_dest, FIRST_PSEUDO_REGISTER, regno, bi) if (bitmap_bit_p (regs_live_out_src, regno)) @@ -1229,16 +1229,16 @@ add_ranges_and_copies (void) destination block) to use for searching allocnos by their regnos because of subsequent IR flattening. */ node = IRA_BB_NODE (bb)->parent; - bitmap_copy (live_through, df_get_live_in (bb)); + bitmap_copy (live_through, df_get_subreg_live_in (bb)); add_range_and_copies_from_move_list (at_bb_start[bb->index], node, live_through, REG_FREQ_FROM_BB (bb)); - bitmap_copy (live_through, df_get_live_out (bb)); + bitmap_copy (live_through, df_get_subreg_live_out (bb)); add_range_and_copies_from_move_list (at_bb_end[bb->index], node, live_through, REG_FREQ_FROM_BB (bb)); FOR_EACH_EDGE (e, ei, bb->succs) { - bitmap_and (live_through, - df_get_live_in (e->dest), df_get_live_out (bb)); + bitmap_and (live_through, df_get_subreg_live_in (e->dest), + df_get_subreg_l
[SUBREG V3 1/4] DF: Add -ftrack-subreg-liveness option
Add new flag -ftrack-subreg-liveness to enable track-subreg-liveness. This flag is enabled at -O3/fast. Co-authored-by: Lehua Ding gcc/ChangeLog: * common.opt: Add -ftrack-subreg-liveness option. * common.opt.urls: Ditto. * doc/invoke.texi: Ditto. * opts.cc: Ditto. --- gcc/common.opt | 4 gcc/common.opt.urls | 3 +++ gcc/doc/invoke.texi | 8 gcc/opts.cc | 1 + 4 files changed, 16 insertions(+) diff --git a/gcc/common.opt b/gcc/common.opt index 40cab3cb36a..5710e817abe 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -2163,6 +2163,10 @@ fira-share-spill-slots Common Var(flag_ira_share_spill_slots) Init(1) Optimization Share stack slots for spilled pseudo-registers. +ftrack-subreg-liveness +Common Var(flag_track_subreg_liveness) Init(0) Optimization +Track subreg liveness information. + fira-verbose= Common RejectNegative Joined UInteger Var(flag_ira_verbose) Init(5) -fira-verbose= Control IRA's level of diagnostic messages. diff --git a/gcc/common.opt.urls b/gcc/common.opt.urls index f71ed80a34b..59f27a6f7c6 100644 --- a/gcc/common.opt.urls +++ b/gcc/common.opt.urls @@ -880,6 +880,9 @@ UrlSuffix(gcc/Optimize-Options.html#index-fira-share-save-slots) fira-share-spill-slots UrlSuffix(gcc/Optimize-Options.html#index-fira-share-spill-slots) +ftrack-subreg-liveness +UrlSuffix(gcc/Optimize-Options.html#index-ftrack-subreg-liveness) + fira-verbose= UrlSuffix(gcc/Developer-Options.html#index-fira-verbose) diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index ddcd5213f06..fbcde8aa745 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -13188,6 +13188,14 @@ Disable sharing of stack slots allocated for pseudo-registers. Each pseudo-register that does not get a hard register gets a separate stack slot, and as a result function stack frames are larger. +@opindex ftrack-subreg-liveness +@item -ftrack-subreg-liveness +Enable tracking subreg liveness information. 
This information allows IRA +and LRA to support subreg coalesce feature which can improve the quality +of register allocation.
[PATCH] RISC-V: Fix infinite compilation of VSETVL PASS
This patch fixes issue reported by Jeff. Testing is running. Ok for trunk if I passed the testing with no regression ? gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pre_vsetvl::emit_vsetvl): Fix inifinite compilation. (pre_vsetvl::remove_vsetvl_pre_insns): Ditto. --- gcc/config/riscv/riscv-vsetvl.cc | 9 - 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 2c0dcdf18c5..32f262de199 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -2281,9 +2281,8 @@ private: } } - void remove_vsetvl_insn (const vsetvl_info &info) + void remove_vsetvl_insn (rtx_insn *rinsn) { -rtx_insn *rinsn = info.get_insn ()->rtl (); if (dump_file) { fprintf (dump_file, " Eliminate insn %d:\n", INSN_UID (rinsn)); @@ -3231,7 +3230,7 @@ pre_vsetvl::emit_vsetvl () if (curr_info.delete_p ()) { if (vsetvl_insn_p (insn->rtl ())) - remove_vsetvl_insn (curr_info); + remove_vsetvl_insn (curr_info.get_insn ()->rtl ()); continue; } else if (curr_info.valid_p ()) @@ -3269,7 +3268,7 @@ pre_vsetvl::emit_vsetvl () for (const vsetvl_info &item : m_delete_list) { gcc_assert (vsetvl_insn_p (item.get_insn ()->rtl ())); - remove_vsetvl_insn (item); + remove_vsetvl_insn (item.get_insn ()->rtl ()); } /* Insert vsetvl info that was not deleted after lift up. */ @@ -3434,7 +3433,7 @@ pre_vsetvl::remove_vsetvl_pre_insns () INSN_UID (rinsn)); print_rtl_single (dump_file, rinsn); } - remove_insn (rinsn); + remove_vsetvl_insn (rinsn); } } -- 2.36.3
[PATCH] RISC-V: Expand VLMAX scalar move in reduction
This patch fixes the following: vsetvli a5,a1,e32,m1,tu,ma sllia4,a5,2 sub a1,a1,a5 vle32.v v2,0(a0) add a0,a0,a4 vadd.vv v1,v2,v1 bne a1,zero,.L3 vsetivlizero,1,e32,m1,ta,ma vmv.s.x v2,zero vsetvli a5,zero,e32,m1,ta,ma ---> Redundant vsetvl. vredsum.vs v1,v1,v2 vmv.x.s a0,v1 ret VSETVL PASS is able to fuse avl = 1 of scalar move and VLMAX avl of reduction. However, this following RTL blocks the fusion in dependence analysis in VSETVL PASS: (insn 49 24 50 5 (set (reg:RVVM1SI 98 v2 [148]) (if_then_else:RVVM1SI (unspec:RVVMF32BI [ (const_vector:RVVMF32BI [ (const_int 1 [0x1]) repeat [ (const_int 0 [0]) ] ]) (const_int 1 [0x1]) (const_int 2 [0x2]) repeated x2 (const_int 0 [0]) (reg:SI 66 vl) (reg:SI 67 vtype) ] UNSPEC_VPREDICATE) (const_vector:RVVM1SI repeat [ (const_int 0 [0]) ]) (unspec:RVVM1SI [ (reg:DI 0 zero) ] UNSPEC_VUNDEF))) 3813 {*pred_broadcastrvvm1si_zero} (nil)) (insn 50 49 51 5 (set (reg:DI 15 a5 [151]) > It set a5, blocks the following VLMAX into the scalar move above. (unspec:DI [ (const_int 32 [0x20]) ] UNSPEC_VLMAX)) 2566 {vlmax_avldi} (expr_list:REG_EQUIV (unspec:DI [ (const_int 32 [0x20]) ] UNSPEC_VLMAX) (nil))) (insn 51 50 52 5 (set (reg:RVVM1SI 97 v1 [150]) (unspec:RVVM1SI [ (unspec:RVVMF32BI [ (const_vector:RVVMF32BI repeat [ (const_int 1 [0x1]) ]) (reg:DI 15 a5 [151]) (const_int 2 [0x2]) (const_int 1 [0x1]) (reg:SI 66 vl) (reg:SI 67 vtype) ] UNSPEC_VPREDICATE) (unspec:RVVM1SI [ (reg:RVVM1SI 97 v1 [orig:134 vect_result_14.6 ] [134]) (reg:RVVM1SI 98 v2 [148]) ] UNSPEC_REDUC_SUM) (unspec:RVVM1SI [ (reg:DI 0 zero) ] UNSPEC_VUNDEF) ] UNSPEC_REDUC)) 17541 {pred_redsumrvvm1si} (expr_list:REG_DEAD (reg:RVVM1SI 98 v2 [148]) (expr_list:REG_DEAD (reg:SI 66 vl) (expr_list:REG_DEAD (reg:DI 15 a5 [151]) (expr_list:REG_DEAD (reg:DI 0 zero) (nil)) Such situation can only happen on auto-vectorization, never happen on intrinsic codes. 
Since the reduction is passed VLMAX AVL, it should be more natural to pass VLMAX to the scalar move which initial the value of the reduction. After this patch: vsetvli a5,a1,e32,m1,tu,ma sllia4,a5,2 sub a1,a1,a5 vle32.v v2,0(a0) add a0,a0,a4 vadd.vv v1,v2,v1 bne a1,zero,.L3 vsetvli a5,zero,e32,m1,ta,ma vmv.s.x v2,zero vredsum.vs v1,v1,v2 vmv.x.s a0,v1 ret Tested on both RV32/RV64 no regression. PR target/113697 gcc/ChangeLog: * config/riscv/riscv-v.cc (expand_reduction): Pass VLMAX avl to scalar move. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/pr113697.c: New test. --- gcc/config/riscv/riscv-v.cc| 12 +++- .../gcc.target/riscv/rvv/autovec/pr113697.c| 14 ++ 2 files changed, 21 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113697.c diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index 4bacb7fea45..0cfbd21ce6f 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -4151,13 +4151,15 @@ expand_reduction (unsigned unspec, unsigned insn_flags, rtx *ops, rtx init) rtx m1_tmp = gen_reg_rtx (m1_mode); rtx scalar_move_ops[] = {m1_tmp, init}; - emit_nonvlmax_insn (code_for_pred_broadcast (m1_mode), SCALAR_MOVE_OP, - scalar_move_ops, - need_mask_operand_p (insn_flags) ? ops[3] - : CONST1_RTX (Pmode)); + insn_code icode = code_for_pred_broadcast (m1_mode); + if (need_mask_operand_p (insn_flags)) +emit_nonvlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops, ops[3]); + else +emit_vlmax_insn (icode, SCALAR_MOVE_OP, scalar_move_ops); + rtx m1_tmp2 = gen_reg_rtx (m1_mode); rtx reduc_ops[] = {m1_tmp2, vector_src, m1_tmp}; - insn_code icode = code_for_pred (unspec, vmode); +
[PATCH] RISC-V: Allow LICM hoist POLY_INT configuration code sequence
Realize in recent benchmark evaluation (coremark-pro zip-test): vid.v v2 vmv.v.i v5,0 .L9: vle16.v v3,0(a4) vrsub.vxv4,v2,a6 ---> LICM failed to hoist it outside the loop. The root cause is: (insn 56 47 57 4 (set (subreg:DI (reg:HI 220) 0) (reg:DI 223)) "rvv.c":11:9 208 {*movdi_64bit} -> Its result used by the following vrsub.vx then supress the hoist of the vrsub.vx (nil)) (insn 57 56 59 4 (set (reg:RVVMF2HI 216) (if_then_else:RVVMF2HI (unspec:RVVMF32BI [ (const_vector:RVVMF32BI repeat [ (const_int 1 [0x1]) ]) (reg:DI 350) (const_int 2 [0x2]) repeated x2 (const_int 1 [0x1]) (reg:SI 66 vl) (reg:SI 67 vtype) ] UNSPEC_VPREDICATE) (minus:RVVMF2HI (vec_duplicate:RVVMF2HI (reg:HI 220)) (reg:RVVMF2HI 217)) (unspec:RVVMF2HI [ (reg:DI 0 zero) ] UNSPEC_VUNDEF))) "rvv.c":11:9 6938 {pred_subrvvmf2hi_reverse_scalar} (expr_list:REG_DEAD (reg:HI 220) (nil))) This patch fixes it generate (set (reg:HI) (subreg:HI (reg:DI))) instead of (set (subreg:DI (reg:DI)) (reg:DI)). After this patch: vid.v v2 vrsub.vxv2,v2,a7 vmv.v.i v4,0 .L3: vle16.v v3,0(a4) Tested on both RV32 and RV64 no regression. gcc/ChangeLog: * config/riscv/riscv.cc (riscv_legitimize_move): Fix poly_int dest generation. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/poly_licm-1.c: New test. * gcc.target/riscv/rvv/autovec/poly_licm-2.c: New test. --- gcc/config/riscv/riscv.cc | 9 --- .../riscv/rvv/autovec/poly_licm-1.c | 18 + .../riscv/rvv/autovec/poly_licm-2.c | 27 +++ 3 files changed, 50 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-1.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-2.c diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 529ef5e84b7..6e22b43e618 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -2711,16 +2711,17 @@ riscv_legitimize_move (machine_mode mode, rtx dest, rtx src) (const_poly_int:HI [m, n]) (const_poly_int:SI [m, n]). 
*/ rtx tmp = gen_reg_rtx (Pmode); - riscv_legitimize_poly_move (Pmode, gen_lowpart (Pmode, dest), tmp, - src); + rtx tmp2 = gen_reg_rtx (Pmode); + riscv_legitimize_poly_move (Pmode, tmp2, tmp, src); + emit_move_insn (dest, gen_lowpart (mode, tmp2)); } else { /* In RV32 system, handle (const_poly_int:SI [m, n]) (const_poly_int:DI [m, n]). In RV64 system, handle (const_poly_int:DI [m, n]). - FIXME: Maybe we could gen SImode in RV32 and then sign-extend to DImode, - the offset should not exceed 4GiB in general. */ +FIXME: Maybe we could gen SImode in RV32 and then sign-extend to +DImode, the offset should not exceed 4GiB in general. */ rtx tmp = gen_reg_rtx (mode); riscv_legitimize_poly_move (mode, dest, tmp, src); } diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-1.c new file mode 100644 index 000..b7da65f0996 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -fno-schedule-insns -fno-schedule-insns2" } */ + +extern int wsize; + +typedef unsigned short Posf; +#define NIL 0 + +void foo (Posf *p) +{ + register unsigned n, m; + do { + m = *--p; + *p = (Posf)(m >= wsize ? 
m-wsize : NIL); + } while (--n); +} + +/* { dg-final { scan-assembler-times {vid\.v\s+v[0-9]+\s+addi\s+\s*[a-x0-9]+,\s*[a-x0-9]+,\s*-1\s+vrsub\.vx\s+} 1 } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-2.c new file mode 100644 index 000..ffb3c63149f --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-2.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -fno-schedule-insns -fno-schedule-insns2" } */ + +typedef unsigned short uint16_t; + +void AAA (uint16_t *x, uint16_t *y, unsigned wsize, unsigned count) +{ + unsigned m = 0, n = count; + register uint16_t *p; + + p = x; + + do { +m = *--p; +*p = (uint16_t)(m >= wsize ? m-wsize : 0); + } while (--n); + + n = wsize; + p = y; + + do { + m = *--p; +
[PATCH] RISC-V: Remove vsetvl_pre bogus instructions in VSETVL PASS
I realize there is a RTL regression between GCC-14 and GCC-13. https://godbolt.org/z/Ga7K6MqaT GCC-14: (insn 9 13 31 2 (set (reg:DI 15 a5 [138]) (unspec:DI [ (const_int 64 [0x40]) ] UNSPEC_VLMAX)) "/app/example.c":5:15 2566 {vlmax_avldi} (expr_list:REG_EQUIV (unspec:DI [ (const_int 64 [0x40]) ] UNSPEC_VLMAX) (nil))) (insn 31 9 10 2 (parallel [ (set (reg:DI 15 a5 [138]) (unspec:DI [ (reg:DI 0 zero) (const_int 32 [0x20]) (const_int 7 [0x7]) (const_int 1 [0x1]) repeated x2 ] UNSPEC_VSETVL)) (set (reg:SI 66 vl) (unspec:SI [ (reg:DI 0 zero) (const_int 32 [0x20]) (const_int 7 [0x7]) ] UNSPEC_VSETVL)) (set (reg:SI 67 vtype) (unspec:SI [ (const_int 32 [0x20]) (const_int 7 [0x7]) (const_int 1 [0x1]) repeated x2 ] UNSPEC_VSETVL)) ]) "/app/example.c":5:15 3281 {vsetvldi} (nil)) GCC-13: (insn 10 7 26 2 (set (reg/f:DI 11 a1 [139]) (plus:DI (reg:DI 11 a1 [142]) (const_int 800 [0x320]))) "/app/example.c":6:32 5 {adddi3} (nil)) (insn 26 10 9 2 (parallel [ (set (reg:DI 15 a5) (unspec:DI [ (reg:DI 0 zero) (const_int 32 [0x20]) (const_int 7 [0x7]) (const_int 1 [0x1]) repeated x2 ] UNSPEC_VSETVL)) (set (reg:SI 66 vl) (unspec:SI [ (reg:DI 0 zero) (const_int 32 [0x20]) (const_int 7 [0x7]) ] UNSPEC_VSETVL)) (set (reg:SI 67 vtype) (unspec:SI [ (const_int 32 [0x20]) (const_int 7 [0x7]) (const_int 1 [0x1]) repeated x2 ] UNSPEC_VSETVL)) ]) "/app/example.c":5:15 792 {vsetvldi} (nil)) GCC-13 doesn't have: (insn 9 13 31 2 (set (reg:DI 15 a5 [138]) (unspec:DI [ (const_int 64 [0x40]) ] UNSPEC_VLMAX)) "/app/example.c":5:15 2566 {vlmax_avldi} (expr_list:REG_EQUIV (unspec:DI [ (const_int 64 [0x40]) ] UNSPEC_VLMAX) (nil))) vsetvl_pre doesn't emit any assembler which is just used for occupying scalar register. It should be removed in VSETVL PASS. Tested on both RV32 and RV64 no regression. gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (vsetvl_pre_insn_p): New function. (pre_vsetvl::cleaup): Remove vsetvl_pre. (pre_vsetvl::remove_vsetvl_pre_insns): New function. 
gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/vsetvl_pre-1.c: New test. --- gcc/config/riscv/riscv-vsetvl.cc | 64 +++ .../riscv/rvv/vsetvl/vsetvl_pre-1.c | 12 2 files changed, 76 insertions(+) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/vsetvl_pre-1.c diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 28b7534d970..4732d4fc77f 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -315,6 +315,48 @@ vsetvl_insn_p (rtx_insn *rinsn) || INSN_CODE (rinsn) == CODE_FOR_vsetvlsi); } +/* Return true if it is the bogus vsetvl_pre instruction: + + (define_insn "@vlmax_avl" + [(set (match_operand:P 0 "register_operand" "=r") + (unspec:P [(match_operand:P 1 "const_int_operand" "i")] UNSPEC_VLMAX))] + "TARGET_VECTOR" + "" + [(set_attr "type" "vsetvl_pre")]) + + As described above, it's the bogus instruction which doesn't any assembler + and should be removed eventually. It's used for occupying a scalar register + for VLMAX avl RVV instruction before register allocation. + + Before RA: + + ... + vsetvl_pre (set r136) + vadd.vv (use r136 with VLMAX avl) + ... + + After RA: + + ... + vsetvl_pre (set a5) + vadd.vv (use r136 with VLMAX avl) + ... + + VSETVL PASS: + + ... + vsetvl_pre (set a5) ---> removed. + vsetvl a5,zero,... ---> Inserted. + vadd.vv + ... +*/ +static bool +vsetvl_pre_insn_p (rtx_insn *rinsn) +{ + return recog_memoized (rinsn) >= 0 +&& get_attr_type (rinsn) == TYPE_VSETVL_PRE; +} + /* Return true if it is vsetvl zero, rs1. */ static bool vsetvl_discard_result_insn_p (rtx_insn *rinsn) @@ -2376,6 +2418,7 @@ public: void cleaup (); void remove_avl_operand (); void remove_unused_dest_operand (); + void remove_vsetvl_pre_insns (); void dump (FILE *file, const char *title) const { @@ -3332,6 +3375,7 @@ pre_vsetvl::cleaup () { remove_avl_operand (
[PATCH v2] RISC-V: Suppress the vsetvl fusion for conflict successors
Update in v2: Add dump information. This patch fixes the following ineffective vsetvl insertion: #include "riscv_vector.h" void f (int32_t * restrict in, int32_t * restrict out, size_t n, size_t cond, size_t cond2) { for (size_t i = 0; i < n; i++) { if (i == cond) { vint8mf8_t v = *(vint8mf8_t*)(in + i + 100); *(vint8mf8_t*)(out + i + 100) = v; } else if (i == cond2) { vfloat32mf2_t v = *(vfloat32mf2_t*)(in + i + 200); *(vfloat32mf2_t*)(out + i + 200) = v; } else if (i == (cond2 - 1)) { vuint16mf2_t v = *(vuint16mf2_t*)(in + i + 300); *(vuint16mf2_t*)(out + i + 300) = v; } else { vint8mf4_t v = *(vint8mf4_t*)(in + i + 400); *(vint8mf4_t*)(out + i + 400) = v; } } } Before this patch: f: .LFB0: .cfi_startproc beq a2,zero,.L12 addia7,a0,400 addia6,a1,400 addia0,a0,1600 addia1,a1,1600 li a5,0 addit6,a4,-1 vsetvli t3,zero,e8,mf8,ta,ma ---> ineffective uplift .L7: beq a3,a5,.L15 beq a4,a5,.L16 beq t6,a5,.L17 vsetvli t1,zero,e8,mf4,ta,ma vle8.v v1,0(a0) vse8.v v1,0(a1) vsetvli t3,zero,e8,mf8,ta,ma .L4: addia5,a5,1 addia7,a7,4 addia6,a6,4 addia0,a0,4 addia1,a1,4 bne a2,a5,.L7 .L12: ret .L15: vle8.v v1,0(a7) vse8.v v1,0(a6) j .L4 .L17: vsetvli t1,zero,e8,mf4,ta,ma addit5,a0,-400 addit4,a1,-400 vle16.v v1,0(t5) vse16.v v1,0(t4) vsetvli t3,zero,e8,mf8,ta,ma j .L4 .L16: addit5,a0,-800 addit4,a1,-800 vle32.v v1,0(t5) vse32.v v1,0(t4) j .L4 It's obvious that we are hoisting the e8mf8 vsetvl to the top. It's ineffective since e8mf8 comes from low probability block which is if (i == cond). For this case, we disable such fusion. 
After this patch: f: beq a2,zero,.L12 addia7,a0,400 addia6,a1,400 addia0,a0,1600 addia1,a1,1600 li a5,0 addit6,a4,-1 .L7: beq a3,a5,.L15 beq a4,a5,.L16 beq t6,a5,.L17 vsetvli t1,zero,e8,mf4,ta,ma vle8.v v1,0(a0) vse8.v v1,0(a1) .L4: addia5,a5,1 addia7,a7,4 addia6,a6,4 addia0,a0,4 addia1,a1,4 bne a2,a5,.L7 .L12: ret .L15: vsetvli t3,zero,e8,mf8,ta,ma vle8.v v1,0(a7) vse8.v v1,0(a6) j .L4 .L17: addit5,a0,-400 addit4,a1,-400 vsetvli t1,zero,e8,mf4,ta,ma vle16.v v1,0(t5) vse16.v v1,0(t4) j .L4 .L16: addit5,a0,-800 addit4,a1,-800 vsetvli t3,zero,e32,mf2,ta,ma vle32.v v1,0(t5) vse32.v v1,0(t4) j .L4 Tested on both RV32/RV64 no regression. Ok for trunk ? PR target/113696 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pre_vsetvl::earliest_fuse_vsetvl_info): Suppress vsetvl fusion. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/pr113696.c: New test. --- gcc/config/riscv/riscv-vsetvl.cc | 25 ++ .../gcc.target/riscv/rvv/vsetvl/pr113696.c| 26 +++ 2 files changed, 51 insertions(+) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113696.c diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index cec862329c5..28b7534d970 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -2959,6 +2959,31 @@ pre_vsetvl::earliest_fuse_vsetvl_info (int iter) src_block_info.set_empty_info (); src_block_info.probability = profile_probability::uninitialized (); + /* See PR113696, we should reset immediate dominator to +empty since we may uplift ineffective vsetvl which +locate at low probability block. */ + basic_block dom + = get_immediate_dominator (CDI_DOMINATORS, eg->src); + auto &dom_block_info = get_block_info (dom); + if (dom_block_info.has_info () + && !m_dem.compatible_p ( + dom_block_info.get_exit_info (), curr_info)) + { + dom_block_info.set_empty_info (); + dom_block_info.probability + = profile_probability::uninitialized (); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, +
[PATCH] RISC-V: Disable the vsetvl fusion for conflict successors
This patch fixes the following ineffective vsetvl insertion: #include "riscv_vector.h" void f (int32_t * restrict in, int32_t * restrict out, size_t n, size_t cond, size_t cond2) { for (size_t i = 0; i < n; i++) { if (i == cond) { vint8mf8_t v = *(vint8mf8_t*)(in + i + 100); *(vint8mf8_t*)(out + i + 100) = v; } else if (i == cond2) { vfloat32mf2_t v = *(vfloat32mf2_t*)(in + i + 200); *(vfloat32mf2_t*)(out + i + 200) = v; } else if (i == (cond2 - 1)) { vuint16mf2_t v = *(vuint16mf2_t*)(in + i + 300); *(vuint16mf2_t*)(out + i + 300) = v; } else { vint8mf4_t v = *(vint8mf4_t*)(in + i + 400); *(vint8mf4_t*)(out + i + 400) = v; } } } Before this patch: f: .LFB0: .cfi_startproc beq a2,zero,.L12 addia7,a0,400 addia6,a1,400 addia0,a0,1600 addia1,a1,1600 li a5,0 addit6,a4,-1 vsetvli t3,zero,e8,mf8,ta,ma ---> ineffective uplift .L7: beq a3,a5,.L15 beq a4,a5,.L16 beq t6,a5,.L17 vsetvli t1,zero,e8,mf4,ta,ma vle8.v v1,0(a0) vse8.v v1,0(a1) vsetvli t3,zero,e8,mf8,ta,ma .L4: addia5,a5,1 addia7,a7,4 addia6,a6,4 addia0,a0,4 addia1,a1,4 bne a2,a5,.L7 .L12: ret .L15: vle8.v v1,0(a7) vse8.v v1,0(a6) j .L4 .L17: vsetvli t1,zero,e8,mf4,ta,ma addit5,a0,-400 addit4,a1,-400 vle16.v v1,0(t5) vse16.v v1,0(t4) vsetvli t3,zero,e8,mf8,ta,ma j .L4 .L16: addit5,a0,-800 addit4,a1,-800 vle32.v v1,0(t5) vse32.v v1,0(t4) j .L4 It's obvious that we are hoisting the e8mf8 vsetvl to the top. It's ineffective since e8mf8 comes from low probability block which is if (i == cond). For this case, we disable such fusion. 
After this patch: f: beq a2,zero,.L12 addia7,a0,400 addia6,a1,400 addia0,a0,1600 addia1,a1,1600 li a5,0 addit6,a4,-1 .L7: beq a3,a5,.L15 beq a4,a5,.L16 beq t6,a5,.L17 vsetvli t1,zero,e8,mf4,ta,ma vle8.v v1,0(a0) vse8.v v1,0(a1) .L4: addia5,a5,1 addia7,a7,4 addia6,a6,4 addia0,a0,4 addia1,a1,4 bne a2,a5,.L7 .L12: ret .L15: vsetvli t3,zero,e8,mf8,ta,ma vle8.v v1,0(a7) vse8.v v1,0(a6) j .L4 .L17: addit5,a0,-400 addit4,a1,-400 vsetvli t1,zero,e8,mf4,ta,ma vle16.v v1,0(t5) vse16.v v1,0(t4) j .L4 .L16: addit5,a0,-800 addit4,a1,-800 vsetvli t3,zero,e32,mf2,ta,ma vle32.v v1,0(t5) vse32.v v1,0(t4) j .L4 Tested on both RV32/RV64 no regression. Ok for trunk ? PR target/113696 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pre_vsetvl::earliest_fuse_vsetvl_info): Suppress vsetvl fusion. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/pr113696.c: New test. --- gcc/config/riscv/riscv-vsetvl.cc | 14 ++ .../gcc.target/riscv/rvv/vsetvl/pr113696.c| 26 +++ 2 files changed, 40 insertions(+) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113696.c diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index cec862329c5..79fc2ec2401 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -2959,6 +2959,20 @@ pre_vsetvl::earliest_fuse_vsetvl_info (int iter) src_block_info.set_empty_info (); src_block_info.probability = profile_probability::uninitialized (); + /* See PR113696, we should reset immediate dominator to +empty since we may uplift ineffective vsetvl which +locate at low probability block. */ + basic_block dom + = get_immediate_dominator (CDI_DOMINATORS, eg->src); + auto &dom_block_info = get_block_info (dom); + if (dom_block_info.has_info () + && !m_dem.compatible_p ( + dom_block_info.get_exit_info (), curr_info)) + { + dom_block_info.set_empty_info (); + dom_block_info.probability + = profile_probability::uninitialized (); + } changed = true; } /* Choose the one with higher probability. 
*/ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113696.c b
[PATCH] middle-end: Enhance conditional reduction vectorization by re-association in ifcvt [PR109088]
This patch targets GCC-15. Consider this following case: unsigned int single_loop_with_if_condition (unsigned int *restrict a, unsigned int *restrict b, unsigned int *restrict c, unsigned int loop_size) { unsigned int result = 0; for (unsigned int i = 0; i < loop_size; i++) { if (a[i] > b[i]) { result += a[i] + 0xa - c[i]; } } return result; } After investigation of LLVM, I find LLVM re-associate such case to make it easier to be vectorized. Take RISC-V ASM as example. Before this patch: beq a3,zero,.L5 sllia5,a3,32 srlia3,a5,30 mv a4,a0 add a7,a0,a3 li a0,0 .L4: lw a3,0(a4) addiw a5,a0,10 lw a6,0(a1) addia4,a4,4 addwa5,a5,a3 bgeua6,a3,.L3 lw a0,0(a2) subwa0,a5,a0 .L3: addia1,a1,4 addia2,a2,4 bne a7,a4,.L4 ret .L5: li a0,0 ret After this patch: beq a3,zero,.L4 sllia3,a3,32 srlia3,a3,32 vsetvli a5,zero,e32,m1,ta,ma vmv.v.i v2,0 .L3: vsetvli a5,a3,e32,m1,tu,mu sllia4,a5,2 sub a3,a3,a5 vle32.v v3,0(a0) vle32.v v0,0(a1) add a0,a0,a4 vmsgtu.vv v0,v3,v0 add a1,a1,a4 vle32.v v1,0(a2),v0.t add a2,a2,a4 vadd.vi v1,v1,-10 vsub.vv v1,v1,v3 vadd.vv v2,v2,v1,v0.t bne a3,zero,.L3 li a5,0 vsetivlizero,1,e32,m1,ta,ma vmv.s.x v1,a5 vsetvli a5,zero,e32,m1,ta,ma vredsum.vs v2,v2,v1 vmv.x.s a0,v2 ret PR middle-end/109088 gcc/ChangeLog: * tree-if-conv.cc (is_cond_scalar_reduction): Enhance conditional reduction. (convert_scalar_cond_reduction): Ditto. gcc/testsuite/ChangeLog: * gcc.dg/vect/pr109088-1.c: New test. * gcc.dg/vect/pr109088-2.c: New test. * gcc.dg/vect/pr109088-3.c: New test. * gcc.dg/vect/pr109088-4.c: New test. * gcc.dg/vect/pr109088-5.c: New test. 
--- gcc/testsuite/gcc.dg/vect/pr109088-1.c | 201 gcc/testsuite/gcc.dg/vect/pr109088-2.c | 202 gcc/testsuite/gcc.dg/vect/pr109088-3.c | 314 + gcc/testsuite/gcc.dg/vect/pr109088-4.c | 84 +++ gcc/testsuite/gcc.dg/vect/pr109088-5.c | 96 gcc/tree-if-conv.cc| 150 +++- 6 files changed, 1042 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/pr109088-1.c create mode 100644 gcc/testsuite/gcc.dg/vect/pr109088-2.c create mode 100644 gcc/testsuite/gcc.dg/vect/pr109088-3.c create mode 100644 gcc/testsuite/gcc.dg/vect/pr109088-4.c create mode 100644 gcc/testsuite/gcc.dg/vect/pr109088-5.c diff --git a/gcc/testsuite/gcc.dg/vect/pr109088-1.c b/gcc/testsuite/gcc.dg/vect/pr109088-1.c new file mode 100644 index 000..6772e908535 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr109088-1.c @@ -0,0 +1,201 @@ +/* { dg-require-effective-target vect_int } */ +/* { dg-require-effective-target vect_condition } */ + +#include "tree-vect.h" + +#define N 27 + +#define COND_REDUC(NAME, TYPE, OP) \ + TYPE __attribute__ ((noipa)) \ + cond_##NAME (TYPE *__restrict a, int *__restrict cond1, \ + int *__restrict cond2, TYPE init, int n)\ + { \ +TYPE result = init; \ +for (int i = 0; i < n; i++) \ + if (cond1[i] > cond2[i]) \ + result OP a[i];\ +return result; \ + } + +COND_REDUC (reduc_sum_char, char, +=) +COND_REDUC (reduc_sum_short, short, +=) +COND_REDUC (reduc_sum_int, int, +=) +COND_REDUC (reduc_sum_long, long, +=) +COND_REDUC (reduc_and_char, char, &=) +COND_REDUC (reduc_and_short, short, &=) +COND_REDUC (reduc_and_int, int, &=) +COND_REDUC (reduc_and_long, long, &=) +COND_REDUC (reduc_ior_char, char, |=) +COND_REDUC (reduc_ior_short, short, |=) +COND_REDUC (reduc_ior_int, int, |=) +COND_REDUC (reduc_ior_long, long, |=) +COND_REDUC (reduc_xor_char, char, ^=) +COND_REDUC (reduc_xor_short, short, ^=) +COND_REDUC (reduc_xor_int, int, ^=) +COND_REDUC (reduc_xor_long, long, ^=) + +int +main (void) +{ + check_vect (); + int cond1[N] = {11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 
3, 4, + 5, 6, 7, 8, 9, 10, 21, 22, 23, 24, 25, 26, 27}; + int cond2[N] = {15, 5, 6, 7, 8,
[Committed] RISC-V: Fix regression
Due to recent middle-end loop vectorizer changes, these tests regressed, and the changes are reasonable.
scan-assembler-times {\tvsra\.vv} 4 } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mod-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mod-1.c index 57bbf8fbc68..17d2784b90d 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mod-1.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/mod-1.c @@ -53,5 +53,5 @@ DEF_OP_VV (mod, 128, int64_t, %) DEF_OP_VV (mod, 256, int64_t, %) DEF_OP_VV (mod, 512, int64_t, %) -/* { dg-final { scan-assembler-times {vremu?\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 42 } } */ +/* { dg-final { scan-assembler-times {vremu?\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 47 } } */ /* { dg-final { scan-assembler-not {csrr} } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-1.c index cb5a1dbc9ff..ee8da2573c7 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-1.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-1.c @@ -53,5 +53,5 @@ DEF_OP_VV (shift, 128, int64_t, >>) DEF_OP_VV (shift, 256, int64_t, >>) DEF_OP_VV (shift, 512, int64_t, >>) -/* { dg-final { scan-assembler-times {vsra\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 42 } } */ +/* { dg-final { scan-assembler-times {vsra\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 35 } } */ /* { dg-final { scan-assembler-not {csrr} } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-2.c index e626a52c2d8..ebd5575f267 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-2.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/shift-2.c @@ -53,5 +53,5 @@ DEF_OP_VV (shift, 128, uint64_t, >>) DEF_OP_VV (shift, 256, uint64_t, >>) DEF_OP_VV (shift, 512, uint64_t, >>) -/* { dg-final { scan-assembler-times {vsrl\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 42 } } */ +/* { dg-final { scan-assembler-times {vsrl\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */ /* { dg-final { scan-assembler-not {csrr} } } */ -- 2.36.1
[PATCH] RISC-V: Fix VSETLV PASS compile-time issue
The compile time issue was discovered in SPEC 2017 wrf: Use time and -ftime-report to analyze the profile data of SPEC 2017 wrf compilation . Before this patch (Lazy vsetvl): scheduling : 121.89 ( 15%) 0.53 ( 11%) 122.72 ( 15%) 13M ( 1%) machine dep reorg : 424.61 ( 53%) 1.84 ( 37%) 427.44 ( 53%) 5290k ( 0%) real13m27.074s user13m19.539s sys 0m5.180s Simple vsetvl: machine dep reorg : 0.10 ( 0%) 0.00 ( 0%) 0.11 ( 0%) 4138k ( 0%) real6m5.780s user6m2.396s sys 0m2.373s The machine dep reorg is the compile time of VSETVL PASS (424 seconds) which counts 53% of the compilation time, spends much more time than scheduling. After investigation, the critical patch of VSETVL pass is compute_lcm_local_properties which is called every iteration of phase 2 (earliest fusion) and phase 3 (global lcm). This patch optimized the codes of compute_lcm_local_properties to reduce the compilation time. After this patch: scheduling : 117.51 ( 27%) 0.21 ( 6%) 118.04 ( 27%) 13M ( 1%) machine dep reorg : 80.13 ( 18%) 0.91 ( 26%) 81.26 ( 18%) 5290k ( 0%) real7m25.374s user7m20.116s sys 0m3.795s The optimization of this patch is very obvious, lazy VSETVL PASS: 424s (53%) -> 80s (18%) which spend less time than scheduling. Tested on both RV32 and RV64 no regression. Ok for trunk ? PR target/113495 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (extract_single_source): Remove. (pre_vsetvl::compute_vsetvl_def_data): Fix compile time issue. (pre_vsetvl::compute_transparent): New function. (pre_vsetvl::compute_lcm_local_properties): Fix compile time time issue. 
--- gcc/config/riscv/riscv-vsetvl.cc | 184 ++- 1 file changed, 60 insertions(+), 124 deletions(-) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index d7b40a5c813..cec862329c5 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -599,14 +599,6 @@ extract_single_source (set_info *set) return first_insn; } -static insn_info * -extract_single_source (def_info *def) -{ - if (!def) -return nullptr; - return extract_single_source (dyn_cast (def)); -} - static bool same_equiv_note_p (set_info *set1, set_info *set2) { @@ -2374,6 +2366,7 @@ public: } void compute_vsetvl_def_data (); + void compute_transparent (const bb_info *); void compute_lcm_local_properties (); void fuse_local_vsetvl_info (); @@ -2452,20 +2445,16 @@ pre_vsetvl::compute_vsetvl_def_data () { for (unsigned i = 0; i < m_vsetvl_def_exprs.length (); i += 1) { - const vsetvl_info &info = *m_vsetvl_def_exprs[i]; - if (!info.has_nonvlmax_reg_avl ()) - continue; - unsigned int regno; - sbitmap_iterator sbi; - EXECUTE_IF_SET_IN_BITMAP (m_reg_def_loc[bb->index ()], 0, regno, - sbi) - if (regno == REGNO (info.get_avl ())) - { - bitmap_set_bit (m_kill[bb->index ()], i); - bitmap_set_bit (def_loc[bb->index ()], - get_expr_index (m_vsetvl_def_exprs, - m_unknow_info)); - } + auto *info = m_vsetvl_def_exprs[i]; + if (info->has_nonvlmax_reg_avl () + && bitmap_bit_p (m_reg_def_loc[bb->index ()], + REGNO (info->get_avl ( + { + bitmap_set_bit (m_kill[bb->index ()], i); + bitmap_set_bit (def_loc[bb->index ()], + get_expr_index (m_vsetvl_def_exprs, + m_unknow_info)); + } } continue; } @@ -2516,6 +2505,36 @@ pre_vsetvl::compute_vsetvl_def_data () sbitmap_vector_free (m_kill); } +/* Subroutine of compute_lcm_local_properties which Compute local transparent + BB. 
Note that the compile time is very sensitive to compute_transparent and + compute_lcm_local_properties, any change of these 2 functions should be + aware of the compile time changing of the program which has a large number of + blocks, e.g SPEC 2017 wrf. + + Current compile time profile of SPEC 2017 wrf: + + 1. scheduling - 27% + 2. machine dep reorg (VSETVL PASS) - 18% + + VSETVL pass should not spend more time than scheduling in compilation. */ +void +pre_vsetvl::compute_transparent (const bb_info *bb) +{ + int num_exprs = m_exprs.length (); + unsigned bb_index = bb->index (); + for (int i = 0; i < num_exprs; i++) +{ + auto *info = m_exprs[i]; + if (info->has_nonvlmax_reg_avl () + && bitmap_bit_p (m_reg_def_loc[bb_index], REGNO (
[Committed] RISC-V: Refine some code of VSETVL PASS [NFC]
gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pre_vsetvl::earliest_fuse_vsetvl_info): Refine some codes. (pre_vsetvl::emit_vsetvl): Ditto. --- gcc/config/riscv/riscv-vsetvl.cc | 69 +--- 1 file changed, 27 insertions(+), 42 deletions(-) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 1a398f02596..d7b40a5c813 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -2930,28 +2930,19 @@ pre_vsetvl::earliest_fuse_vsetvl_info (int iter) EXECUTE_IF_SET_IN_BITMAP (e, 0, expr_index, sbi) { vsetvl_info &curr_info = *m_exprs[expr_index]; - if (!curr_info.valid_p ()) - continue; - edge eg = INDEX_EDGE (m_edges, ed); - if (eg->probability == profile_probability::never ()) - continue; - if (eg->src == ENTRY_BLOCK_PTR_FOR_FN (cfun) - || eg->dest == EXIT_BLOCK_PTR_FOR_FN (cfun)) - continue; - - /* When multiple set bits in earliest edge, such edge may -have infinite loop in preds or succs or multiple conflict -vsetvl expression which make such edge is unrelated. We -don't perform fusion for such situation. */ - if (bitmap_count_bits (e) != 1) - continue; - vsetvl_block_info &src_block_info = get_block_info (eg->src); vsetvl_block_info &dest_block_info = get_block_info (eg->dest); - if (src_block_info.probability - == profile_probability::uninitialized ()) + if (!curr_info.valid_p () + || eg->probability == profile_probability::never () + || src_block_info.probability + == profile_probability::uninitialized () + /* When multiple set bits in earliest edge, such edge may +have infinite loop in preds or succs or multiple conflict +vsetvl expression which make such edge is unrelated. We +don't perform fusion for such situation. 
*/ + || bitmap_count_bits (e) != 1) continue; if (src_block_info.empty_p ()) @@ -3058,29 +3049,27 @@ pre_vsetvl::earliest_fuse_vsetvl_info (int iter) { vsetvl_info &prev_info = src_block_info.get_exit_info (); if (!prev_info.valid_p () - || m_dem.available_p (prev_info, curr_info)) + || m_dem.available_p (prev_info, curr_info) + || !m_dem.compatible_p (prev_info, curr_info)) continue; - if (m_dem.compatible_p (prev_info, curr_info)) + if (dump_file && (dump_flags & TDF_DETAILS)) { - if (dump_file && (dump_flags & TDF_DETAILS)) - { - fprintf (dump_file, "Fuse curr info since prev info " - "compatible with it:\n"); - fprintf (dump_file, " prev_info: "); - prev_info.dump (dump_file, ""); - fprintf (dump_file, " curr_info: "); - curr_info.dump (dump_file, ""); - } - m_dem.merge (prev_info, curr_info); - if (dump_file && (dump_flags & TDF_DETAILS)) - { - fprintf (dump_file, " prev_info after fused: "); - prev_info.dump (dump_file, ""); - fprintf (dump_file, "\n"); - } - changed = true; + fprintf (dump_file, "Fuse curr info since prev info " + "compatible with it:\n"); + fprintf (dump_file, " prev_info: "); + prev_info.dump (dump_file, ""); + fprintf (dump_file, " curr_info: "); + curr_info.dump (dump_file, ""); + } + m_dem.merge (prev_info, curr_info); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, " prev_info after fused: "); + prev_info.dump (dump_file, ""); + fprintf (dump_file, "\n"); } + changed = true; } } } @@ -3344,15 +,11 @@ pre_vsetvl::emit_vsetvl () { edge eg = INDEX_EDGE (m_edges, ed); sbitmap i = m_insert[ed]; - if (bitmap_count_bits (i) < 1) - continue; - - if (bitmap_count_bits (i) > 1) + if (bitmap_count_bits (i) != 1) /* For code with infinite loop (e.g. pr61634.c), The data flow is completely wrong. */ continue; - gcc_assert (bitmap_count_bits (i) == 1); unsigned expr_index = bitmap_first_set_bit (i); const vsetvl_info &info = *m_exprs[expr_index]; gcc_assert (info.valid_p ()); -
[Committed V2] RISC-V: Fix incorrect LCM delete bug [VSETVL PASS]
This patch fixes the recent noticed bug in RV32 glibc. We incorrectly deleted a vsetvl: ... and a4,a4,a3 vmv.v.i v1,0 ---> Missed vsetvl cause illegal instruction report. vse8.v v1,0(a5) The root cause the laterin in LCM is incorrect. BB 358: avloc: n_bits = 2, set = {} kill: n_bits = 2, set = {} antloc: n_bits = 2, set = {} transp: n_bits = 2, set = {} avin: n_bits = 2, set = {} avout: n_bits = 2, set = {} del: n_bits = 2, set = {} cause LCM let BB 360 delete the vsetvl: BB 360: avloc: n_bits = 2, set = {} kill: n_bits = 2, set = {} antloc: n_bits = 2, set = {} transp: n_bits = 2, set = {0 1 } avin: n_bits = 2, set = {} avout: n_bits = 2, set = {} del: n_bits = 2, set = {1} Also, remove unknown vsetvl info into local computation since it is unnecessary. Tested on both RV32/RV64 no regression. PR target/113469 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pre_vsetvl::compute_lcm_local_properties): Fix bug. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/pr113469.c: New test. --- gcc/config/riscv/riscv-vsetvl.cc | 19 +++ .../gcc.target/riscv/rvv/autovec/pr113469.c | 54 +++ 2 files changed, 64 insertions(+), 9 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index da258b964fc..1a398f02596 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -2543,8 +2543,10 @@ pre_vsetvl::compute_lcm_local_properties () vsetvl_info &header_info = block_info.get_entry_info (); vsetvl_info &footer_info = block_info.get_exit_info (); gcc_assert (footer_info.valid_p () || footer_info.unknown_p ()); - add_expr (m_exprs, header_info); - add_expr (m_exprs, footer_info); + if (header_info.valid_p ()) + add_expr (m_exprs, header_info); + if (footer_info.valid_p ()) + add_expr (m_exprs, footer_info); } int num_exprs = m_exprs.length (); @@ -2699,13 +2701,6 @@ pre_vsetvl::compute_lcm_local_properties () } } - for (const bb_info *bb : 
crtl->ssa->bbs ()) -{ - unsigned bb_index = bb->index (); - bitmap_ior (m_kill[bb_index], m_transp[bb_index], m_avloc[bb_index]); - bitmap_not (m_kill[bb_index], m_kill[bb_index]); -} - for (const bb_info *bb : crtl->ssa->bbs ()) { unsigned bb_index = bb->index (); @@ -2714,6 +2709,12 @@ pre_vsetvl::compute_lcm_local_properties () bitmap_clear (m_antloc[bb_index]); bitmap_clear (m_transp[bb_index]); } + /* Compute ae_kill for each basic block using: + +~(TRANSP | COMP) + */ + bitmap_ior (m_kill[bb_index], m_transp[bb_index], m_avloc[bb_index]); + bitmap_not (m_kill[bb_index], m_kill[bb_index]); } } diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c new file mode 100644 index 000..d1c118c02d6 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c @@ -0,0 +1,54 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3 -fno-vect-cost-model" } */ + +struct a { + int b; + int c : 1; + int : 1; +} d(); +typedef struct +{ + int e; + struct { + int f; + }; +} g; +int i; +char k, l, n; +void *m; +char *o; +void h(); +char *j(); +void p(int buf, __builtin_va_list ab, int q) { + do { + void *r[] = {&&s, &&t, &&u, &&v, &&w}; + int c; + goto *m; + s: + c = 1; + while (1) { + t: + u: + ae: + void *af = __builtin_va_arg(ab, void *); + h(p); + o = j(i); + if (o == 0) + goto ae; + l = 'S'; + break; + v: + g ah; + __builtin_memset(&ah, '\0', sizeof(g)); + h(n, __builtin_va_arg(ab, int), &ah); + break; + w: + if (__builtin_expect(q, 0)) + c = 0; + struct a ai = {'S', c}; + d(buf, ai, af); + } + } while (k); +} + +/* { dg-final { scan-assembler-times {vsetivli\tzero,\s*4,\s*e8,\s*mf4,\s*t[au],\s*m[au]} 2 } } */ -- 2.36.3
[PATCH] RISC-V: Fix incorrect LCM delete bug [VSETVL PASS]
This patch fixes the recent noticed bug in RV32 glibc. We incorrectly deleted a vsetvl: ... and a4,a4,a3 vmv.v.i v1,0 ---> Missed vsetvl cause illegal instruction report. vse8.v v1,0(a5) The root cause the laterin in LCM is incorrect. BB 358: avloc: n_bits = 2, set = {} kill: n_bits = 2, set = {} antloc: n_bits = 2, set = {} transp: n_bits = 2, set = {} avin: n_bits = 2, set = {} avout: n_bits = 2, set = {} del: n_bits = 2, set = {} cause LCM let BB 360 delete the vsetvl: BB 360: avloc: n_bits = 2, set = {} kill: n_bits = 2, set = {} antloc: n_bits = 2, set = {} transp: n_bits = 2, set = {0 1 } avin: n_bits = 2, set = {} avout: n_bits = 2, set = {} del: n_bits = 2, set = {1} Also, remove unknown vsetvl info into local computation since it is unnecessary. Tested on both RV32/RV64 no regression. PR target/113469 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pre_vsetvl::compute_lcm_local_properties): Fix bug. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/pr113469.c: New test. --- gcc/config/riscv/riscv-vsetvl.cc | 21 +- .../gcc.target/riscv/rvv/autovec/pr113469.c | 1841 + 2 files changed, 1853 insertions(+), 9 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index da258b964fc..f300f00e62a 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -2543,8 +2543,10 @@ pre_vsetvl::compute_lcm_local_properties () vsetvl_info &header_info = block_info.get_entry_info (); vsetvl_info &footer_info = block_info.get_exit_info (); gcc_assert (footer_info.valid_p () || footer_info.unknown_p ()); - add_expr (m_exprs, header_info); - add_expr (m_exprs, footer_info); + if (header_info.valid_p ()) + add_expr (m_exprs, header_info); + if (footer_info.valid_p ()) + add_expr (m_exprs, footer_info); } int num_exprs = m_exprs.length (); @@ -2699,13 +2701,6 @@ pre_vsetvl::compute_lcm_local_properties () } } - for (const bb_info *bb : 
crtl->ssa->bbs ()) -{ - unsigned bb_index = bb->index (); - bitmap_ior (m_kill[bb_index], m_transp[bb_index], m_avloc[bb_index]); - bitmap_not (m_kill[bb_index], m_kill[bb_index]); -} - for (const bb_info *bb : crtl->ssa->bbs ()) { unsigned bb_index = bb->index (); @@ -2713,8 +2708,16 @@ pre_vsetvl::compute_lcm_local_properties () { bitmap_clear (m_antloc[bb_index]); bitmap_clear (m_transp[bb_index]); + bitmap_clear (m_avloc[bb_index]); } } + + for (const bb_info *bb : crtl->ssa->bbs ()) +{ + unsigned bb_index = bb->index (); + bitmap_ior (m_kill[bb_index], m_transp[bb_index], m_avloc[bb_index]); + bitmap_not (m_kill[bb_index], m_kill[bb_index]); +} } void diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c new file mode 100644 index 000..2502040772b --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113469.c @@ -0,0 +1,1841 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3" } */ + +#include +#include +#include +#include +#include + +static int read_int (const unsigned char * *pstr) {}; +static const char null[] = "(null)"; +extern size_t __strnlen (const char *__string, size_t __maxlen) __attribute__ ((__pure__)); + +struct printf_info +{ + int prec; + int width; + wchar_t spec; + unsigned int is_long_double:1; + unsigned int is_short:1; + unsigned int is_long:1; + unsigned int alt:1; + unsigned int space:1; + unsigned int left:1; + unsigned int showsign:1; + unsigned int group:1; + unsigned int extra:1; + unsigned int is_char:1; + unsigned int wide:1; + unsigned int i18n:1; + unsigned int is_binary128:1; + + unsigned int __pad:3; + unsigned short int user; + wchar_t pad; +}; + +enum { + ABDAY_1 = (((2) << 16) | (0)), + ABDAY_2, + ABDAY_3, + ABDAY_4, + ABDAY_5, + ABDAY_6, + ABDAY_7, + DAY_1, + DAY_2, + DAY_3, + DAY_4, + DAY_5, + DAY_6, + DAY_7, + ABMON_1, + ABMON_2, + ABMON_3, + ABMON_4, + ABMON_5, + ABMON_6, + ABMON_7, + ABMON_8, + ABMON_9, + ABMON_10, 
+ ABMON_11, + ABMON_12, + MON_1, + MON_2, + MON_3, + MON_4, + MON_5, + MON_6, + MON_7, + MON_8, + MON_9, + MON_10, + MON_11, + MON_12, + AM_STR, + PM_STR, + D_T_FMT, + D_FMT, + T_FMT, + T_FMT_AMPM, + ERA, + __ERA_YEAR, + ERA_D_FMT, + + ALT_DIGITS, + + ERA_D_T_FMT, + + ERA_T_FMT, + _NL_TIME_ERA_NUM_ENTRIES, + _NL_TIME_ERA_ENTRIES, + + _NL_WABDAY_1, + _NL_WABDAY_2, + _NL_WABDAY_3, + _NL_WABDAY_4, + _NL_WABDAY_5, + _NL_WABDAY_6, + _NL_WABDAY_7, + _NL_WDA
[PATCH] RISC-V: Add LCM delete block predecessors dump information
While looking into PR113469, I notice the LCM delete a vsetvl incorrectly. This patch add dump information of all predecessors for LCM delete vsetvl block for better debugging. Tested no regression. gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (get_all_predecessors): New function. (pre_vsetvl::pre_global_vsetvl_info): Add LCM delete block all predecessors dump information. --- gcc/config/riscv/riscv-vsetvl.cc | 42 1 file changed, 42 insertions(+) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 72c5a127d9e..da258b964fc 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -654,6 +654,31 @@ invalid_opt_bb_p (basic_block cfg_bb) return false; } +/* Get all predecessors of BB. */ +static hash_set +get_all_predecessors (basic_block bb) +{ + hash_set blocks; + auto_vec work_list; + hash_set visited_list; + work_list.safe_push (bb); + + while (!work_list.is_empty ()) +{ + basic_block new_bb = work_list.pop (); + visited_list.add (new_bb); + edge e; + edge_iterator ei; + FOR_EACH_EDGE (e, ei, new_bb->preds) + { + if (!visited_list.contains (e->src)) + work_list.safe_push (e->src); + blocks.add (e->src); + } +} + return blocks; +} + /* This flags indicates the minimum demand of the vl and vtype values by the RVV instruction. 
For example, DEMAND_RATIO_P indicates that this RVV instruction only needs the SEW/LMUL ratio to remain the same, and does not @@ -3142,6 +3167,23 @@ pre_vsetvl::pre_global_vsetvl_info () const vsetvl_block_info &block_info = get_block_info (info.get_bb ()); gcc_assert (block_info.get_entry_info () == info); info.set_delete (); + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, + "\nLCM deleting vsetvl of block %d, it has predecessors: \n", + bb->index ()); + hash_set all_preds + = get_all_predecessors (bb->cfg_bb ()); + int i = 0; + for (const auto pred : all_preds) + { + fprintf (dump_file, "%d ", pred->index); + i++; + if (i % 32 == 0) + fprintf (dump_file, "\n"); + } + fprintf (dump_file, "\n"); + } } /* Remove vsetvl infos if all precessors are available to the block. */ -- 2.36.3
[Committed] RISC-V: Remove redundant full available computation [NFC]
Notice full available is computed evey round of earliest fusion which is redundant. Actually we only need to compute it once in phase 3. It's NFC patch and tested no regression. Committed. gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pre_vsetvl::compute_vsetvl_def_data): Remove redundant full available computation. (pre_vsetvl::pre_global_vsetvl_info): Ditto. --- gcc/config/riscv/riscv-vsetvl.cc | 57 +--- 1 file changed, 23 insertions(+), 34 deletions(-) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 53d954e1dff..72c5a127d9e 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -1256,9 +1256,7 @@ public: vsetvl_info global_info; bb_info *bb; - bool full_available; - - vsetvl_block_info () : bb (nullptr), full_available (false) + vsetvl_block_info () : bb (nullptr) { local_infos.safe_grow_cleared (0); global_info.set_empty (); @@ -2489,34 +2487,6 @@ pre_vsetvl::compute_vsetvl_def_data () } } - for (const bb_info *bb : crtl->ssa->bbs ()) -{ - vsetvl_block_info &block_info = get_block_info (bb); - if (block_info.empty_p ()) - continue; - vsetvl_info &curr_info = block_info.get_entry_info (); - if (!curr_info.valid_p ()) - continue; - - unsigned int expr_index; - sbitmap_iterator sbi; - gcc_assert ( - !bitmap_empty_p (m_vsetvl_def_in[curr_info.get_bb ()->index ()])); - bool full_available = true; - EXECUTE_IF_SET_IN_BITMAP (m_vsetvl_def_in[bb->index ()], 0, expr_index, - sbi) - { - vsetvl_info &prev_info = *m_vsetvl_def_exprs[expr_index]; - if (!prev_info.valid_p () - || !m_dem.available_p (prev_info, curr_info)) - { - full_available = false; - break; - } - } - block_info.full_available = full_available; -} - sbitmap_vector_free (def_loc); sbitmap_vector_free (m_kill); } @@ -3178,11 +3148,30 @@ pre_vsetvl::pre_global_vsetvl_info () for (const bb_info *bb : crtl->ssa->bbs ()) { vsetvl_block_info &block_info = get_block_info (bb); - if (block_info.empty_p () || !block_info.full_available) + if 
(block_info.empty_p ()) + continue; + vsetvl_info &curr_info = block_info.get_entry_info (); + if (!curr_info.valid_p ()) continue; - vsetvl_info &info = block_info.get_entry_info (); - info.set_delete (); + unsigned int expr_index; + sbitmap_iterator sbi; + gcc_assert ( + !bitmap_empty_p (m_vsetvl_def_in[curr_info.get_bb ()->index ()])); + bool full_available = true; + EXECUTE_IF_SET_IN_BITMAP (m_vsetvl_def_in[bb->index ()], 0, expr_index, + sbi) + { + vsetvl_info &prev_info = *m_vsetvl_def_exprs[expr_index]; + if (!prev_info.valid_p () + || !m_dem.available_p (prev_info, curr_info)) + { + full_available = false; + break; + } + } + if (full_available) + curr_info.set_delete (); } for (const bb_info *bb : crtl->ssa->bbs ()) -- 2.36.3
[Committed] RISC-V: Add optim-no-fusion compile option [VSETVL PASS]
This patch adds no fusion compile option to disable phase 2 global fusion. It can help us to analyze the compile-time and debugging. Committed. gcc/ChangeLog: * config/riscv/riscv-opts.h (enum vsetvl_strategy_enum): Add optim-no-fusion option. * config/riscv/riscv-vsetvl.cc (pass_vsetvl::lazy_vsetvl): Ditto. (pass_vsetvl::execute): Ditto. * config/riscv/riscv.opt: Ditto. --- gcc/config/riscv/riscv-opts.h| 8 +--- gcc/config/riscv/riscv-vsetvl.cc | 22 -- gcc/config/riscv/riscv.opt | 5 - 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/gcc/config/riscv/riscv-opts.h b/gcc/config/riscv/riscv-opts.h index ca57dddf1d9..1500f8811ef 100644 --- a/gcc/config/riscv/riscv-opts.h +++ b/gcc/config/riscv/riscv-opts.h @@ -118,11 +118,13 @@ enum stringop_strategy_enum { /* Behavior of VSETVL Pass. */ enum vsetvl_strategy_enum { - /* Simple: Insert a vsetvl* instruction for each Vector instruction. */ - VSETVL_SIMPLE = 1, /* Optimized: Run LCM dataflow analysis to reduce vsetvl* insns and delete any redundant ones generated in the process. */ - VSETVL_OPT = 2 + VSETVL_OPT, + /* Simple: Insert a vsetvl* instruction for each Vector instruction. */ + VSETVL_SIMPLE, + /* No fusion: Disable Phase 2 earliest global fusion. */ + VSETVL_OPT_NO_FUSION, }; #define TARGET_ZICOND_LIKE (TARGET_ZICOND || (TARGET_XVENTANACONDOPS && TARGET_64BIT)) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 170fc7f003d..53d954e1dff 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -3495,16 +3495,18 @@ pass_vsetvl::lazy_vsetvl () /* Phase 2: Fuse header and footer vsetvl infos between basic blocks. 
*/ if (dump_file) fprintf (dump_file, "\nPhase 2: Lift up vsetvl info.\n\n"); - bool changed; - int fused_count = 0; - do + if (vsetvl_strategy != VSETVL_OPT_NO_FUSION) { - if (dump_file) - fprintf (dump_file, " Try lift up %d.\n\n", fused_count); - changed = pre.earliest_fuse_vsetvl_info (fused_count); - fused_count += 1; - } while (changed); - + bool changed = true; + int fused_count = 0; + do + { + if (dump_file) + fprintf (dump_file, " Try lift up %d.\n\n", fused_count); + changed = pre.earliest_fuse_vsetvl_info (fused_count); + fused_count += 1; + } while (changed); +} if (dump_file && (dump_flags & TDF_DETAILS)) pre.dump (dump_file, "phase 2"); @@ -3545,7 +3547,7 @@ pass_vsetvl::execute (function *) if (!has_vector_insn (cfun)) return 0; - if (!optimize || vsetvl_strategy & VSETVL_SIMPLE) + if (!optimize || vsetvl_strategy == VSETVL_SIMPLE) simple_vsetvl (); else lazy_vsetvl (); diff --git a/gcc/config/riscv/riscv.opt b/gcc/config/riscv/riscv.opt index 65c656204ca..7c2292d8f91 100644 --- a/gcc/config/riscv/riscv.opt +++ b/gcc/config/riscv/riscv.opt @@ -552,11 +552,14 @@ Enum Name(vsetvl_strategy) Type(enum vsetvl_strategy_enum) Valid arguments to -param=vsetvl-strategy=: +EnumValue +Enum(vsetvl_strategy) String(optim) Value(VSETVL_OPT) + EnumValue Enum(vsetvl_strategy) String(simple) Value(VSETVL_SIMPLE) EnumValue -Enum(vsetvl_strategy) String(optim) Value(VSETVL_OPT) +Enum(vsetvl_strategy) String(optim-no-fusion) Value(VSETVL_OPT_NO_FUSION) -param=vsetvl-strategy= Target Undocumented RejectNegative Joined Enum(vsetvl_strategy) Var(vsetvl_strategy) Init(VSETVL_OPT) -- 2.36.3
[PATCH] RISC-V: Fix large memory usage of VSETVL PASS [PR113495]
SPEC 2017 wrf benchmark expose unreasonble memory usage of VSETVL PASS that is, VSETVL PASS consume over 33 GB memory which make use impossible to compile SPEC 2017 wrf in a laptop. The root cause is wasting-memory variables: unsigned num_exprs = num_bbs * num_regs; sbitmap *avl_def_loc = sbitmap_vector_alloc (num_bbs, num_exprs); sbitmap *m_kill = sbitmap_vector_alloc (num_bbs, num_exprs); m_avl_def_in = sbitmap_vector_alloc (num_bbs, num_exprs); m_avl_def_out = sbitmap_vector_alloc (num_bbs, num_exprs); I find that compute_avl_def_data can be achieved by RTL_SSA framework. Replace the code implementation base on RTL_SSA framework. After this patch, the memory-hog issue is fixed. simple vsetvl memory usage (valgrind --tool=massif --pages-as-heap=yes --massif-out-file=massif.out) is 1.673 GB. lazy vsetvl memory usage (valgrind --tool=massif --pages-as-heap=yes --massif-out-file=massif.out) is 2.441 GB. Tested on both RV32 and RV64, no regression. gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (get_expr_id): Remove. (get_regno): Ditto. (get_bb_index): Ditto. (pre_vsetvl::compute_avl_def_data): Ditto. (pre_vsetvl::earliest_fuse_vsetvl_info): Fix large memory usage. (pre_vsetvl::pre_global_vsetvl_info): Ditto. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/avl_single-107.c: Adapt test. 
--- gcc/config/riscv/riscv-vsetvl.cc | 233 -- .../riscv/rvv/vsetvl/avl_single-107.c | 2 +- 2 files changed, 52 insertions(+), 183 deletions(-) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 54c85ffb7d5..170fc7f003d 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -617,22 +617,6 @@ same_equiv_note_p (set_info *set1, set_info *set2) return source_equal_p (insn1, insn2); } -static unsigned -get_expr_id (unsigned bb_index, unsigned regno, unsigned num_bbs) -{ - return regno * num_bbs + bb_index; -} -static unsigned -get_regno (unsigned expr_id, unsigned num_bb) -{ - return expr_id / num_bb; -} -static unsigned -get_bb_index (unsigned expr_id, unsigned num_bb) -{ - return expr_id % num_bb; -} - /* Return true if the SET result is not used by any instructions. */ static bool has_no_uses (basic_block cfg_bb, rtx_insn *rinsn, int regno) @@ -1337,9 +1321,6 @@ public: class demand_system { private: - sbitmap *m_avl_def_in; - sbitmap *m_avl_def_out; - /* predictors. */ inline bool always_true (const vsetvl_info &prev ATTRIBUTE_UNUSED, @@ -1743,14 +1724,6 @@ private: } public: - demand_system () : m_avl_def_in (nullptr), m_avl_def_out (nullptr) {} - - void set_avl_in_out_data (sbitmap *m_avl_def_in, sbitmap *m_avl_def_out) - { -m_avl_def_in = m_avl_def_in; -m_avl_def_out = m_avl_def_out; - } - /* Can we move vsetvl info between prev_insn and next_insn safe? 
*/ bool avl_vl_unmodified_between_p (insn_info *prev_insn, insn_info *next_insn, const vsetvl_info &info, @@ -1778,32 +1751,66 @@ public: } else { + basic_block prev_cfg_bb = prev_insn->bb ()->cfg_bb (); if (!ignore_vl && info.has_vl ()) { - bitmap live_out = df_get_live_out (prev_insn->bb ()->cfg_bb ()); + bitmap live_out = df_get_live_out (prev_cfg_bb); if (bitmap_bit_p (live_out, REGNO (info.get_vl ( return false; } - if (info.has_nonvlmax_reg_avl () && m_avl_def_in && m_avl_def_out) + /* Find set_info at location of PREV_INSN and NEXT_INSN, Return + false if those 2 set_info are different. + +PREV_INSN --- multiple nested blocks --- NEXT_INSN. + + Return false if there is any modifications of AVL inside those + multiple nested blocks. */ + if (info.has_nonvlmax_reg_avl ()) { - bool has_avl_out = false; - unsigned regno = REGNO (info.get_avl ()); - unsigned expr_id; - sbitmap_iterator sbi; - EXECUTE_IF_SET_IN_BITMAP (m_avl_def_out[prev_insn->bb ()->index ()], - 0, expr_id, sbi) + resource_info resource = full_register (REGNO (info.get_avl ())); + def_lookup dl1 = crtl->ssa->find_def (resource, prev_insn); + def_lookup dl2 = crtl->ssa->find_def (resource, next_insn); + if (dl2.matching_set ()) + return false; + + auto is_phi_or_real + = [&] (insn_info *h) { return h->is_real () || h->is_phi (); }; + + def_info *def1 = dl1.matching_set_or_last_def_of_prev_group (); + def_info *def2 = dl2.prev_def (next_insn); + set_info *set1 = safe_dyn_cast (def1); + set_info *set2 = safe_dyn_cast (def2); + if (!set1 || !set2) + return false; + + auto is_same_ultimate_def = [&] (set_info *s1, set_info *s2) { +
[PATCH] RISC-V: Fix regressions due to 86de9b66480b710202a2898cf513db105d8c432f
This patch fixes the recent regression: FAIL: gcc.dg/torture/float32-tg-2.c -O1 (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg-2.c -O1 (test for excess errors) FAIL: gcc.dg/torture/float32-tg-2.c -O2 (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg-2.c -O2 (test for excess errors) FAIL: gcc.dg/torture/float32-tg-2.c -O2 -flto -fno-use-linker-plugin -flto-partition=none (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg-2.c -O2 -flto -fno-use-linker-plugin -flto-partition=none (test for excess errors) FAIL: gcc.dg/torture/float32-tg-2.c -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg-2.c -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects (test for excess errors) FAIL: gcc.dg/torture/float32-tg-2.c -O3 -g (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg-2.c -O3 -g (test for excess errors) FAIL: gcc.dg/torture/float32-tg-2.c -Os (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg-2.c -Os (test for excess errors) FAIL: gcc.dg/torture/float32-tg.c -O1 (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg.c -O1 (test for excess errors) FAIL: gcc.dg/torture/float32-tg.c -O2 (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg.c -O2 (test for excess errors) FAIL: gcc.dg/torture/float32-tg.c -O2 -flto -fno-use-linker-plugin -flto-partition=none (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg.c -O2 -flto -fno-use-linker-plugin -flto-partition=none (test for excess errors) FAIL: gcc.dg/torture/float32-tg.c -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: 
gcc.dg/torture/float32-tg.c -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects (test for excess errors) FAIL: gcc.dg/torture/float32-tg.c -O3 -g (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg.c -O3 -g (test for excess errors) FAIL: gcc.dg/torture/float32-tg.c -Os (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/float32-tg.c -Os (test for excess errors) FAIL: gcc.dg/torture/pr48124-4.c -O1 (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/pr48124-4.c -O1 (test for excess errors) FAIL: gcc.dg/torture/pr48124-4.c -O2 (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/pr48124-4.c -O2 (test for excess errors) FAIL: gcc.dg/torture/pr48124-4.c -O2 -flto -fno-use-linker-plugin -flto-partition=none (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/pr48124-4.c -O2 -flto -fno-use-linker-plugin -flto-partition=none (test for excess errors) FAIL: gcc.dg/torture/pr48124-4.c -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/pr48124-4.c -O2 -flto -fuse-linker-plugin -fno-fat-lto-objects (test for excess errors) FAIL: gcc.dg/torture/pr48124-4.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/pr48124-4.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (test for excess errors) FAIL: gcc.dg/torture/pr48124-4.c -O3 -g (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/pr48124-4.c -O3 -g (test for excess errors) FAIL: gcc.dg/torture/pr48124-4.c -Os (internal compiler error: in reg_or_subregno, at jump.cc:1895) FAIL: gcc.dg/torture/pr48124-4.c -Os (test for excess errors) due to commit 86de9b66480b710202a2898cf513db105d8c432f. 
The root cause is register_operand and reg_or_subregno are consistent so we reach the assertion fail. We shouldn't worry about subreg:...VL_REGNUM since it's impossible that we can have such situation, that is, we only have (set (reg) (reg:VL_REGNUM)) which generate "csrr vl" ASM for first fault load instructions (vleff). So, using REG_P and REGNO must be totally solid and robostic. Since we don't allow VL_RENUM involved into register allocation and we don't have such constraint, we always use this following pattern to generate "csrr vl" ASM: (define_insn "read_vlsi" [(set (match_operand:SI 0 "register_operand" "=r") (reg:SI VL_REGNUM))] "TARGET_VECTOR" "csrr\t%0,vl" [(set_attr "type" "rdvl") (set_attr "mode" "SI")]) So the check in riscv.md is to disallow such situation fall into move pattern in riscv.md Tested on both RV32/RV64 no
[PATCH] RISC-V: Lower vmv.v.x (avl = 1) into vmv.s.x
Notice there is an AI benchmark, GCC vs Clang has 3% performance drop. It's because Clang/LLVM has a simplification transform vmv.v.x (avl = 1) into vmv.s.x. Since vmv.s.x has more flexible vsetvl demand than vmv.v.x that can allow us to have better chances to fuse vsetvl. Consider this following case: void foo (uint32_t *outputMat, uint32_t *inputMat) { vuint32m1_t matRegIn0 = __riscv_vle32_v_u32m1 (inputMat, 4); vuint32m1_t matRegIn1 = __riscv_vle32_v_u32m1 (inputMat + 4, 4); vuint32m1_t matRegIn2 = __riscv_vle32_v_u32m1 (inputMat + 8, 4); vuint32m1_t matRegIn3 = __riscv_vle32_v_u32m1 (inputMat + 12, 4); vbool32_t oddMask = __riscv_vreinterpret_v_u32m1_b32 (__riscv_vmv_v_x_u32m1 (0xAAAA, 1)); vuint32m1_t smallTransposeMat0 = __riscv_vslideup_vx_u32m1_tumu (oddMask, matRegIn0, matRegIn1, 1, 4); vuint32m1_t smallTransposeMat2 = __riscv_vslideup_vx_u32m1_tumu (oddMask, matRegIn2, matRegIn3, 1, 4); vuint32m1_t outMat0 = __riscv_vslideup_vx_u32m1_tu (smallTransposeMat0, smallTransposeMat2, 2, 4); __riscv_vse32_v_u32m1 (outputMat, outMat0, 4); } Before this patch: vsetivlizero,4,e32,m1,ta,ma li a5,45056 addia2,a1,16 addia3,a1,32 addia4,a1,48 vle32.v v1,0(a1) vle32.v v4,0(a2) vle32.v v2,0(a3) vle32.v v3,0(a4) addiw a5,a5,-1366 vsetivlizero,1,e32,m1,ta,ma vmv.v.x v0,a5 ---> Since it avl = 1, we can transform it into vmv.s.x vsetivlizero,4,e32,m1,tu,mu vslideup.vi v1,v4,1,v0.t vslideup.vi v2,v3,1,v0.t vslideup.vi v1,v2,2 vse32.v v1,0(a0) ret After this patch: li a5,45056 addia2,a1,16 vsetivlizero,4,e32,m1,tu,mu addiw a5,a5,-1366 vle32.v v3,0(a2) addia3,a1,32 addia4,a1,48 vle32.v v1,0(a1) vmv.s.x v0,a5 vle32.v v2,0(a3) vslideup.vi v1,v3,1,v0.t vle32.v v3,0(a4) vslideup.vi v2,v3,1,v0.t vslideup.vi v1,v2,2 vse32.v v1,0(a0) ret Tested on both RV32 and RV64 no regression. gcc/ChangeLog: * config/riscv/riscv-protos.h (splat_to_scalar_move_p): New function. * config/riscv/riscv-v.cc (splat_to_scalar_move_p): Ditto. * config/riscv/vector.md: Simplify vmv.v.x. into vmv.s.x. 
gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/attribute-2.c: New test. * gcc.target/riscv/rvv/vsetvl/attribute-3.c: New test. --- gcc/config/riscv/riscv-protos.h | 1 + gcc/config/riscv/riscv-v.cc | 12 ++ gcc/config/riscv/vector.md| 9 - .../gcc.target/riscv/rvv/vsetvl/attribute-2.c | 37 +++ .../gcc.target/riscv/rvv/vsetvl/attribute-3.c | 36 ++ 5 files changed, 94 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/attribute-2.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/attribute-3.c diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 7fe26fcd939..b3f0bdb9924 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -708,6 +708,7 @@ bool can_be_broadcasted_p (rtx); bool gather_scatter_valid_offset_p (machine_mode); HOST_WIDE_INT estimated_poly_value (poly_int64, unsigned int); bool whole_reg_to_reg_move_p (rtx *, machine_mode, int); +bool splat_to_scalar_move_p (rtx *); } /* We classify builtin types into two classes: diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index 93a1238a5ab..4bacb7fea45 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -5151,4 +5151,16 @@ whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, int avl_type_index) return false; } +/* Return true if we can transform vmv.v.x/vfmv.v.f to vmv.s.x/vfmv.s.f. 
*/ +bool +splat_to_scalar_move_p (rtx *ops) +{ + return satisfies_constraint_Wc1 (ops[1]) +&& satisfies_constraint_vu (ops[2]) +&& !MEM_P (ops[3]) +&& satisfies_constraint_c01 (ops[4]) +&& INTVAL (ops[7]) == NONVLMAX +&& known_ge (GET_MODE_SIZE (Pmode), GET_MODE_SIZE (GET_MODE (ops[3]))); +} + } // namespace riscv_vector diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md index 307d9a8c952..ab6e099852d 100644 --- a/gcc/config/riscv/vector.md +++ b/gcc/config/riscv/vector.md @@ -1977,8 +1977,15 @@ (match_operand:V_VLS 2 "vector_merge_operand")))] "TARGET_VECTOR" { + /* Transform vmv.v.x/vfmv.v.f (avl = 1) into vmv.s.x since vmv.s.x/vfmv.s.f + has better chances to do vsetvl fusion in vsetvl pass. */ + if (riscv_vector::splat_to_scalar_move_p (operands)) +{ + operands[1] = riscv_vector::gen_scalar_move_mask (mode); + operands[3]
[PATCH] RISC-V: Fix vfirst/vmsbf/vmsif/vmsof ratio attributes
vfirst/vmsbf/vmsif/vmsof instructions are supposed to demand ratio instead of demanding sew_lmul. But my previous typo makes VSETVL PASS miss honor the risc-v v spec. Consider this following simple case: int foo4 (void * in, void * out) { vint32m1_t v = __riscv_vle32_v_i32m1 (in, 4); v = __riscv_vadd_vv_i32m1 (v, v, 4); vbool32_t mask = __riscv_vreinterpret_v_i32m1_b32(v); mask = __riscv_vmsof_m_b32(mask, 4); return __riscv_vfirst_m_b32(mask, 4); } Before this patch: foo4: vsetivlizero,4,e32,m1,ta,ma vle32.v v1,0(a0) vadd.vv v1,v1,v1 vsetvli zero,zero,e8,mf4,ta,ma> redundant. vmsof.m v2,v1 vfirst.ma0,v2 ret After this patch: foo4: vsetivlizero,4,e32,m1,ta,ma vle32.v v1,0(a0) vadd.vv v1,v1,v1 vmsof.m v2,v1 vfirst.ma0,v2 ret Confirm RVV spec and Clang, this patch makes VSETVL PASS match the correct behavior. Tested on both RV32/RV64, no regression. gcc/ChangeLog: * config/riscv/vector.md: Fix vfirst/vmsbf/vmsof ratio attributes. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/attribute-1.c: New test. 
--- gcc/config/riscv/vector.md| 2 +- .../gcc.target/riscv/rvv/vsetvl/attribute-1.c | 47 +++ 2 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/attribute-1.c diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md index cfc54ae5eac..307d9a8c952 100644 --- a/gcc/config/riscv/vector.md +++ b/gcc/config/riscv/vector.md @@ -433,7 +433,7 @@ vialu,vshift,vicmp,vimul,vidiv,vsalu,\ vext,viwalu,viwmul,vicalu,vnshift,\ vimuladd,vimerge,vaalu,vsmul,vsshift,\ - vnclip,viminmax,viwmuladd,vmffs,vmsfs,\ + vnclip,viminmax,viwmuladd,\ vmiota,vmidx,vfalu,vfmul,vfminmax,vfdiv,\ vfwalu,vfwmul,vfsqrt,vfrecp,vfsgnj,vfcmp,\ vfmerge,vfcvtitof,vfcvtftoi,vfwcvtitof,\ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/attribute-1.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/attribute-1.c new file mode 100644 index 000..28dcf986bac --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/attribute-1.c @@ -0,0 +1,47 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */ + +#include "riscv_vector.h" + +int +foo (void *in, void *out) +{ + vint32m1_t v = __riscv_vle32_v_i32m1 (in, 4); + v = __riscv_vadd_vv_i32m1 (v, v, 4); + vbool32_t mask = __riscv_vreinterpret_v_i32m1_b32 (v); + return __riscv_vfirst_m_b32 (mask, 4); +} + +int +foo2 (void *in, void *out) +{ + vint32m1_t v = __riscv_vle32_v_i32m1 (in, 4); + v = __riscv_vadd_vv_i32m1 (v, v, 4); + vbool32_t mask = __riscv_vreinterpret_v_i32m1_b32 (v); + mask = __riscv_vmsbf_m_b32 (mask, 4); + return __riscv_vfirst_m_b32 (mask, 4); +} + +int +foo3 (void *in, void *out) +{ + vint32m1_t v = __riscv_vle32_v_i32m1 (in, 4); + v = __riscv_vadd_vv_i32m1 (v, v, 4); + vbool32_t mask = __riscv_vreinterpret_v_i32m1_b32 (v); + mask = __riscv_vmsif_m_b32 (mask, 4); + return __riscv_vfirst_m_b32 (mask, 4); +} + +int +foo4 (void *in, void *out) +{ + vint32m1_t v = __riscv_vle32_v_i32m1 (in, 4); + v = __riscv_vadd_vv_i32m1 (v, v, 4); + vbool32_t mask = 
__riscv_vreinterpret_v_i32m1_b32 (v); + mask = __riscv_vmsof_m_b32 (mask, 4); + return __riscv_vfirst_m_b32 (mask, 4); +} + +/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*4,\s*e32,\s*m1,\s*t[au],\s*m[au]} 4 } } */ +/* { dg-final { scan-assembler-times {vsetivli} 4 } } */ +/* { dg-final { scan-assembler-not {vsetvli} } } */ -- 2.36.3
[Committed] RISC-V: Suppress warning
../../gcc/config/riscv/riscv.cc: In function 'void riscv_init_cumulative_args(CUMULATIVE_ARGS*, tree, rtx, tree, int)': ../../gcc/config/riscv/riscv.cc:4879:34: error: unused parameter 'fndecl' [-Werror=unused-parameter] 4879 | tree fndecl, | ~^~ ../../gcc/config/riscv/riscv.cc: In function 'bool riscv_vector_mode_supported_any_target_p(machine_mode)': ../../gcc/config/riscv/riscv.cc:10537:56: error: unused parameter 'mode' [-Werror=unused-parameter] 10537 | riscv_vector_mode_supported_any_target_p (machine_mode mode) | ~^~~~ cc1plus: all warnings being treated as errors make[3]: *** [Makefile:2559: riscv.o] Error 1 Suppress these warnings. gcc/ChangeLog: * config/riscv/riscv.cc (riscv_init_cumulative_args): Suppress warning. (riscv_vector_mode_supported_any_target_p): Ditto. --- gcc/config/riscv/riscv.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index dd6e68a08c2..1f9546f4d3e 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -4876,7 +4876,7 @@ void riscv_init_cumulative_args (CUMULATIVE_ARGS *cum, tree fntype ATTRIBUTE_UNUSED, rtx libname ATTRIBUTE_UNUSED, - tree fndecl, + tree fndecl ATTRIBUTE_UNUSED, int caller ATTRIBUTE_UNUSED) { memset (cum, 0, sizeof (*cum)); @@ -10534,7 +10534,7 @@ extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset) /* Implements target hook vector_mode_supported_any_target_p. */ static bool -riscv_vector_mode_supported_any_target_p (machine_mode mode) +riscv_vector_mode_supported_any_target_p (machine_mode) { if (TARGET_XTHEADVECTOR) return false; -- 2.36.3
[PATCH V2] RISC-V: Fix RVV_VLMAX
This patch fixes memory hog found in SPEC2017 wrf benchmark which caused by RVV_VLMAX since RVV_VLMAX generate brand new rtx by gen_rtx_REG (Pmode, X0_REGNUM) every time we call RVV_VLMAX, that is, we are always generating garbage and redundant (reg:DI 0 zero) rtx. After this patch fix, the memory hog is gone. Time variable usr sys wall GGC machine dep reorg : 1.99 ( 9%) 0.35 ( 56%) 2.33 ( 10%) 939M ( 80%) [Before this patch] machine dep reorg : 1.71 ( 6%) 0.16 ( 27%) 3.77 ( 6%) 659k ( 0%) [After this patch] Time variable usr sys wall GGC machine dep reorg : 75.93 ( 18%) 14.23 ( 88%) 90.15 ( 21%) 33383M ( 95%) [Before this patch] machine dep reorg : 56.00 ( 14%) 7.92 ( 77%) 63.93 ( 15%) 4361k ( 0%) [After this patch] Test is running. Ok for trunk if I passed the test with no regression ? PR target/113495 gcc/ChangeLog: * config/riscv/riscv-protos.h (RVV_VLMAX): Change to regno_reg_rtx[X0_REGNUM]. (RVV_VUNDEF): Ditto. * config/riscv/riscv-vsetvl.cc: Add timevar. --- gcc/config/riscv/riscv-protos.h | 5 ++--- gcc/config/riscv/riscv-vsetvl.cc | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 7853b488838..7fe26fcd939 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -299,10 +299,9 @@ void riscv_run_selftests (void); #endif namespace riscv_vector { -#define RVV_VLMAX gen_rtx_REG (Pmode, X0_REGNUM) +#define RVV_VLMAX regno_reg_rtx[X0_REGNUM] #define RVV_VUNDEF(MODE) \ - gen_rtx_UNSPEC (MODE, gen_rtvec (1, gen_rtx_REG (SImode, X0_REGNUM)), \ - UNSPEC_VUNDEF) + gen_rtx_UNSPEC (MODE, gen_rtvec (1, RVV_VLMAX), UNSPEC_VUNDEF) /* These flags describe how to pass the operands to a rvv insn pattern. 
e.g.: diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 2067073185f..54c85ffb7d5 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -3556,7 +3556,7 @@ const pass_data pass_data_vsetvl = { RTL_PASS, /* type */ "vsetvl", /* name */ OPTGROUP_NONE, /* optinfo_flags */ - TV_NONE, /* tv_id */ + TV_MACH_DEP, /* tv_id */ 0,/* properties_required */ 0,/* properties_provided */ 0,/* properties_destroyed */ -- 2.36.3
[PATCH] RISC-V: Fix RVV_VLMAX
This patch fixes memory hog found in SPEC2017 wrf benchmark which caused by RVV_VLMAX since RVV_VLMAX generate brand new rtx by gen_rtx_REG (Pmode, X0_REGNUM) every time we call RVV_VLMAX, that is, we are always generating garbage and redundant (reg:DI 0 zero) rtx. After this patch fix, the memory hog is gone. Time variable usr sys wall GGC machine dep reorg : 1.99 ( 9%) 0.35 ( 56%) 2.33 ( 10%) 939M ( 80%) [Before this patch] machine dep reorg : 1.71 ( 6%) 0.16 ( 27%) 3.77 ( 6%) 659k ( 0%) [After this patch] Time variable usr sys wall GGC machine dep reorg : 75.93 ( 18%) 14.23 ( 88%) 90.15 ( 21%) 33383M ( 95%) [Before this patch] machine dep reorg : 56.00 ( 14%) 7.92 ( 77%) 63.93 ( 15%) 4361k ( 0%) [After this patch] Test is running. Ok for trunk if I passed the test with no regression ? gcc/ChangeLog: * config/riscv/riscv-protos.h (RVV_VLMAX): Change to regno_reg_rtx[X0_REGNUM]. (RVV_VUNDEF): Ditto. * config/riscv/riscv-vsetvl.cc: Add timevar. --- gcc/config/riscv/riscv-protos.h | 5 ++--- gcc/config/riscv/riscv-vsetvl.cc | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 7853b488838..7fe26fcd939 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -299,10 +299,9 @@ void riscv_run_selftests (void); #endif namespace riscv_vector { -#define RVV_VLMAX gen_rtx_REG (Pmode, X0_REGNUM) +#define RVV_VLMAX regno_reg_rtx[X0_REGNUM] #define RVV_VUNDEF(MODE) \ - gen_rtx_UNSPEC (MODE, gen_rtvec (1, gen_rtx_REG (SImode, X0_REGNUM)), \ - UNSPEC_VUNDEF) + gen_rtx_UNSPEC (MODE, gen_rtvec (1, RVV_VLMAX), UNSPEC_VUNDEF) /* These flags describe how to pass the operands to a rvv insn pattern. 
e.g.: diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 2067073185f..54c85ffb7d5 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -3556,7 +3556,7 @@ const pass_data pass_data_vsetvl = { RTL_PASS, /* type */ "vsetvl", /* name */ OPTGROUP_NONE, /* optinfo_flags */ - TV_NONE, /* tv_id */ + TV_MACH_DEP, /* tv_id */ 0,/* properties_required */ 0,/* properties_provided */ 0,/* properties_destroyed */ -- 2.36.3
[PATCH] RISC-V: Support vi variant for vec_cmp
While running various benchmarks, I notice we miss vi variant support for integer comparison. That is, we can vectorize code into vadd.vi but we can't vectorize into vmseq.vi. Consider this following case: void foo (int n, int **__restrict a) { int b; int c; int d; for (b = 0; b < n; b++) for (long e = 8; e > 0; e--) a[b][e] = a[b][e] == 15; } Before this patch: vsetivlizero,4,e32,m1,ta,ma vmv.v.i v4,15 vmv.v.i v3,1 vmv.v.i v2,0 .L3: ld a5,0(a1) addia4,a5,4 addia5,a5,20 vle32.v v1,0(a5) vle32.v v0,0(a4) vmseq.vvv0,v0,v4 After this patch: ld a5,0(a1) addia4,a5,4 addia5,a5,20 vle32.v v1,0(a5) vle32.v v0,0(a4) vmseq.viv0,v0,15 It's the missing feature caused by our some mistakes, support vi variant for vec_cmp like other patterns (add, sub, ..., etc). Tested with no regression, ok for trunk ? gcc/ChangeLog: * config/riscv/autovec.md: Support vi variant. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/cmp/cmp_vi-1.c: New test. * gcc.target/riscv/rvv/autovec/cmp/cmp_vi-2.c: New test. * gcc.target/riscv/rvv/autovec/cmp/cmp_vi-3.c: New test. * gcc.target/riscv/rvv/autovec/cmp/cmp_vi-4.c: New test. * gcc.target/riscv/rvv/autovec/cmp/cmp_vi-5.c: New test. * gcc.target/riscv/rvv/autovec/cmp/cmp_vi-6.c: New test. * gcc.target/riscv/rvv/autovec/cmp/cmp_vi-7.c: New test. * gcc.target/riscv/rvv/autovec/cmp/cmp_vi-8.c: New test. * gcc.target/riscv/rvv/autovec/cmp/cmp_vi-9.c: New test. * gcc.target/riscv/rvv/autovec/cmp/macro.h: New test. 
--- gcc/config/riscv/autovec.md | 4 +-- .../riscv/rvv/autovec/cmp/cmp_vi-1.c | 16 +++ .../riscv/rvv/autovec/cmp/cmp_vi-2.c | 16 +++ .../riscv/rvv/autovec/cmp/cmp_vi-3.c | 28 +++ .../riscv/rvv/autovec/cmp/cmp_vi-4.c | 28 +++ .../riscv/rvv/autovec/cmp/cmp_vi-5.c | 16 +++ .../riscv/rvv/autovec/cmp/cmp_vi-6.c | 16 +++ .../riscv/rvv/autovec/cmp/cmp_vi-7.c | 28 +++ .../riscv/rvv/autovec/cmp/cmp_vi-8.c | 28 +++ .../riscv/rvv/autovec/cmp/cmp_vi-9.c | 18 .../gcc.target/riscv/rvv/autovec/cmp/macro.h | 11 11 files changed, 207 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-1.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-2.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-3.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-4.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-5.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-6.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-7.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-8.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-9.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/macro.h diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md index 706cd9717cb..5ec1c59bdd4 100644 --- a/gcc/config/riscv/autovec.md +++ b/gcc/config/riscv/autovec.md @@ -664,7 +664,7 @@ [(set (match_operand: 0 "register_operand") (match_operator: 1 "comparison_operator" [(match_operand:V_VLSI 2 "register_operand") - (match_operand:V_VLSI 3 "register_operand")]))] + (match_operand:V_VLSI 3 "nonmemory_operand")]))] "TARGET_VECTOR" { riscv_vector::expand_vec_cmp (operands[0], GET_CODE (operands[1]), @@ -677,7 +677,7 @@ [(set (match_operand: 0 "register_operand") (match_operator: 1 "comparison_operator" [(match_operand:V_VLSI 2 "register_operand") - 
(match_operand:V_VLSI 3 "register_operand")]))] + (match_operand:V_VLSI 3 "nonmemory_operand")]))] "TARGET_VECTOR" { riscv_vector::expand_vec_cmp (operands[0], GET_CODE (operands[1]), diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-1.c new file mode 100644 index 000..10c232f77bd --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/cmp/cmp_vi-1.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */ + +#include "macro.h" + +CMP_VI (ne_char, char, n, !=, 15) +CMP_VI (ne_short, short, n, !=, 15) +CMP_VI (ne_int, int, n, !=, 15) +CMP_VI (ne_long, long, n, !=, 15) +CMP_VI (ne_unsigned_char, unsigned char, n, !=, 15) +CMP_VI (ne_unsigned_short, unsigned short, n, !=, 15) +CMP_VI (ne_unsigned_int, unsigned int, n, !=, 15) +CMP_VI (ne_unsigned_long, unsigned long, n, !=, 15)
[PATCH v2] test regression fix: Add !vect128 for variable length targets of bb-slp-subgroups-3.c
gcc/testsuite/ChangeLog: * gcc.dg/vect/bb-slp-subgroups-3.c: Add !vect128. --- gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c index fb719915db7..d1d79125731 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c @@ -42,7 +42,7 @@ main (int argc, char **argv) /* Because we disable the cost model, targets with variable-length vectors can end up vectorizing the store to a[0..7] on its own. With the cost model we do something sensible. */ -/* { dg-final { scan-tree-dump-times "optimized: basic block" 2 "slp2" { target { ! amdgcn-*-* } xfail vect_variable_length } } } */ +/* { dg-final { scan-tree-dump-times "optimized: basic block" 2 "slp2" { target { ! amdgcn-*-* } xfail { vect_variable_length && { ! vect128 } } } } } */ /* amdgcn can do this in one vector. */ /* { dg-final { scan-tree-dump-times "optimized: basic block" 1 "slp2" { target amdgcn-*-* } } } */ -- 2.36.3
[Committed V3] RISC-V: Add has compatible check for conflict vsetvl fusion
V3: Rebase to trunk and commit it. This patch fixes SPEC2017 cam4 mismatch issue due to we miss has compatible check for conflict vsetvl fusion. Buggy assembler before this patch: .L69: vsetvli a5,s1,e8,mf4,ta,ma -> buggy vsetvl vsetivlizero,8,e8,mf2,ta,ma vmv.v.i v1,0 vse8.v v1,0(a5) j .L37 .L68: vsetvli a5,s1,e8,mf4,ta,ma -> buggy vsetvl vsetivlizero,8,e8,mf2,ta,ma addia3,a5,8 vmv.v.i v1,0 vse8.v v1,0(a5) vse8.v v1,0(a3) addia4,a4,-16 li a3,8 bltua4,a3,.L37 j .L69 .L67: vsetivlizero,8,e8,mf2,ta,ma vmv.v.i v1,0 vse8.v v1,0(a5) addia5,sp,56 vse8.v v1,0(a5) addis4,sp,64 addia3,sp,72 vse8.v v1,0(s4) vse8.v v1,0(a3) addia4,a4,-32 li a3,16 bltua4,a3,.L36 j .L68 After this patch: .L63: ble s1,zero,.L49 sllia4,s1,3 li a3,32 addia5,sp,48 bltua4,a3,.L62 vsetivlizero,8,e8,mf2,ta,ma vmv.v.i v1,0 vse8.v v1,0(a5) addia5,sp,56 vse8.v v1,0(a5) addis4,sp,64 addia3,sp,72 vse8.v v1,0(s4) addia4,a4,-32 addia5,sp,80 vse8.v v1,0(a3) .L35: li a3,16 bltua4,a3,.L36 addia3,a5,8 vmv.v.i v1,0 addia4,a4,-16 vse8.v v1,0(a5) addia5,a5,16 vse8.v v1,0(a3) .L36: li a3,8 bltua4,a3,.L37 vmv.v.i v1,0 vse8.v v1,0(a5) Tested on both RV32/RV64 no regression, Ok for trunk ? PR target/113429 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pre_vsetvl::earliest_fuse_vsetvl_info): Fix bug. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/vlmax_conflict-4.c: Adapt test. * gcc.target/riscv/rvv/vsetvl/vlmax_conflict-5.c: Ditto. 
--- gcc/config/riscv/riscv-vsetvl.cc | 43 +++ .../riscv/rvv/vsetvl/vlmax_conflict-4.c | 5 +-- .../riscv/rvv/vsetvl/vlmax_conflict-5.c | 10 ++--- 3 files changed, 30 insertions(+), 28 deletions(-) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 41d4b80648f..2067073185f 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -2254,6 +2254,22 @@ private: return true; } + bool has_compatible_reaching_vsetvl_p (vsetvl_info info) + { +unsigned int index; +sbitmap_iterator sbi; +EXECUTE_IF_SET_IN_BITMAP (m_vsetvl_def_in[info.get_bb ()->index ()], 0, + index, sbi) + { + const auto prev_info = *m_vsetvl_def_exprs[index]; + if (!prev_info.valid_p ()) + continue; + if (m_dem.compatible_p (prev_info, info)) + return true; + } +return false; + } + bool preds_all_same_avl_and_ratio_p (const vsetvl_info &curr_info) { gcc_assert ( @@ -3076,22 +3092,8 @@ pre_vsetvl::earliest_fuse_vsetvl_info (int iter) { vsetvl_info new_curr_info = curr_info; new_curr_info.set_bb (crtl->ssa->bb (eg->dest)); - bool has_compatible_p = false; - unsigned int def_expr_index; - sbitmap_iterator sbi2; - EXECUTE_IF_SET_IN_BITMAP ( - m_vsetvl_def_in[new_curr_info.get_bb ()->index ()], 0, - def_expr_index, sbi2) - { - vsetvl_info &prev_info = *m_vsetvl_def_exprs[def_expr_index]; - if (!prev_info.valid_p ()) - continue; - if (m_dem.compatible_p (prev_info, new_curr_info)) - { - has_compatible_p = true; - break; - } - } + bool has_compatible_p + = has_compatible_reaching_vsetvl_p (new_curr_info); if (!has_compatible_p) { if (dump_file && (dump_flags & TDF_DETAILS)) @@ -3146,7 +3148,10 @@ pre_vsetvl::earliest_fuse_vsetvl_info (int iter) else { /* Cancel lift up if probabilities are equal. 
*/ - if (successors_probability_equal_p (eg->src)) + if (successors_probability_equal_p (eg->src) + || (dest_block_info.probability + > src_block_info.probability + && !has_compatible_reaching_vsetvl_p (curr_info))) { if (dump_file && (dump_flags & TDF_DETAILS)) { @@ -3154,8 +3159,8 @@ pre_vsetvl::earliest_fuse_vsetvl_info (int iter) " Reset bb %u:",
[PATCH V2] RISC-V: Add has compatible check for conflict vsetvl fusion
This patch fixes SPEC2017 cam4 mismatch issue due to we miss has compatible check for conflict vsetvl fusion. Buggy assembler before this patch: .L69: vsetvli a5,s1,e8,mf4,ta,ma -> buggy vsetvl vsetivlizero,8,e8,mf2,ta,ma vmv.v.i v1,0 vse8.v v1,0(a5) j .L37 .L68: vsetvli a5,s1,e8,mf4,ta,ma -> buggy vsetvl vsetivlizero,8,e8,mf2,ta,ma addia3,a5,8 vmv.v.i v1,0 vse8.v v1,0(a5) vse8.v v1,0(a3) addia4,a4,-16 li a3,8 bltua4,a3,.L37 j .L69 .L67: vsetivlizero,8,e8,mf2,ta,ma vmv.v.i v1,0 vse8.v v1,0(a5) addia5,sp,56 vse8.v v1,0(a5) addis4,sp,64 addia3,sp,72 vse8.v v1,0(s4) vse8.v v1,0(a3) addia4,a4,-32 li a3,16 bltua4,a3,.L36 j .L68 After this patch: .L63: ble s1,zero,.L49 sllia4,s1,3 li a3,32 addia5,sp,48 bltua4,a3,.L62 vsetivlizero,8,e8,mf2,ta,ma vmv.v.i v1,0 vse8.v v1,0(a5) addia5,sp,56 vse8.v v1,0(a5) addis4,sp,64 addia3,sp,72 vse8.v v1,0(s4) addia4,a4,-32 addia5,sp,80 vse8.v v1,0(a3) .L35: li a3,16 bltua4,a3,.L36 addia3,a5,8 vmv.v.i v1,0 addia4,a4,-16 vse8.v v1,0(a5) addia5,a5,16 vse8.v v1,0(a3) .L36: li a3,8 bltua4,a3,.L37 vmv.v.i v1,0 vse8.v v1,0(a5) Tested on both RV32/RV64 no regression, Ok for trunk ? PR target/113429 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pre_vsetvl::earliest_fuse_vsetvl_info): Fix conflict vsetvl fusion. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/vlmax_conflict-4.c: Adapt test. * gcc.target/riscv/rvv/vsetvl/vlmax_conflict-5.c: Ditto. 
--- gcc/config/riscv/riscv-vsetvl.cc | 39 +++ .../riscv/rvv/vsetvl/vlmax_conflict-4.c | 5 +-- .../riscv/rvv/vsetvl/vlmax_conflict-5.c | 10 ++--- 3 files changed, 28 insertions(+), 26 deletions(-) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index df7ed149388..76e3d2eb471 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -2254,6 +2254,22 @@ private: return true; } + bool has_compatible_reaching_vsetvl_p (vsetvl_info info) + { +unsigned int index; +sbitmap_iterator sbi; +EXECUTE_IF_SET_IN_BITMAP (m_vsetvl_def_in[info.get_bb ()->index ()], 0, + index, sbi) + { + const auto prev_info = *m_vsetvl_def_exprs[index]; + if (!prev_info.valid_p ()) + continue; + if (m_dem.compatible_p (prev_info, info)) + return true; + } +return false; + } + bool preds_all_same_avl_and_ratio_p (const vsetvl_info &curr_info) { gcc_assert ( @@ -3075,22 +3091,8 @@ pre_vsetvl::earliest_fuse_vsetvl_info () { vsetvl_info new_curr_info = curr_info; new_curr_info.set_bb (crtl->ssa->bb (eg->dest)); - bool has_compatible_p = false; - unsigned int def_expr_index; - sbitmap_iterator sbi2; - EXECUTE_IF_SET_IN_BITMAP ( - m_vsetvl_def_in[new_curr_info.get_bb ()->index ()], 0, - def_expr_index, sbi2) - { - vsetvl_info &prev_info = *m_vsetvl_def_exprs[def_expr_index]; - if (!prev_info.valid_p ()) - continue; - if (m_dem.compatible_p (prev_info, new_curr_info)) - { - has_compatible_p = true; - break; - } - } + bool has_compatible_p + = has_compatible_reaching_vsetvl_p (new_curr_info); if (!has_compatible_p) { if (dump_file && (dump_flags & TDF_DETAILS)) @@ -3146,7 +3148,10 @@ pre_vsetvl::earliest_fuse_vsetvl_info () && !m_dem.compatible_p (prev_info, curr_info)) { /* Cancel lift up if probabilities are equal. 
*/ - if (successors_probability_equal_p (eg->src)) + if (successors_probability_equal_p (eg->src) + || (dest_block_info.probability + > src_block_info.probability + && !has_compatible_reaching_vsetvl_p (curr_info))) { if (dump_file && (dump_flags & TDF_DETAILS)) { diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/vlmax_conflict-4.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/vlm
[PATCH] RISC-V: Add has compatible check for conflict vsetvl fusion
This patch fixes SPEC2017 cam4 mismatch issue due to we miss has compatible check for conflict vsetvl fusion. Buggy assembler before this patch: .L69: vsetvli a5,s1,e8,mf4,ta,ma -> buggy vsetvl vsetivlizero,8,e8,mf2,ta,ma vmv.v.i v1,0 vse8.v v1,0(a5) j .L37 .L68: vsetvli a5,s1,e8,mf4,ta,ma -> buggy vsetvl vsetivlizero,8,e8,mf2,ta,ma addia3,a5,8 vmv.v.i v1,0 vse8.v v1,0(a5) vse8.v v1,0(a3) addia4,a4,-16 li a3,8 bltua4,a3,.L37 j .L69 .L67: vsetivlizero,8,e8,mf2,ta,ma vmv.v.i v1,0 vse8.v v1,0(a5) addia5,sp,56 vse8.v v1,0(a5) addis4,sp,64 addia3,sp,72 vse8.v v1,0(s4) vse8.v v1,0(a3) addia4,a4,-32 li a3,16 bltua4,a3,.L36 j .L68 After this patch: .L63: ble s1,zero,.L49 sllia4,s1,3 li a3,32 addia5,sp,48 bltua4,a3,.L62 vsetivlizero,8,e8,mf2,ta,ma vmv.v.i v1,0 vse8.v v1,0(a5) addia5,sp,56 vse8.v v1,0(a5) addis4,sp,64 addia3,sp,72 vse8.v v1,0(s4) addia4,a4,-32 addia5,sp,80 vse8.v v1,0(a3) .L35: li a3,16 bltua4,a3,.L36 addia3,a5,8 vmv.v.i v1,0 addia4,a4,-16 vse8.v v1,0(a5) addia5,a5,16 vse8.v v1,0(a3) .L36: li a3,8 bltua4,a3,.L37 vmv.v.i v1,0 vse8.v v1,0(a5) Tested on both RV32/RV64 no regression, Ok for trunk ? PR target/113429 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc: Fix bug of conflict vsetvl fusion. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/spec2017_cam4/ppgrid.mod: New test. * gcc.target/riscv/rvv/spec2017_cam4/shr_kind_mod.mod: New test. * gcc.target/riscv/rvv/spec2017_cam4/pr113429.f90: New test. * gcc.target/riscv/rvv/vsetvl/vlmax_conflict-4.c: Adapt test. * gcc.target/riscv/rvv/vsetvl/vlmax_conflict-5.c: Ditto. 
--- gcc/config/riscv/riscv-vsetvl.cc | 39 --- .../rvv/fortran/spec2017_cam4/ppgrid.mod | Bin 0 -> 296 bytes .../rvv/fortran/spec2017_cam4/pr113429.f90| 110 ++ .../fortran/spec2017_cam4/shr_kind_mod.mod| Bin 0 -> 499 bytes .../gcc.target/riscv/rvv/rvv-fortran.exp | 2 + .../riscv/rvv/vsetvl/vlmax_conflict-4.c | 5 +- .../riscv/rvv/vsetvl/vlmax_conflict-5.c | 10 +- 7 files changed, 140 insertions(+), 26 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/fortran/spec2017_cam4/ppgrid.mod create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/fortran/spec2017_cam4/pr113429.f90 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/fortran/spec2017_cam4/shr_kind_mod.mod diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index df7ed149388..76e3d2eb471 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -2254,6 +2254,22 @@ private: return true; } + bool has_compatible_reaching_vsetvl_p (vsetvl_info info) + { +unsigned int index; +sbitmap_iterator sbi; +EXECUTE_IF_SET_IN_BITMAP (m_vsetvl_def_in[info.get_bb ()->index ()], 0, + index, sbi) + { + const auto prev_info = *m_vsetvl_def_exprs[index]; + if (!prev_info.valid_p ()) + continue; + if (m_dem.compatible_p (prev_info, info)) + return true; + } +return false; + } + bool preds_all_same_avl_and_ratio_p (const vsetvl_info &curr_info) { gcc_assert ( @@ -3075,22 +3091,8 @@ pre_vsetvl::earliest_fuse_vsetvl_info () { vsetvl_info new_curr_info = curr_info; new_curr_info.set_bb (crtl->ssa->bb (eg->dest)); - bool has_compatible_p = false; - unsigned int def_expr_index; - sbitmap_iterator sbi2; - EXECUTE_IF_SET_IN_BITMAP ( - m_vsetvl_def_in[new_curr_info.get_bb ()->index ()], 0, - def_expr_index, sbi2) - { - vsetvl_info &prev_info = *m_vsetvl_def_exprs[def_expr_index]; - if (!prev_info.valid_p ()) - continue; - if (m_dem.compatible_p (prev_info, new_curr_info)) - { - has_compatible_p = true; - break; - } - } + bool has_compatible_p + = 
has_compatible_reaching_vsetvl_p (new_curr_info); if (!has_compatible_p) { if (dump_file && (dump_flags & TDF_DETAILS)) @@ -3146,7 +3148,10 @@ pre_vsetvl::earliest_fuse_vsetvl_info ()
[PATCH v2] test regression fix: Add vect128 for bb-slp-43.c
gcc/testsuite/ChangeLog: * gcc.dg/vect/bb-slp-43.c: Add vect128. --- gcc/testsuite/gcc.dg/vect/bb-slp-43.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-43.c b/gcc/testsuite/gcc.dg/vect/bb-slp-43.c index dad2d24262d..8aedb06bf72 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-43.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-43.c @@ -14,4 +14,4 @@ f (int *restrict x, short *restrict y) } /* { dg-final { scan-tree-dump-not "mixed mask and nonmask" "slp2" } } */ -/* { dg-final { scan-tree-dump-not "vector operands from scalars" "slp2" { target { { vect_int && vect_bool_cmp } && { vect_unpack && vect_hw_misalign } } xfail { vect_variable_length && { ! vect256 } } } } } */ +/* { dg-final { scan-tree-dump-not "vector operands from scalars" "slp2" { target { { vect_int && vect_bool_cmp } && { vect_unpack && vect_hw_misalign } } xfail { vect_variable_length && { { ! vect128 } && { ! vect256 } } } } } } */ -- 2.36.3
[PATCH] test regression fix: Remove xfail for variable length targets of bb-slp-subgroups-3.c
Notice there is a regression recently: XPASS: gcc.dg/vect/bb-slp-subgroups-3.c -flto -ffat-lto-objects scan-tree-dump-times slp2 "optimized: basic block" 2 XPASS: gcc.dg/vect/bb-slp-subgroups-3.c scan-tree-dump-times slp2 "optimized: basic block" 2 Checked on both ARM SVE an RVV: https://godbolt.org/z/jz4cYbqc8 "optimized: basic block" appears twice. I guess ARM SVE has the same XPASS as RVV. Hi, Andrew. Could you confirm about it ? gcc/testsuite/ChangeLog: * gcc.dg/vect/bb-slp-subgroups-3.c: Remove XFAIL of variable length. --- gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c index fb719915db7..3f0d45ce4a1 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c @@ -42,7 +42,7 @@ main (int argc, char **argv) /* Because we disable the cost model, targets with variable-length vectors can end up vectorizing the store to a[0..7] on its own. With the cost model we do something sensible. */ -/* { dg-final { scan-tree-dump-times "optimized: basic block" 2 "slp2" { target { ! amdgcn-*-* } xfail vect_variable_length } } } */ +/* { dg-final { scan-tree-dump-times "optimized: basic block" 2 "slp2" { target { ! amdgcn-*-* } } } } */ /* amdgcn can do this in one vector. */ /* { dg-final { scan-tree-dump-times "optimized: basic block" 1 "slp2" { target amdgcn-*-* } } } */ -- 2.36.3
[PATCH] test regression fix: Remove xfail for variable length targets
Recently noticed there is an XPASS in RISC-V: XPASS: gcc.dg/vect/bb-slp-43.c -flto -ffat-lto-objects scan-tree-dump-not slp2 "vector operands from scalars" XPASS: gcc.dg/vect/bb-slp-43.c scan-tree-dump-not slp2 "vector operands from scalars" And checked both ARM SVE and RVV: https://godbolt.org/z/T9cPa7fh3 both have the same slp2 dump. So I guess ARM SVE has the same XPASS in this test. gcc/testsuite/ChangeLog: * gcc.dg/vect/bb-slp-43.c: Remove xfail for variable length. --- gcc/testsuite/gcc.dg/vect/bb-slp-43.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-43.c b/gcc/testsuite/gcc.dg/vect/bb-slp-43.c index dad2d24262d..40bd2e0dfbf 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-43.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-43.c @@ -14,4 +14,4 @@ f (int *restrict x, short *restrict y) } /* { dg-final { scan-tree-dump-not "mixed mask and nonmask" "slp2" } } */ -/* { dg-final { scan-tree-dump-not "vector operands from scalars" "slp2" { target { { vect_int && vect_bool_cmp } && { vect_unpack && vect_hw_misalign } } xfail { vect_variable_length && { ! vect256 } } } } } */ +/* { dg-final { scan-tree-dump-not "vector operands from scalars" "slp2" { target { { vect_int && vect_bool_cmp } && { vect_unpack && vect_hw_misalign } } } } } */ -- 2.36.3
[PATCH] RISC-V: Report Sorry when users enable RVV in big-endian mode [PR113404]
As PR113404 mentioned: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113404 We have ICE when we enable RVV in big-endian mode: during RTL pass: expand a-float-point-dynamic-frm-66.i:2:14: internal compiler error: in to_constant, at poly-int.h:588 0xab4c2c poly_int<2u, unsigned short>::to_constant() const /repo/gcc-trunk/gcc/poly-int.h:588 0xab4de1 poly_int<2u, unsigned short>::to_constant() const /repo/gcc-trunk/gcc/tree.h:4055 0xab4de1 default_function_arg_padding(machine_mode, tree_node const*) /repo/gcc-trunk/gcc/targhooks.cc:844 0x12e2327 locate_and_pad_parm(machine_mode, tree_node*, int, int, int, tree_node*, args_size*, locate_and_pad_arg_data*) /repo/gcc-trunk/gcc/function.cc:4061 0x12e2aca assign_parm_find_entry_rtl /repo/gcc-trunk/gcc/function.cc:2614 0x12e2c89 assign_parms /repo/gcc-trunk/gcc/function.cc:3693 0x12e59df expand_function_start(tree_node*) /repo/gcc-trunk/gcc/function.cc:5152 0x112fafb execute /repo/gcc-trunk/gcc/cfgexpand.cc:6739 Report users that we don't support RVV in big-endian mode for the following reasons: 1. big-endian in RISC-V is pretty rare case. 2. We didn't test RVV in big-endian and we don't have enough time to test it since it's stage 4 now. Naive disallow RVV in big-endian. Tested no regression, ok for trunk ? PR target/113404 gcc/ChangeLog: * config/riscv/riscv.cc (riscv_override_options_internal): Report sorry for RVV in big-endian mode. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/base/big_endian-1.c: New test. * gcc.target/riscv/rvv/base/big_endian-2.c: New test. 
--- gcc/config/riscv/riscv.cc | 5 + gcc/testsuite/gcc.target/riscv/rvv/base/big_endian-1.c | 5 + gcc/testsuite/gcc.target/riscv/rvv/base/big_endian-2.c | 5 + 3 files changed, 15 insertions(+) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/big_endian-1.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/big_endian-2.c diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 89caf156f03..41626fa34e4 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -8787,6 +8787,11 @@ riscv_override_options_internal (struct gcc_options *opts) sorry ("Current RISC-V GCC cannot support VLEN greater than 4096bit for " "'V' Extension"); + /* FIXME: We don't support RVV in big-endian for now, we may enable RVV with + big-endian after finishing full coverage testing. */ + if (TARGET_VECTOR && TARGET_BIG_ENDIAN) +sorry ("Current RISC-V GCC cannot support RVV in big-endian mode"); + /* Convert -march to a chunks count. */ riscv_vector_chunks = riscv_convert_vector_bits (opts); } diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/big_endian-1.c b/gcc/testsuite/gcc.target/riscv/rvv/base/big_endian-1.c new file mode 100644 index 000..9eaf7ad33b2 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/big_endian-1.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -mbig-endian -O3" } */ + +#pragma riscv intrinsic "vector" +vfloat32m1_t foo (vfloat32m1_t) {} // { dg-excess-errors "sorry, unimplemented: Current RISC-V GCC cannot support RVV in big-endian mode" } diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/big_endian-2.c b/gcc/testsuite/gcc.target/riscv/rvv/base/big_endian-2.c new file mode 100644 index 000..86cf58370bf --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/big_endian-2.c @@ -0,0 +1,5 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gc_zve32x -mabi=lp64d -mbig-endian -O3" } */ + +#pragma riscv intrinsic "vector" +vint32m1_t foo (vint32m1_t) {} // { 
dg-excess-errors "sorry, unimplemented: Current RISC-V GCC cannot support RVV in big-endian mode" } -- 2.36.3
[Committed V2] RISC-V: Fix regression (GCC-14 compare with GCC-13.2) of SHA256 from coremark-pro
This patch fixes a -70% performance drop from GCC-13.2 to GCC-14 with -march=rv64gcv in real hardware. The root cause is that an incorrect cost model causes inefficient vectorization, which makes performance drop significantly. So this patch does: 1. Adjust vector to scalar cost by introducing v to scalar reg move. 2. Adjust vec_construct cost since we do spend NUNITS instructions to construct the vector. Tested on both RV32/RV64 no regression, Rebase to the trunk and commit it as it is approved by Robin. PR target/113247 gcc/ChangeLog: * config/riscv/riscv-protos.h (struct regmove_vector_cost): Add vector to scalar regmove. * config/riscv/riscv-vector-costs.cc (adjust_stmt_cost): Ditto. * config/riscv/riscv.cc (riscv_builtin_vectorization_cost): Adjust vec_construct cost. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/vls/reduc-19.c: Adapt test. * gcc.target/riscv/rvv/autovec/vls/reduc-20.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-21.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c: New test. 
--- gcc/config/riscv/riscv-protos.h | 2 + gcc/config/riscv/riscv-vector-costs.cc| 3 + gcc/config/riscv/riscv.cc | 4 +- .../vect/costmodel/riscv/rvv/pr113247-1.c | 195 ++ .../vect/costmodel/riscv/rvv/pr113247-2.c | 6 + .../vect/costmodel/riscv/rvv/pr113247-3.c | 6 + .../vect/costmodel/riscv/rvv/pr113247-4.c | 6 + .../riscv/rvv/autovec/vls/reduc-19.c | 2 +- .../riscv/rvv/autovec/vls/reduc-20.c | 2 +- .../riscv/rvv/autovec/vls/reduc-21.c | 2 +- 10 files changed, 224 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 4f3b677f4f9..21f6dadf113 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -255,6 +255,8 @@ struct regmove_vector_cost { const int GR2VR; const int FR2VR; + const int VR2GR; + const int VR2FR; }; /* Cost for vector insn classes. */ diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 90ab93b7506..7c9840df4e9 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -1056,6 +1056,9 @@ adjust_stmt_cost (enum vect_cost_for_stmt kind, tree vectype, int stmt_cost) case scalar_to_vec: return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->FR2VR : costs->regmove->GR2VR); +case vec_to_scalar: + return stmt_cost += (FLOAT_TYPE_P (vectype) ? 
costs->regmove->VR2FR + : costs->regmove->VR2GR); default: break; } diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index ee1a57b321d..568db90a27d 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -395,6 +395,8 @@ static const scalable_vector_cost rvv_vla_vector_cost = { static const regmove_vector_cost rvv_regmove_vector_cost = { 2, /* GR2VR */ 2, /* FR2VR */ + 2, /* VR2GR */ + 2, /* VR2FR */ }; /* Generic costs for vector insn classes. It is supposed to be the vector cost @@ -10522,7 +10524,7 @@ riscv_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, return fp ? common_costs->fp_stmt_cost : common_costs->int_stmt_cost; case vec_construct: - return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype)) - 1; + return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype)); default: gcc_unreachable (); diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c new file mode 100644 index 000..0d09a624a00 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c @@ -0,0 +1,195 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param=riscv-autovec-lmul=dynamic" } */ + +#include + +#define Ch(x,y,z) (z ^ (x & (y ^ z))) +#define Maj(x,y,z) ((x & y) | (z & (x | y))) + +#define SHR(x, n)(x >> n) +#define ROTR(x,n)(SHR(x,n) | (x << (32 - n))) +#define S1(x)(ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25)) +#define S0(x)(ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22)) + +#define s1(x)(ROTR(x,17) ^ ROTR(x,19) ^ SHR(x,10)) +#define s0(x)(ROTR
[Committed V3] RISC-V: Adjust loop len by costing 1 when NITER < VF
Rebase in v3: Rebase to the trunk and commit it as it's approved by Robin. Update in v2: Add dynmaic lmul test. This patch fixes the regression between GCC 13.2.0 and trunk GCC (GCC-14) GCC 13.2.0: lui a5,%hi(a) li a4,19 sb a4,%lo(a)(a5) li a0,0 ret Trunk GCC: vsetvli a5,zero,e8,mf2,ta,ma li a4,-32768 vid.v v1 vsetvli zero,zero,e16,m1,ta,ma addiw a4,a4,104 vmv.v.i v3,15 lui a1,%hi(a) li a0,19 vsetvli zero,zero,e8,mf2,ta,ma vadd.vi v1,v1,1 sb a0,%lo(a)(a1) vsetvli zero,zero,e16,m1,ta,ma vzext.vf2 v2,v1 vmv.v.x v1,a4 vminu.vvv2,v2,v3 vsrl.vv v1,v1,v2 vslidedown.vi v1,v1,17 vmv.x.s a0,v1 sneza0,a0 ret The root cause we are vectorizing the codes inefficiently since we doesn't cost len when NITERS < VF. Leverage loop control of mask targets or rs6000 fixes the regression. Tested no regression. Ok for trunk ? PR target/113281 gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (costs::adjust_vect_cost_per_loop): New function. (costs::finish_cost): Adjust cost for LOOP LEN with NITERS < VF. * config/riscv/riscv-vector-costs.h: New function. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/pr113281-3.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113281-4.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113281-5.c: New test. 
--- gcc/config/riscv/riscv-vector-costs.cc| 57 +++ gcc/config/riscv/riscv-vector-costs.h | 2 + .../vect/costmodel/riscv/rvv/pr113281-3.c | 18 ++ .../vect/costmodel/riscv/rvv/pr113281-4.c | 18 ++ .../vect/costmodel/riscv/rvv/pr113281-5.c | 18 ++ 5 files changed, 113 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-3.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-4.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-5.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 090275c7efe..90ab93b7506 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -1097,9 +1097,66 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind, return record_stmt_cost (stmt_info, where, count * stmt_cost); } +/* For some target specific vectorization cost which can't be handled per stmt, + we check the requisite conditions and adjust the vectorization cost + accordingly if satisfied. One typical example is to model model and adjust + loop_len cost for known_lt (NITERS, VF). */ + +void +costs::adjust_vect_cost_per_loop (loop_vec_info loop_vinfo) +{ + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo) + && !LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)) +{ + /* In middle-end loop vectorizer, we don't count the loop_len cost in +vect_estimate_min_profitable_iters when NITERS < VF, that is, we only +count cost of len that we need to iterate loop more than once with VF. +It's correct for most of the cases: + +E.g. VF = [4, 4] + for (int i = 0; i < 3; i ++) +a[i] += b[i]; + +We don't need to cost MIN_EXPR or SELECT_VL for the case above. + +However, for some inefficient vectorized cases, it does use MIN_EXPR +to generate len. + +E.g. VF = [256, 256] + +Loop body: + # loop_len_110 = PHI <18(2), _119(11)> + ... + _117 = MIN_EXPR ; + _118 = 18 - _117; + _119 = MIN_EXPR <_118, POLY_INT_CST [256, 256]>; + ... 
+ +Epilogue: + ... + _112 = .VEC_EXTRACT (vect_patt_27.14_109, _111); + +We cost 1 unconditionally for this situation like other targets which +apply mask as the loop control. */ + rgroup_controls *rgc; + unsigned int num_vectors_m1; + unsigned int body_stmts = 0; + FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc) + if (rgc->type) + body_stmts += num_vectors_m1 + 1; + + add_stmt_cost (body_stmts, scalar_stmt, NULL, NULL, NULL_TREE, 0, +vect_body); +} +} + void costs::finish_cost (const vector_costs *scalar_costs) { + if (loop_vec_info loop_vinfo = dyn_cast (m_vinfo)) +{ + adjust_vect_cost_per_loop (loop_vinfo); +} vector_costs::finish_cost (scalar_costs); } diff --git a/gcc/config/riscv/riscv-vector-costs.h b/gcc/config/riscv/riscv-vector-costs.h index dc0d61f5d4a..4e2bbfd5ca9 100644 --- a/gcc/config/riscv/riscv-vector-costs.h +++ b/gcc/config/riscv/riscv-vector-costs.h @@ -96,6 +96,8 @@ private: V_REGS spills according to the analysis. */ bool m_has_unexpected_spills_p = false; void record_potential_une
[Committed] RISC-V: Add optimized dump check of VLS reduc tests
Add more dump checks to robustify the tests. Committed. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/vls/reduc-1.c: Add dump check. * gcc.target/riscv/rvv/autovec/vls/reduc-10.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-11.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-12.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-13.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-14.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-15.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-16.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-17.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-18.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-19.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-2.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-20.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-21.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-3.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-4.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-5.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-6.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-7.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-8.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-9.c: Ditto. 
--- .../gcc.target/riscv/rvv/autovec/vls/reduc-1.c | 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-10.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-11.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-12.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-13.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-14.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-15.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-16.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-17.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-18.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-19.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-2.c | 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-20.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-21.c| 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-3.c | 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-4.c | 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-5.c | 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-6.c | 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-7.c | 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-8.c | 14 +- .../gcc.target/riscv/rvv/autovec/vls/reduc-9.c | 14 +- 21 files changed, 273 insertions(+), 21 deletions(-) diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-1.c index 2db25a2b05d..b6d8e6a51ed 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-1.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-1.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8" } */ +/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -fdump-tree-optimized-details" } */ #include "def.h" @@ -29,3 +29,15 @@ DEF_REDUC_PLUS (uint8_t, 4096) /* { dg-final { scan-assembler-times {vredsum\.vs} 22 } } */ /* { dg-final { scan-assembler-not {csrr} } } */ +/* { dg-final { scan-tree-dump-not "1,1" 
"optimized" } } */ +/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */ +/* { dg-final { scan-tree-dump-not "4,4" "optimized" } } */ +/* { dg-final { scan-tree-dump-not "16,16" "optimized" } } */ +/* { dg-final { scan-tree-dump-not "32,32" "optimized" } } */ +/* { dg-final { scan-tree-dump-not "64,64" "optimized" } } */ +/* { dg-final { scan-tree-dump-not "128,128" "optimized" } } */ +/* { dg-final { scan-tree-dump-not "256,256" "optimized" } } */ +/* { dg-final { scan-tree-dump-not "512,512" "optimized" } } */ +/* { dg-final { scan-tree-dump-not "1024,1024" "optimized" } } */ +/* { dg-final { scan-tree-dump-not "2048,2048" "optimized" } } */ +/* { dg-final { scan-tree-dump-not "4096,4096" "optimized" } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-10.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-10.c index cdbbe11f611..22aace423cf 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-10.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-10.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8" } */ +/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autove
[Committed] RISC-V: Fix attributes bug configuration of ternary instructions
This patch fixes the following FAILs: Running target riscv-sim/-march=rv64gcv/-mabi=lp64d/-mcmodel=medlow/--param=riscv-autovec-preference=fixed-vlmax FAIL: gcc.c-torture/execute/pr68532.c -O0 execution test FAIL: gcc.c-torture/execute/pr68532.c -O1 execution test FAIL: gcc.c-torture/execute/pr68532.c -O2 execution test FAIL: gcc.c-torture/execute/pr68532.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions execution test FAIL: gcc.c-torture/execute/pr68532.c -O3 -g execution test FAIL: gcc.c-torture/execute/pr68532.c -Os execution test FAIL: gcc.c-torture/execute/pr68532.c -O2 -flto -fno-use-linker-plugin -flto-partition=none execution test Running target riscv-sim/-march=rv64gcv/-mabi=lp64d/-mcmodel=medlow/--param=riscv-autovec-lmul=m2/--param=riscv-autovec-preference=fixed-vlmax FAIL: gcc.dg/vect/pr60196-1.c execution test FAIL: gcc.dg/vect/pr60196-1.c -flto -ffat-lto-objects execution test Running target riscv-sim/-march=rv64gcv/-mabi=lp64d/-mcmodel=medlow/--param=riscv-autovec-preference=fixed-vlmax FAIL: gcc.dg/vect/pr60196-1.c execution test FAIL: gcc.dg/vect/pr60196-1.c -flto -ffat-lto-objects execution test Running target riscv-sim/-march=rv64gcv_zvl256b/-mabi=lp64d/-mcmodel=medlow/--param=riscv-autovec-preference=fixed-vlmax FAIL: gcc.dg/vect/pr60196-1.c execution test FAIL: gcc.dg/vect/pr60196-1.c -flto -ffat-lto-objects execution test The root cause is attributes of ternary intructions are incorrect which cause AVL prop PASS and VSETVL PASS behave incorrectly. Tested no regression and committed. PR target/113393 gcc/ChangeLog: * config/riscv/vector.md: Fix ternary attributes. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/pr113393-1.c: New test. * gcc.target/riscv/rvv/autovec/pr113393-2.c: New test. * gcc.target/riscv/rvv/autovec/pr113393-3.c: New test. 
--- gcc/config/riscv/vector.md| 42 +-- .../gcc.target/riscv/rvv/autovec/pr113393-1.c | 24 +++ .../gcc.target/riscv/rvv/autovec/pr113393-2.c | 29 + .../gcc.target/riscv/rvv/autovec/pr113393-3.c | 5 +++ 4 files changed, 79 insertions(+), 21 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113393-1.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113393-2.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113393-3.c diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md index c1a282a27b3..ee4ee059a50 100644 --- a/gcc/config/riscv/vector.md +++ b/gcc/config/riscv/vector.md @@ -715,7 +715,7 @@ (const_int 1) (eq_attr "type" "vimuladd,vfmuladd") - (const_int 5)] + (const_int 2)] (const_int INVALID_ATTRIBUTE))) ;; The index of operand[] represents the machine mode of the instruction. @@ -5308,7 +5308,7 @@ vmv.v.v\t%0,%2\;vmadd.vv\t%0,%3,%4%p1" [(set_attr "type" "vimuladd") (set_attr "mode" "") - (set_attr "merge_op_idx" "4") + (set_attr "merge_op_idx" "2") (set_attr "vl_op_idx" "5") (set (attr "ta") (symbol_ref "riscv_vector::get_ta(operands[6])")) (set (attr "ma") (symbol_ref "riscv_vector::get_ma(operands[7])")) @@ -5339,7 +5339,7 @@ vmv.v.v\t%0,%4\;vmacc.vv\t%0,%2,%3%p1" [(set_attr "type" "vimuladd") (set_attr "mode" "") - (set_attr "merge_op_idx" "2") + (set_attr "merge_op_idx" "4") (set_attr "vl_op_idx" "5") (set (attr "ta") (symbol_ref "riscv_vector::get_ta(operands[6])")) (set (attr "ma") (symbol_ref "riscv_vector::get_ma(operands[7])")) @@ -5392,7 +5392,7 @@ vmv.v.v\t%0,%3\;vmadd.vx\t%0,%2,%4%p1" [(set_attr "type" "vimuladd") (set_attr "mode" "") - (set_attr "merge_op_idx" "4") + (set_attr "merge_op_idx" "3") (set_attr "vl_op_idx" "5") (set (attr "ta") (symbol_ref "riscv_vector::get_ta(operands[6])")) (set (attr "ma") (symbol_ref "riscv_vector::get_ma(operands[7])")) @@ -5424,7 +5424,7 @@ vmv.v.v\t%0,%4\;vmacc.vx\t%0,%2,%3%p1" [(set_attr "type" "vimuladd") (set_attr "mode" "") - (set_attr 
"merge_op_idx" "2") + (set_attr "merge_op_idx" "4") (set_attr "vl_op_idx" "5") (set (attr "ta") (symbol_ref "riscv_vector::get_ta(operands[6])")) (set (attr "ma") (symbol_ref "riscv_vector::get_ma(operands[7])")) @@ -5492,7 +5492,7 @@ vmv.v.v\t%0,%2\;vmadd.vx\t%0,%2,%4%p1" [(set_attr "type" "vimuladd") (set_attr "mode" "") - (set_attr "merge_op_idx" "4") + (set_attr "merge_op_idx" "3") (set_attr "vl_op_idx" "5") (set (attr "ta") (symbol_ref "riscv_vector::get_ta(operands[6])")) (set (attr "ma") (symbol_ref "riscv_vector::get_ma(operands[7])")) @@ -5525,7 +5525,7 @@ vmv.v.v\t%0,%4\;vmacc.vx\t%0,%2,%3%p1" [(set_attr "type" "vimuladd") (set_attr "mode" "") - (set_attr "merge_op_idx" "2") + (set_attr "merge_op_idx" "4") (set_attr "vl_op_idx" "5") (set (attr "ta") (symbol_ref "riscv_vector
[PATCH] RISC-V: Fix regression (GCC-14 compare with GCC-13.2) of SHA256 from coremark-pro
This patch fixes -70% performance drop from GCC-13.2 to GCC-14 with -march=rv64gcv in real hardware. The root cause is incorrect cost model cause inefficient vectorization which makes us performance drop significantly. So this patch does: 1. Adjust vector to scalar cost by introducing v to scalar reg move. 2. Adjust vec_construct cost since we does spend NUNITS instructions to construct the vector. Tested on both RV32/RV64 no regression, ok for trunk ? PR target/113247 gcc/ChangeLog: * config/riscv/riscv-protos.h (struct regmove_vector_cost): Add vector to scalar regmove. * config/riscv/riscv-vector-costs.cc (adjust_stmt_cost): Ditto. * config/riscv/riscv.cc (riscv_builtin_vectorization_cost): Adjust vec_construct cost. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/vls/reduc-19.c: Adapt test. * gcc.target/riscv/rvv/autovec/vls/reduc-20.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/reduc-21.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c: New test. 
--- gcc/config/riscv/riscv-protos.h | 2 + gcc/config/riscv/riscv-vector-costs.cc| 3 + gcc/config/riscv/riscv.cc | 4 +- .../vect/costmodel/riscv/rvv/pr113247-1.c | 195 ++ .../vect/costmodel/riscv/rvv/pr113247-2.c | 6 + .../vect/costmodel/riscv/rvv/pr113247-3.c | 6 + .../vect/costmodel/riscv/rvv/pr113247-4.c | 6 + .../riscv/rvv/autovec/vls/reduc-19.c | 11 +- .../riscv/rvv/autovec/vls/reduc-20.c | 11 +- .../riscv/rvv/autovec/vls/reduc-21.c | 11 +- 10 files changed, 251 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 4f3b677f4f9..21f6dadf113 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -255,6 +255,8 @@ struct regmove_vector_cost { const int GR2VR; const int FR2VR; + const int VR2GR; + const int VR2FR; }; /* Cost for vector insn classes. */ diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 8adf5700890..298702d2807 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -1069,6 +1069,9 @@ adjust_stmt_cost (enum vect_cost_for_stmt kind, tree vectype, int stmt_cost) case scalar_to_vec: return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->FR2VR : costs->regmove->GR2VR); +case vec_to_scalar: + return stmt_cost += (FLOAT_TYPE_P (vectype) ? 
costs->regmove->VR2FR + : costs->regmove->VR2GR); default: break; } diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index ee1a57b321d..568db90a27d 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -395,6 +395,8 @@ static const scalable_vector_cost rvv_vla_vector_cost = { static const regmove_vector_cost rvv_regmove_vector_cost = { 2, /* GR2VR */ 2, /* FR2VR */ + 2, /* VR2GR */ + 2, /* VR2FR */ }; /* Generic costs for vector insn classes. It is supposed to be the vector cost @@ -10522,7 +10524,7 @@ riscv_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, return fp ? common_costs->fp_stmt_cost : common_costs->int_stmt_cost; case vec_construct: - return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype)) - 1; + return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype)); default: gcc_unreachable (); diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c new file mode 100644 index 000..0d09a624a00 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c @@ -0,0 +1,195 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param=riscv-autovec-lmul=dynamic" } */ + +#include + +#define Ch(x,y,z) (z ^ (x & (y ^ z))) +#define Maj(x,y,z) ((x & y) | (z & (x | y))) + +#define SHR(x, n)(x >> n) +#define ROTR(x,n)(SHR(x,n) | (x << (32 - n))) +#define S1(x)(ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25)) +#define S0(x)(ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22)) + +#define s1(x)(ROTR(x,17) ^ ROTR(x,19) ^ SHR(x,10)) +#define s0(x)(ROTR(x, 7) ^ ROTR(x,18) ^ SHR(x, 3)) + +#define SH
[PATCH] RISC-V: Adjust loop len by costing 1 when NITER < VF
Update in v2: Add dynmaic lmul test. This patch fixes the regression between GCC 13.2.0 and trunk GCC (GCC-14) GCC 13.2.0: lui a5,%hi(a) li a4,19 sb a4,%lo(a)(a5) li a0,0 ret Trunk GCC: vsetvli a5,zero,e8,mf2,ta,ma li a4,-32768 vid.v v1 vsetvli zero,zero,e16,m1,ta,ma addiw a4,a4,104 vmv.v.i v3,15 lui a1,%hi(a) li a0,19 vsetvli zero,zero,e8,mf2,ta,ma vadd.vi v1,v1,1 sb a0,%lo(a)(a1) vsetvli zero,zero,e16,m1,ta,ma vzext.vf2 v2,v1 vmv.v.x v1,a4 vminu.vvv2,v2,v3 vsrl.vv v1,v1,v2 vslidedown.vi v1,v1,17 vmv.x.s a0,v1 sneza0,a0 ret The root cause we are vectorizing the codes inefficiently since we doesn't cost len when NITERS < VF. Leverage loop control of mask targets or rs6000 fixes the regression. Tested no regression. Ok for trunk ? PR target/113281 gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (costs::adjust_vect_cost_per_loop): New function. (costs::finish_cost): Adjust cost for LOOP LEN with NITERS < VF. * config/riscv/riscv-vector-costs.h: New function. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/pr113281-3.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113281-4.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113281-5.c: New test. 
--- gcc/config/riscv/riscv-vector-costs.cc| 57 +++ gcc/config/riscv/riscv-vector-costs.h | 2 + .../vect/costmodel/riscv/rvv/pr113281-3.c | 18 ++ .../vect/costmodel/riscv/rvv/pr113281-4.c | 18 ++ .../vect/costmodel/riscv/rvv/pr113281-5.c | 18 ++ 5 files changed, 113 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-3.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-4.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-5.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 1c3708f23a0..8adf5700890 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -1110,9 +1110,66 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind, return record_stmt_cost (stmt_info, where, count * stmt_cost); } +/* For some target specific vectorization cost which can't be handled per stmt, + we check the requisite conditions and adjust the vectorization cost + accordingly if satisfied. One typical example is to model model and adjust + loop_len cost for known_lt (NITERS, VF). */ + +void +costs::adjust_vect_cost_per_loop (loop_vec_info loop_vinfo) +{ + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo) + && !LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo)) +{ + /* In middle-end loop vectorizer, we don't count the loop_len cost in +vect_estimate_min_profitable_iters when NITERS < VF, that is, we only +count cost of len that we need to iterate loop more than once with VF +(m_num_vector_iterations > 1). It's correct for most of the cases: + +E.g. VF = [4, 4] + for (int i = 0; i < 3; i ++) +a[i] += b[i]; + +We don't need to cost MIN_EXPR or SELECT_VL for the case above. + +However, for some inefficient vectorized cases, it does use MIN_EXPR +to generate len. + +E.g. VF = [256, 256] + +Loop body: + # loop_len_110 = PHI <18(2), _119(11)> + ... 
+ _117 = MIN_EXPR ; + _118 = 18 - _117; + _119 = MIN_EXPR <_118, POLY_INT_CST [256, 256]>; + ... + +Epilogue: + ... + _112 = .VEC_EXTRACT (vect_patt_27.14_109, _111); + +We cost 1 unconditionally for this situation like other targets which +apply mask as the loop control. */ + rgroup_controls *rgc; + unsigned int num_vectors_m1; + unsigned int body_stmts = 0; + FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc) + if (rgc->type) + body_stmts += num_vectors_m1 + 1; + + add_stmt_cost (body_stmts, scalar_stmt, NULL, NULL, NULL_TREE, 0, +vect_body); +} +} + void costs::finish_cost (const vector_costs *scalar_costs) { + if (loop_vec_info loop_vinfo = dyn_cast (m_vinfo)) +{ + adjust_vect_cost_per_loop (loop_vinfo); +} vector_costs::finish_cost (scalar_costs); } diff --git a/gcc/config/riscv/riscv-vector-costs.h b/gcc/config/riscv/riscv-vector-costs.h index 9bf041bb65c..3defd45fd4c 100644 --- a/gcc/config/riscv/riscv-vector-costs.h +++ b/gcc/config/riscv/riscv-vector-costs.h @@ -101,6 +101,8 @@ private: V_REGS spills according to the analysis. */ bool m_has_unexpected_spills_p = false; void record_potential_unexpected_spills (loop_vec_info); + + void
[PATCH] RISC-V: Adjust loop len by costing 1 when NITERS < VF [GCC 14 regression]
This patch fixes the regression between GCC 13.2.0 and trunk GCC (GCC-14) GCC 13.2.0: lui a5,%hi(a) li a4,19 sb a4,%lo(a)(a5) li a0,0 ret Trunk GCC: vsetvli a5,zero,e8,mf2,ta,ma li a4,-32768 vid.v v1 vsetvli zero,zero,e16,m1,ta,ma addiw a4,a4,104 vmv.v.i v3,15 lui a1,%hi(a) li a0,19 vsetvli zero,zero,e8,mf2,ta,ma vadd.vi v1,v1,1 sb a0,%lo(a)(a1) vsetvli zero,zero,e16,m1,ta,ma vzext.vf2 v2,v1 vmv.v.x v1,a4 vminu.vvv2,v2,v3 vsrl.vv v1,v1,v2 vslidedown.vi v1,v1,17 vmv.x.s a0,v1 sneza0,a0 ret The root cause we are vectorizing the codes inefficiently since we doesn't cost len when NITERS < VF. Leverage loop control of mask targets or rs6000 fixes the regression. Tested no regression. Ok for trunk ? PR target/113281 gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (costs::adjust_vect_cost_per_loop): New function. (costs::finish_cost): Adjust cost * config/riscv/riscv-vector-costs.h: New function. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/pr113281-3.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113281-4.c: New test. --- gcc/config/riscv/riscv-vector-costs.cc| 61 +++ gcc/config/riscv/riscv-vector-costs.h | 2 + .../vect/costmodel/riscv/rvv/pr113281-3.c | 18 ++ .../vect/costmodel/riscv/rvv/pr113281-4.c | 18 ++ 4 files changed, 99 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-3.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-4.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 1c3708f23a0..9c0b9a874de 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -1110,9 +1110,70 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind, return record_stmt_cost (stmt_info, where, count * stmt_cost); } +/* For some target specific vectorization cost which can't be handled per stmt, + we check the requisite conditions and adjust the vectorization cost + accordingly if satisfied. 
One typical example is to model model and adjust + loop_len cost for known_lt (NITERS, VF). */ + +void +costs::adjust_vect_cost_per_loop (loop_vec_info loop_vinfo) +{ + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo) + && !LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) + && m_num_vector_iterations == 1 + && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo), + LOOP_VINFO_VECT_FACTOR (loop_vinfo))) +{ + /* In middle-end loop vectorizer, we don't count the loop_len cost in +vect_estimate_min_profitable_iters when NITERS < VF, that is, we only +count cost of len that we need to iterate loop more than once with VF +(m_num_vector_iterations > 1). It's correct for most of the cases: + +E.g. VF = [4, 4] + for (int i = 0; i < 3; i ++) +a[i] += b[i]; + +We don't need to cost MIN_EXPR or SELECT_VL for the case above. + +However, for some inefficient vectorized cases, it does use MIN_EXPR +to generate len. + +E.g. VF = [256, 256] + +Loop body: + # loop_len_110 = PHI <18(2), _119(11)> + ... + _117 = MIN_EXPR ; + _118 = 18 - _117; + _119 = MIN_EXPR <_118, POLY_INT_CST [256, 256]>; + ... + +Epilogue: + ... + _112 = .VEC_EXTRACT (vect_patt_27.14_109, _111); + +We cost 1 unconditionally for this situation like other targets which +apply mask as the loop control. 
*/ + rgroup_controls *rgc; + unsigned int num_vectors_m1; + unsigned int body_stmts = 0; + FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc) + if (rgc->type) + body_stmts += num_vectors_m1 + 1; + + add_stmt_cost (body_stmts, scalar_stmt, NULL, NULL, NULL_TREE, 0, +vect_body); +} +} + void costs::finish_cost (const vector_costs *scalar_costs) { + if (loop_vec_info loop_vinfo = dyn_cast (m_vinfo)) +{ + adjust_vect_cost_per_loop (loop_vinfo); +} vector_costs::finish_cost (scalar_costs); } diff --git a/gcc/config/riscv/riscv-vector-costs.h b/gcc/config/riscv/riscv-vector-costs.h index 9bf041bb65c..3defd45fd4c 100644 --- a/gcc/config/riscv/riscv-vector-costs.h +++ b/gcc/config/riscv/riscv-vector-costs.h @@ -101,6 +101,8 @@ private: V_REGS spills according to the analysis. */ bool m_has_unexpected_spills_p = false; void record_potential_unexpected_spills (loop_vec_info); + + void adjust_vect_cost_per_loop (loop_vec_info); }; } // namespace riscv_ve
[PATCH V3] RISC-V: Adjust scalar_to_vec cost
1. Introduce vector regmove new tune info. 2. Adjust scalar_to_vec cost in add_stmt_cost. We will get optimal codegen after this patch with -march=rv64gcv_zvl256b: lui a5,%hi(a) li a4,19 sb a4,%lo(a)(a5) li a0,0 ret Tested on both RV32/RV64 no regression, Ok for trunk ? PR target/113281 gcc/ChangeLog: * config/riscv/riscv-protos.h (struct regmove_vector_cost): New struct. (struct cpu_vector_cost): Add regmove struct. (get_vector_costs): Export as global. * config/riscv/riscv-vector-costs.cc (adjust_stmt_cost): Adjust scalar_to_vec cost. (costs::add_stmt_cost): Ditto. * config/riscv/riscv.cc (get_common_costs): Export global function. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/pr113209.c: Adapt test. * gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113281-2.c: New test. --- gcc/config/riscv/riscv-protos.h | 11 gcc/config/riscv/riscv-vector-costs.cc| 23 + gcc/config/riscv/riscv.cc | 25 --- .../vect/costmodel/riscv/rvv/pr113281-1.c | 18 + .../vect/costmodel/riscv/rvv/pr113281-2.c | 18 + .../gcc.target/riscv/rvv/autovec/pr113209.c | 2 +- 6 files changed, 87 insertions(+), 10 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-2.c diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index e8c54c5be50..4f3b677f4f9 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -250,6 +250,13 @@ struct scalable_vector_cost : common_vector_cost E.g. fold_left reduction cost, lanes load/store cost, ..., etc. */ }; +/* Additional costs for register copies. Cost is for one register. */ +struct regmove_vector_cost +{ + const int GR2VR; + const int FR2VR; +}; + /* Cost for vector insn classes. */ struct cpu_vector_cost { @@ -276,6 +283,9 @@ struct cpu_vector_cost /* Cost of an VLA modes operations. 
*/ const scalable_vector_cost *vla; + + /* Cost of vector register move operations. */ + const regmove_vector_cost *regmove; }; /* Routines implemented in riscv-selftests.cc. */ @@ -764,5 +774,6 @@ struct riscv_tune_info { const struct riscv_tune_info * riscv_parse_tune (const char *, bool); +const cpu_vector_cost *get_vector_costs (); #endif /* ! GCC_RISCV_PROTOS_H */ diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 58ec0b9b503..1c3708f23a0 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -1055,6 +1055,26 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const return vector_costs::better_main_loop_than_p (other); } +/* Adjust vectorization cost after calling riscv_builtin_vectorization_cost. + For some statement, we would like to further fine-grain tweak the cost on + top of riscv_builtin_vectorization_cost handling which doesn't have any + information on statement operation codes etc. */ + +static unsigned +adjust_stmt_cost (enum vect_cost_for_stmt kind, tree vectype, int stmt_cost) +{ + const cpu_vector_cost *costs = get_vector_costs (); + switch (kind) +{ +case scalar_to_vec: + return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->FR2VR + : costs->regmove->GR2VR); +default: + break; +} + return stmt_cost; +} + unsigned costs::add_stmt_cost (int count, vect_cost_for_stmt kind, stmt_vec_info stmt_info, slp_tree, tree vectype, @@ -1082,6 +1102,9 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind, as one iteration of the VLA loop. 
*/ if (where == vect_body && m_unrolled_vls_niters) m_unrolled_vls_stmts += count * m_unrolled_vls_niters; + + if (vectype) + stmt_cost = adjust_stmt_cost (kind, vectype, stmt_cost); } return record_stmt_cost (stmt_info, where, count * stmt_cost); diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index f829014a589..ee1a57b321d 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -391,17 +391,24 @@ static const scalable_vector_cost rvv_vla_vector_cost = { }, }; +/* RVV register move cost. */ +static const regmove_vector_cost rvv_regmove_vector_cost = { + 2, /* GR2VR */ + 2, /* FR2VR */ +}; + /* Generic costs for vector insn classes. It is supposed to be the vector cost models used by default if no other cost model was specified. */ static const struct cpu_vector_cost generic_vector_cost = { - 1, /* scalar_int_stmt_cost */ - 1, /* scalar_fp_stmt_cost */ - 1, /* sca
[Committed] RISC-V: Enhance a testcase
This test should pass no matter how we adjust cost model. Remove -fno-vect-cost-model. Committed. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/fold-min-poly.c: Remove -fno-vect-cost-model --- gcc/testsuite/gcc.target/riscv/rvv/autovec/fold-min-poly.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/fold-min-poly.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/fold-min-poly.c index de4c472c76e..3f524dba868 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/fold-min-poly.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/fold-min-poly.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options " -march=rv64gcv_zvl128b -mabi=lp64d -O3 --param riscv-autovec-preference=scalable --param riscv-autovec-lmul=m1 -fno-vect-cost-model" } */ +/* { dg-options " -march=rv64gcv_zvl128b -mabi=lp64d -O3 --param riscv-autovec-preference=scalable --param riscv-autovec-lmul=m1" } */ void foo1 (int* restrict a, int* restrict b, int n) { -- 2.36.3
[PATCH V2] RISC-V: Adjust scalar_to_vec cost accurately
1. This patch set scalar_to_vec cost as 2 instead 1 since scalar move instruction is slightly more costly than normal rvv instructions (e.g. vadd.vv). 2. Adjust scalar_to_vec cost accurately according to the splat value, for example, a value like 32872, needs 2 more scalar instructions: so the cost = 2 (scalar instructions) + 2 (scalar move). We adjust the cost like this since it doesn need such many instructions in vectorized codes, wheras they are not needed in scalar codes. After this patch, no matter -march=rv64gcv_zvl256b or -march=rv64gcv_zvl4096b. We have optimal codgen: lui a5,%hi(a) li a4,19 sb a4,%lo(a)(a5) li a0,0 ret PR target/113281 gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (adjust_stmt_cost): Adjust scalar_to_vec cost accurately. (costs::add_stmt_cost): Ditto. * config/riscv/riscv.cc: Ditto. * config/riscv/t-riscv: Ditto. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/pr113209.c: Adapt test. * gcc.target/riscv/rvv/autovec/zve32f-1.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113281-2.c: New test. --- gcc/config/riscv/riscv-vector-costs.cc| 50 ++- gcc/config/riscv/riscv.cc | 4 +- gcc/config/riscv/t-riscv | 2 +- .../vect/costmodel/riscv/rvv/pr113281-1.c | 18 +++ .../vect/costmodel/riscv/rvv/pr113281-2.c | 18 +++ .../gcc.target/riscv/rvv/autovec/pr113209.c | 2 +- .../gcc.target/riscv/rvv/autovec/zve32f-1.c | 2 +- 7 files changed, 90 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-2.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 58ec0b9b503..fc377435e53 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -42,6 +42,7 @@ along with GCC; see the file COPYING3. 
If not see #include "backend.h" #include "tree-data-ref.h" #include "tree-ssa-loop-niter.h" +#include "emit-rtl.h" /* This file should be included last. */ #include "riscv-vector-costs.h" @@ -1055,6 +1056,50 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const return vector_costs::better_main_loop_than_p (other); } +/* Adjust vectorization cost after calling + targetm.vectorize.builtin_vectorization_cost. For some statement, we would + like to further fine-grain tweak the cost on top of + targetm.vectorize.builtin_vectorization_cost handling which doesn't have any + information on statement operation codes etc. */ + +static unsigned +adjust_stmt_cost (enum vect_cost_for_stmt kind, + struct _stmt_vec_info *stmt_info, int count, int stmt_cost) +{ + gimple *stmt = stmt_info->stmt; + switch (kind) +{ + case scalar_to_vec: { + stmt_cost *= count; + gcall *call = dyn_cast (stmt); + /* Adjust cost by counting the scalar value initialization. */ + unsigned int num + = call ? gimple_call_num_args (call) : gimple_num_ops (stmt); + unsigned int start = call ? 0 : 1; + + for (unsigned int i = start; i < num; i++) + { + tree op = call ? gimple_call_arg (call, i) : gimple_op (stmt, i); + if (TREE_CODE (op) == INTEGER_CST) + { + HOST_WIDE_INT value = tree_fits_shwi_p (op) ? tree_to_shwi (op) + : tree_to_uhwi (op); + /* We don't need to count scalar costs if it + is in range of [-16, 15] since we can use + vmv.v.i. */ + if (!IN_RANGE (value, -16, 15)) + stmt_cost += riscv_const_insns (gen_int_mode (value, Pmode)); + } + /* TODO: We don't count CONST_POLY_INT value for now. */ + } + return stmt_cost; + } +default: + break; +} + return count * stmt_cost; +} + unsigned costs::add_stmt_cost (int count, vect_cost_for_stmt kind, stmt_vec_info stmt_info, slp_tree, tree vectype, @@ -1082,9 +1127,12 @@ costs::add_stmt_cost (int count, vect_cost_for_stmt kind, as one iteration of the VLA loop. 
*/ if (where == vect_body && m_unrolled_vls_niters) m_unrolled_vls_stmts += count * m_unrolled_vls_niters; + + if (vectype) + stmt_cost = adjust_stmt_cost (kind, stmt_info, count, stmt_cost); } - return record_stmt_cost (stmt_info, where, count * stmt_cost); + return record_stmt_cost (stmt_info, where, stmt_cost); } void diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index df9799d9c5e..a14fb36817a 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc
[PATCH] RISC-V: Increase scalar_to_vec_cost from 1 to 3
This patch fixes the following inefficient vectorized codes: vsetvli a5,zero,e8,mf2,ta,ma li a2,17 vid.v v1 li a4,-32768 vsetvli zero,zero,e16,m1,ta,ma addiw a4,a4,104 vmv.v.i v3,15 lui a1,%hi(a) li a0,19 vsetvli zero,zero,e8,mf2,ta,ma vadd.vx v1,v1,a2 sb a0,%lo(a)(a1) vsetvli zero,zero,e16,m1,ta,ma vzext.vf2 v2,v1 vmv.v.x v1,a4 vminu.vvv2,v2,v3 vsrl.vv v1,v1,v2 vslidedown.vi v1,v1,1 vmv.x.s a0,v1 sneza0,a0 ret The reason is scalar_to_vec_cost is too low. Consider in VEC_SET, we always have a slide + scalar move instruction, scalar_to_vec_cost = 1 (current cost) is not reasonable. I tried to set it as 2 but failed fix this case, that is, I need to set it as 3 to fix this case. No matter scalar move or slide instruction, I believe they are more costly than normal vector instructions (e.g. vadd.vv). So set it as 3 looks reasonable to me. After this patch: lui a5,%hi(a) li a4,19 sb a4,%lo(a)(a5) li a0,0 ret Tested on both RV32/RV64 no regression, Ok for trunk ? PR target/113281 gcc/ChangeLog: * config/riscv/riscv.cc: Set scalar_to_vec_cost as 3. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/pr113209.c: Adapt test. * gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c: New test. 
--- gcc/config/riscv/riscv.cc | 4 ++-- .../vect/costmodel/riscv/rvv/pr113281-1.c | 18 ++ .../gcc.target/riscv/rvv/autovec/pr113209.c| 2 +- 3 files changed, 21 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index df9799d9c5e..bcfb3c15a39 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -366,7 +366,7 @@ static const common_vector_cost rvv_vls_vector_cost = { 1, /* gather_load_cost */ 1, /* scatter_store_cost */ 1, /* vec_to_scalar_cost */ - 1, /* scalar_to_vec_cost */ + 3, /* scalar_to_vec_cost */ 1, /* permute_cost */ 1, /* align_load_cost */ 1, /* align_store_cost */ @@ -382,7 +382,7 @@ static const scalable_vector_cost rvv_vla_vector_cost = { 1, /* gather_load_cost */ 1, /* scatter_store_cost */ 1, /* vec_to_scalar_cost */ -1, /* scalar_to_vec_cost */ +3, /* scalar_to_vec_cost */ 1, /* permute_cost */ 1, /* align_load_cost */ 1, /* align_store_cost */ diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c new file mode 100644 index 000..331cf961a1f --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113281-1.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv_zvl256b -mabi=lp64d -O3 -ftree-vectorize -fdump-tree-vect-details" } */ + +unsigned char a; + +int main() { + short b = a = 0; + for (; a != 19; a++) +if (a) + b = 32872 >> a; + + if (b == 0) +return 0; + else +return 1; +} + +/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113209.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113209.c index 081ee369394..70aae151000 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113209.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113209.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options 
"-march=rv64gcv_zvl256b -mabi=lp64d -O3" } */ +/* { dg-options "-march=rv64gcv_zvl256b -mabi=lp64d -O3 -fno-vect-cost-model" } */ int b, c, d, f, i, a; int e[1] = {0}; -- 2.36.3
[PATCH] RISC-V: VLA preempts VLS on unknown NITERS loop
This patch fixes the known issues on SLP cases: ble a2,zero,.L11 addiw t1,a2,-1 li a5,15 bleut1,a5,.L9 srliw a7,t1,4 sllia7,a7,7 lui t3,%hi(.LANCHOR0) lui a6,%hi(.LANCHOR0+128) addit3,t3,%lo(.LANCHOR0) li a4,128 addia6,a6,%lo(.LANCHOR0+128) add a7,a7,a0 addia3,a1,37 mv a5,a0 vsetvli zero,a4,e8,m8,ta,ma vle8.v v24,0(t3) vle8.v v16,0(a6) .L4: li a6,128 vle8.v v0,0(a3) vrgather.vv v8,v0,v24 vadd.vv v8,v8,v16 vse8.v v8,0(a5) add a5,a5,a6 add a3,a3,a6 bne a5,a7,.L4 andia5,t1,-16 mv t1,a5 .L3: subwa2,a2,a5 li a4,1 beq a2,a4,.L5 sllia5,a5,32 srlia5,a5,32 addiw a2,a2,-1 sllia5,a5,3 csrra4,vlenb sllia6,a2,32 addit3,a5,37 srlia3,a6,29 sllia4,a4,2 add t3,a1,t3 add a5,a0,a5 mv t5,a3 bgtua3,a4,.L14 .L6: li a4,50790400 addia4,a4,1541 li a6,67633152 addia6,a6,513 sllia4,a4,32 add a4,a4,a6 vsetvli t4,zero,e64,m4,ta,ma vmv.v.x v16,a4 vsetvli a6,zero,e16,m8,ta,ma vid.v v8 vsetvli zero,t5,e8,m4,ta,ma vle8.v v20,0(t3) vsetvli a6,zero,e16,m8,ta,ma csrra7,vlenb vand.vi v8,v8,-8 vsetvli zero,zero,e8,m4,ta,ma sllia4,a7,2 vrgatherei16.vv v4,v20,v8 vadd.vv v4,v4,v16 vsetvli zero,t5,e8,m4,ta,ma vse8.v v4,0(a5) bgtua3,a4,.L15 .L7: addwt1,a2,t1 .L5: slliw a5,t1,3 add a1,a1,a5 lui a4,%hi(.LC2) add a0,a0,a5 lbu a3,37(a1) addia5,a4,%lo(.LC2) vsetivlizero,8,e8,mf2,ta,ma vmv.v.x v1,a3 vle8.v v2,0(a5) vadd.vv v1,v1,v2 vse8.v v1,0(a0) .L11: ret .L15: sub a3,a3,a4 bleua3,a4,.L8 mv a3,a4 .L8: li a7,50790400 csrra4,vlenb sllia4,a4,2 addia7,a7,1541 li t4,67633152 add t3,t3,a4 vsetvli zero,a3,e8,m4,ta,ma sllia7,a7,32 addit4,t4,513 vle8.v v20,0(t3) add a4,a5,a4 add a7,a7,t4 vsetvli a5,zero,e64,m4,ta,ma vmv.v.x v16,a7 vsetvli a6,zero,e16,m8,ta,ma vid.v v8 vand.vi v8,v8,-8 vsetvli zero,zero,e8,m4,ta,ma vrgatherei16.vv v4,v20,v8 vadd.vv v4,v4,v16 vsetvli zero,a3,e8,m4,ta,ma vse8.v v4,0(a4) j .L7 .L14: mv t5,a4 j .L6 .L9: li a5,0 li t1,0 j .L3 The vectorization codegen is quite inefficient since we choose a VLS modes to vectorize the loop body with epilogue choosing a VLA modes. 
cost.c:6:21: note: * Choosing vector mode V128QI cost.c:6:21: note: * Choosing epilogue vector mode RVVM4QI As we known, in RVV side, we have VLA modes and VLS modes. VLAmodes support partial vectors wheras VLSmodes support full vectors. The goal we add VLSmodes is to improve the codegen of known NITERS or SLP codes. If NITERS is unknown, that is i < n, n is unknown. We will always have partial vectors vectorization. It can be loop body or epilogue. In this case, It's always more efficient to apply VLA partial vectorization on loop body which doesn't have epilogue. After this patch: f: ble a2,zero,.L7 li a5,1 beq a2,a5,.L5 li a6,50790400 addia6,a6,1541 li a4,67633152 addia4,a4,513 csrra5,vlenb addiw a2,a2,-1 sllia6,a6,32 add a6,a6,a4 sllia5,a5,2 sllia4,a2,32 vsetvli t1,zero,e64,m4,ta,ma srlia3,a4,29 neg t4,a5 addia7,a1,37 mv a4,a0 vmv.v.x v12,a6 vsetvli t3,zero,e16,m8,ta,ma vid.v v16 vand.vi v16,v16,-8 .L4: minua6,a3,a5 vsetvli zero,a6,e8,m4,ta,ma vle8.v v8,0(a7) vsetvli t3,zero,e8,m4,ta,ma mv t1,a3 vrgatherei16.vv v4,v8,v16 vsetvli zero,a6,e8,m4,ta,ma vadd.vv v4,v4,v12 vse8.v v4,0(a4) add a7,a7,a5 add a4,a4,a5 add a3,a3,t4 bgtut1,a5,.L4 .L3: slliw a2,a2,3 add a1,a1,a2 lui a5,%hi(.LC0) lbu a4,37(a1) add a0,a0,a2 addia5,a5,%lo(.LC0) vsetivlizero,8,e8,mf2,ta,ma vmv.v.x v1,a4 vle8.v v2,0(a5) vadd.vv v1,v1
[PATCH V2] RISC-V: Switch RVV cost model.
This patch is preparing patch for the following cost model tweak. Since we don't have vector cost model in default tune info (rocket), we set the cost model default as generic cost model by default. The reason we want to switch to generic vector cost model is the default cost model generates inferior codegen for various benchmarks. For example, PR113247, we have performance bug that we end up having over 70% performance drop of SHA256. Currently, no matter how we adapt cost model, we are not able to fix the performance bug since we always use default cost model by default. Also, tweak the generic cost model back to default cost model since we have some FAILs in current tests. After this patch, we (me an Robin) can work on cost model tunning together to improve performane in various benchmarks. Tested on both RV32 and RV64, ok for trunk ? gcc/ChangeLog: * config/riscv/riscv.cc (get_common_costs): Switch RVV cost model. (get_vector_costs): Ditto. (riscv_builtin_vectorization_cost): Ditto. --- gcc/config/riscv/riscv.cc | 144 -- 1 file changed, 75 insertions(+), 69 deletions(-) diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 32183d63180..cca01fd54d9 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -352,48 +352,49 @@ const enum reg_class riscv_regno_to_class[FIRST_PSEUDO_REGISTER] = { VD_REGS, VD_REGS,VD_REGS,VD_REGS, }; -/* Generic costs for VLS vector operations. */ -static const common_vector_cost generic_vls_vector_cost = { +/* RVV costs for VLS vector operations. 
*/ +static const common_vector_cost rvv_vls_vector_cost = { 1, /* int_stmt_cost */ 1, /* fp_stmt_cost */ 1, /* gather_load_cost */ 1, /* scatter_store_cost */ - 2, /* vec_to_scalar_cost */ + 1, /* vec_to_scalar_cost */ 1, /* scalar_to_vec_cost */ - 2, /* permute_cost */ + 1, /* permute_cost */ 1, /* align_load_cost */ 1, /* align_store_cost */ - 1, /* unalign_load_cost */ - 1, /* unalign_store_cost */ + 2, /* unalign_load_cost */ + 2, /* unalign_store_cost */ }; -/* Generic costs for VLA vector operations. */ -static const scalable_vector_cost generic_vla_vector_cost = { +/* RVV costs for VLA vector operations. */ +static const scalable_vector_cost rvv_vla_vector_cost = { { 1, /* int_stmt_cost */ 1, /* fp_stmt_cost */ 1, /* gather_load_cost */ 1, /* scatter_store_cost */ -2, /* vec_to_scalar_cost */ +1, /* vec_to_scalar_cost */ 1, /* scalar_to_vec_cost */ -2, /* permute_cost */ +1, /* permute_cost */ 1, /* align_load_cost */ 1, /* align_store_cost */ -1, /* unalign_load_cost */ -1, /* unalign_store_cost */ +2, /* unalign_load_cost */ +2, /* unalign_store_cost */ }, }; -/* Generic costs for vector insn classes. */ +/* Generic costs for vector insn classes. It is supposed to be the vector cost + models used by default if no other cost model was specified. */ static const struct cpu_vector_cost generic_vector_cost = { - 1, /* scalar_int_stmt_cost */ - 1, /* scalar_fp_stmt_cost */ - 1, /* scalar_load_cost */ - 1, /* scalar_store_cost */ - 3, /* cond_taken_branch_cost */ - 1, /* cond_not_taken_branch_cost */ - &generic_vls_vector_cost, /* vls */ - &generic_vla_vector_cost, /* vla */ + 1, /* scalar_int_stmt_cost */ + 1, /* scalar_fp_stmt_cost */ + 1, /* scalar_load_cost */ + 1, /* scalar_store_cost */ + 3, /* cond_taken_branch_cost */ + 1, /* cond_not_taken_branch_cost */ + &rvv_vls_vector_cost, /* vls */ + &rvv_vla_vector_cost, /* vla */ }; /* Costs to use when optimizing for rocket. 
*/ @@ -10372,11 +10373,10 @@ riscv_frame_pointer_required (void) return riscv_save_frame_pointer && !crtl->is_leaf; } -/* Return the appropriate common costs for vectors of type VECTYPE. */ +/* Return the appropriate common costs according to VECTYPE from COSTS. */ static const common_vector_cost * -get_common_costs (tree vectype) +get_common_costs (const cpu_vector_cost *costs, tree vectype) { - const cpu_vector_cost *costs = tune_param->vec_costs; gcc_assert (costs); if (vectype && riscv_v_ext_vls_mode_p (TYPE_MODE (vectype))) @@ -10384,78 +10384,84 @@ get_common_costs (tree vectype) return costs->vla; } +/* Return the CPU vector costs according to -mtune if tune info has non-NULL + vector cost. Otherwide, return the default generic vector costs. */ +static const cpu_vector_cost * +get_vector_costs () +{ + const cpu_vector_cost *costs = tune_param->vec_costs; + if (!costs) +return &generic_vector_cost; + return costs; +} + /* Implement targetm.vectorize.builtin_vector
[PATCH] RISC-V: Switch RVV cost model to generic vector cost model
This patch is preparing patch for the following cost model tweak. Since we don't have vector cost model in default tune info (rocket), we set the cost model default as generic cost model by default. The reason we want to switch to generic vector cost model is the default cost model generates inferior codegen for various benchmarks. For example, PR113247, we have performance bug that we end up having over 70% performance drop of SHA256. Currently, no matter how we adapt cost model, we are not able to fix the performance bug since we always use default cost model by default. Also, tweak the generic cost model back to default cost model since we have some FAILs in current tests. After this patch, we (me an Robin) can work on cost model tunning together to improve performane in various benchmarks. Tested on both RV32 and RV64, ok for trunk ? gcc/ChangeLog: * config/riscv/riscv.cc (get_common_costs): Switch RVV cost model. (get_vector_costs): Ditto. (riscv_builtin_vectorization_cost): Ditto. --- gcc/config/riscv/riscv.cc | 117 -- 1 file changed, 61 insertions(+), 56 deletions(-) diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 32183d63180..d72058039ce 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -358,13 +358,13 @@ static const common_vector_cost generic_vls_vector_cost = { 1, /* fp_stmt_cost */ 1, /* gather_load_cost */ 1, /* scatter_store_cost */ - 2, /* vec_to_scalar_cost */ + 1, /* vec_to_scalar_cost */ 1, /* scalar_to_vec_cost */ - 2, /* permute_cost */ + 1, /* permute_cost */ 1, /* align_load_cost */ 1, /* align_store_cost */ - 1, /* unalign_load_cost */ - 1, /* unalign_store_cost */ + 2, /* unalign_load_cost */ + 2, /* unalign_store_cost */ }; /* Generic costs for VLA vector operations. 
*/ @@ -374,13 +374,13 @@ static const scalable_vector_cost generic_vla_vector_cost = { 1, /* fp_stmt_cost */ 1, /* gather_load_cost */ 1, /* scatter_store_cost */ -2, /* vec_to_scalar_cost */ +1, /* vec_to_scalar_cost */ 1, /* scalar_to_vec_cost */ -2, /* permute_cost */ +1, /* permute_cost */ 1, /* align_load_cost */ 1, /* align_store_cost */ -1, /* unalign_load_cost */ -1, /* unalign_store_cost */ +2, /* unalign_load_cost */ +2, /* unalign_store_cost */ }, }; @@ -10372,11 +10372,10 @@ riscv_frame_pointer_required (void) return riscv_save_frame_pointer && !crtl->is_leaf; } -/* Return the appropriate common costs for vectors of type VECTYPE. */ +/* Return the appropriate common costs according to VECTYPE from COSTS. */ static const common_vector_cost * -get_common_costs (tree vectype) +get_common_costs (const cpu_vector_cost *costs, tree vectype) { - const cpu_vector_cost *costs = tune_param->vec_costs; gcc_assert (costs); if (vectype && riscv_v_ext_vls_mode_p (TYPE_MODE (vectype))) @@ -10384,78 +10383,84 @@ get_common_costs (tree vectype) return costs->vla; } +/* Return the CPU vector costs according to -mtune if tune info has non-NULL + vector cost. Otherwide, return the default generic vector costs. */ +static const cpu_vector_cost * +get_vector_costs () +{ + const cpu_vector_cost *costs = tune_param->vec_costs; + if (!costs) +return &generic_vector_cost; + return costs; +} + /* Implement targetm.vectorize.builtin_vectorization_cost. 
*/ static int riscv_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, tree vectype, int misalign ATTRIBUTE_UNUSED) { - unsigned elements; - const cpu_vector_cost *costs = tune_param->vec_costs; + const cpu_vector_cost *costs = get_vector_costs (); bool fp = false; if (vectype != NULL) fp = FLOAT_TYPE_P (vectype); - if (costs != NULL) + const common_vector_cost *common_costs = get_common_costs (costs, vectype); + gcc_assert (common_costs != NULL); + switch (type_of_cost) { - const common_vector_cost *common_costs = get_common_costs (vectype); - gcc_assert (common_costs != NULL); - switch (type_of_cost) - { - case scalar_stmt: - return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost; +case scalar_stmt: + return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost; - case scalar_load: - return costs->scalar_load_cost; +case scalar_load: + return costs->scalar_load_cost; - case scalar_store: - return costs->scalar_store_cost; +case scalar_store: + return costs->scalar_store_cost; - case vector_stmt: - return fp ? common_costs->fp_stmt_cost : common_costs->int_stmt_cost; +case vector_stmt: + return fp ? common_costs->fp_stmt_cost : common_costs->int_stmt_cost; - case vector_load: - return common_costs->align_load_cost; +case vector_load: + return common_costs->
[PATCH] RISC-V: Refine unsigned avg_floor/avg_ceil
This patch is inspired by LLVM patches: https://github.com/llvm/llvm-project/pull/76550 https://github.com/llvm/llvm-project/pull/77473 Use vaaddu for AVG vectorization. Before this patch: vsetivlizero,8,e8,mf2,ta,ma vle8.v v3,0(a1) vle8.v v2,0(a2) vwaddu.vvv1,v3,v2 vsetvli zero,zero,e16,m1,ta,ma vadd.vi v1,v1,1 vsetvli zero,zero,e8,mf2,ta,ma vnsrl.wiv1,v1,1 vse8.v v1,0(a0) ret After this patch: vsetivlizero,8,e8,mf2,ta,ma csrwi vxrm,0 vle8.v v1,0(a1) vle8.v v2,0(a2) vaaddu.vv v1,v1,v2 vse8.v v1,0(a0) ret Note on signed averaging addition Based on the rvv spec, there is also a variant for signed averaging addition called vaadd. But AFAIU, no matter in which rounding mode, we cannot achieve the semantic of signed averaging addition through vaadd. Thus this patch only introduces vaaddu. More details in: https://github.com/riscv/riscv-v-spec/issues/935 https://github.com/riscv/riscv-v-spec/issues/934 Tested on both RV32 and RV64 no regression. Ok for trunk ? gcc/ChangeLog: * config/riscv/autovec.md (avg3_floor): Remove. (avg3_floor): New pattern. (avg3_ceil): Remove. (avg3_ceil): New pattern. (uavg3_floor): Ditto. (uavg3_ceil): Ditto. * config/riscv/riscv-protos.h (enum insn_flags): Add for average addition. (enum insn_type): Ditto. * config/riscv/riscv-v.cc: Ditto. * config/riscv/vector-iterators.md (ashiftrt): Remove. (ASHIFTRT): Ditto. * config/riscv/vector.md: Add VLS modes. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/vls/avg-1.c: Adapt test. * gcc.target/riscv/rvv/autovec/vls/avg-2.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/avg-3.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/avg-4.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/avg-5.c: Ditto. * gcc.target/riscv/rvv/autovec/vls/avg-6.c: Ditto. * gcc.target/riscv/rvv/autovec/widen/vec-avg-rv32gcv.c: Ditto. * gcc.target/riscv/rvv/autovec/widen/vec-avg-rv64gcv.c: Ditto. 
--- gcc/config/riscv/autovec.md | 50 ++- gcc/config/riscv/riscv-protos.h | 8 +++ gcc/config/riscv/riscv-v.cc | 11 gcc/config/riscv/vector-iterators.md | 5 -- gcc/config/riscv/vector.md| 12 ++--- .../gcc.target/riscv/rvv/autovec/vls/avg-1.c | 4 +- .../gcc.target/riscv/rvv/autovec/vls/avg-2.c | 4 +- .../gcc.target/riscv/rvv/autovec/vls/avg-3.c | 4 +- .../gcc.target/riscv/rvv/autovec/vls/avg-4.c | 6 +-- .../gcc.target/riscv/rvv/autovec/vls/avg-5.c | 6 +-- .../gcc.target/riscv/rvv/autovec/vls/avg-6.c | 6 +-- .../riscv/rvv/autovec/widen/vec-avg-rv32gcv.c | 7 +-- .../riscv/rvv/autovec/widen/vec-avg-rv64gcv.c | 7 +-- 13 files changed, 86 insertions(+), 44 deletions(-) diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md index 775eaa825b0..706cd9717cb 100644 --- a/gcc/config/riscv/autovec.md +++ b/gcc/config/riscv/autovec.md @@ -2345,39 +2345,39 @@ ;; op[0] = (narrow) ((wide) op[1] + (wide) op[2] + 1)) >> 1; ;; - -(define_expand "avg3_floor" +(define_expand "avg3_floor" [(set (match_operand: 0 "register_operand") (truncate: -(:VWEXTI +(ashiftrt:VWEXTI (plus:VWEXTI - (any_extend:VWEXTI + (sign_extend:VWEXTI (match_operand: 1 "register_operand")) - (any_extend:VWEXTI + (sign_extend:VWEXTI (match_operand: 2 "register_operand"))] "TARGET_VECTOR" { /* First emit a widening addition. */ rtx tmp1 = gen_reg_rtx (mode); rtx ops1[] = {tmp1, operands[1], operands[2]}; - insn_code icode = code_for_pred_dual_widen (PLUS, , mode); + insn_code icode = code_for_pred_dual_widen (PLUS, SIGN_EXTEND, mode); riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops1); /* Then a narrowing shift. 
*/ rtx ops2[] = {operands[0], tmp1, const1_rtx}; - icode = code_for_pred_narrow_scalar (, mode); + icode = code_for_pred_narrow_scalar (ASHIFTRT, mode); riscv_vector::emit_vlmax_insn (icode, riscv_vector::BINARY_OP, ops2); DONE; }) -(define_expand "avg3_ceil" +(define_expand "avg3_ceil" [(set (match_operand: 0 "register_operand") (truncate: -(:VWEXTI +(ashiftrt:VWEXTI (plus:VWEXTI (plus:VWEXTI - (any_extend:VWEXTI + (sign_extend:VWEXTI (match_operand: 1 "register_operand")) - (any_extend:VWEXTI + (sign_extend:VWEXTI (match_operand: 2 "register_operand"))) (const_int 1)] "TARGET_VECTOR" @@ -2385,7 +2385,7 @@ /* First emit a widening addition. */ rtx tmp1 = gen_reg_rtx (mode); rtx ops1[] = {tmp1, operands[1], operands[2]}; - insn_code icode = code_for
[PATCH V2] RISC-V: Minor tweak dynamic cost model
v2 update: Robostify tests. While working on cost model, I notice one case that dynamic lmul cost doesn't work well. Before this patch: foo: lui a4,%hi(.LANCHOR0) li a0,1953 li a1,63 addia4,a4,%lo(.LANCHOR0) li a3,64 vsetvli a2,zero,e32,mf2,ta,ma vmv.v.x v5,a0 vmv.v.x v4,a1 vid.v v3 .L2: vsetvli a5,a3,e32,mf2,ta,ma vadd.vi v2,v3,1 vadd.vv v1,v3,v5 mv a2,a5 vmacc.vvv1,v2,v4 sllia1,a5,2 vse32.v v1,0(a4) sub a3,a3,a5 add a4,a4,a1 vsetvli a5,zero,e32,mf2,ta,ma vmv.v.x v1,a2 vadd.vv v3,v3,v1 bne a3,zero,.L2 li a0,0 ret Unexpected: Use scalable vector and LMUL = MF2 which is wasting computation resources. Ideally, we should use LMUL = M8 VLS modes. The root cause is the dynamic LMUL heuristic dominates the VLS heuristic. Adapt the cost model heuristic. After this patch: foo: lui a4,%hi(.LANCHOR0) addia4,a4,%lo(.LANCHOR0) li a3,4096 li a5,32 li a1,2016 addia2,a4,128 addiw a3,a3,-32 vsetvli zero,a5,e32,m8,ta,ma li a0,0 vid.v v8 vsll.vi v8,v8,6 vadd.vx v16,v8,a1 vadd.vx v8,v8,a3 vse32.v v16,0(a4) vse32.v v8,0(a2) ret Tested on both RV32/RV64 no regression. Ok for trunk ? gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (costs::better_main_loop_than_p): Minior tweak. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c: Fix test. * gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c: Ditto. 
--- gcc/config/riscv/riscv-vector-costs.cc | 3 ++- .../gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c| 5 ++--- .../gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c| 5 ++--- .../gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c| 7 +++ 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index f4a1a789f23..e53f4a186f3 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -994,7 +994,8 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const vect_vf_for_cost (other_loop_vinfo)); /* Apply the unrolling heuristic described above m_unrolled_vls_niters. */ - if (bool (m_unrolled_vls_stmts) != bool (other->m_unrolled_vls_stmts)) + if (bool (m_unrolled_vls_stmts) != bool (other->m_unrolled_vls_stmts) + && m_cost_type != other->m_cost_type) { bool this_prefer_unrolled = this->prefer_unrolled_loop (); bool other_prefer_unrolled = other->prefer_unrolled_loop (); diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c index 3ddffa37fe4..89a6c678960 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c @@ -3,7 +3,7 @@ #include -#define N 40 +#define N 48 int a[N]; @@ -22,7 +22,6 @@ foo (){ return 0; } -/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */ /* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*16,\s*e32,\s*m4,\s*t[au],\s*m[au]} 1 } } */ -/* { dg-final { scan-assembler-times {vsetivli} 2 } } */ +/* { dg-final { scan-assembler-times {vsetivli} 1 } } */ /* { dg-final { scan-assembler-not {vsetvli} } } */ diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c index 7625ec5c4b1..86732ef2ce5 100644 --- 
a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c @@ -3,7 +3,7 @@ #include -#define N 40 +#define N 64 int a[N]; @@ -22,7 +22,6 @@ foo (){ return 0; } -/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */ /* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m8,\s*t[au],\s*m[au]} 1 } } */ -/* { dg-final { scan-assembler-times {vsetivli} 1 } } */ +/* { dg-final { scan-assembler-not {vsetivli} } } */ /* { dg-final { scan-assembler-times {vsetvli} 1 } } */ diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c index 7625ec5c4b1..a1fcb3f3443 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c @@ -1,9 +1,9 @@ /* { dg-do compile } */ -/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param
[PATCH] RISC-V: Minor tweak dynamic cost model
While working on cost model, I notice one case that dynamic lmul cost doesn't work well. Before this patch: foo: lui a4,%hi(.LANCHOR0) li a0,1953 li a1,63 addia4,a4,%lo(.LANCHOR0) li a3,64 vsetvli a2,zero,e32,mf2,ta,ma vmv.v.x v5,a0 vmv.v.x v4,a1 vid.v v3 .L2: vsetvli a5,a3,e32,mf2,ta,ma vadd.vi v2,v3,1 vadd.vv v1,v3,v5 mv a2,a5 vmacc.vvv1,v2,v4 sllia1,a5,2 vse32.v v1,0(a4) sub a3,a3,a5 add a4,a4,a1 vsetvli a5,zero,e32,mf2,ta,ma vmv.v.x v1,a2 vadd.vv v3,v3,v1 bne a3,zero,.L2 li a0,0 ret Unexpected: Use scalable vector and LMUL = MF2 which is wasting computation resources. Ideally, we should use LMUL = M8 VLS modes. The root cause is the dynamic LMUL heuristic dominates the VLS heuristic. Adapt the cost model heuristic. After this patch: foo: lui a4,%hi(.LANCHOR0) addia4,a4,%lo(.LANCHOR0) li a3,4096 li a5,32 li a1,2016 addia2,a4,128 addiw a3,a3,-32 vsetvli zero,a5,e32,m8,ta,ma li a0,0 vid.v v8 vsll.vi v8,v8,6 vadd.vx v16,v8,a1 vadd.vx v8,v8,a3 vse32.v v16,0(a4) vse32.v v8,0(a2) ret Tested on both RV32/RV64 no regression. Ok for trunk ? gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (costs::better_main_loop_than_p): Minior tweak. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c: Fix test. * gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c: Ditto. 
--- gcc/config/riscv/riscv-vector-costs.cc | 3 ++- .../gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c | 5 ++--- .../gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c | 5 ++--- .../gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index f4a1a789f23..e53f4a186f3 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -994,7 +994,8 @@ costs::better_main_loop_than_p (const vector_costs *uncast_other) const vect_vf_for_cost (other_loop_vinfo)); /* Apply the unrolling heuristic described above m_unrolled_vls_niters. */ - if (bool (m_unrolled_vls_stmts) != bool (other->m_unrolled_vls_stmts)) + if (bool (m_unrolled_vls_stmts) != bool (other->m_unrolled_vls_stmts) + && m_cost_type != other->m_cost_type) { bool this_prefer_unrolled = this->prefer_unrolled_loop (); bool other_prefer_unrolled = other->prefer_unrolled_loop (); diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c index 3ddffa37fe4..89a6c678960 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c @@ -3,7 +3,7 @@ #include -#define N 40 +#define N 48 int a[N]; @@ -22,7 +22,6 @@ foo (){ return 0; } -/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */ /* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*16,\s*e32,\s*m4,\s*t[au],\s*m[au]} 1 } } */ -/* { dg-final { scan-assembler-times {vsetivli} 2 } } */ +/* { dg-final { scan-assembler-times {vsetivli} 1 } } */ /* { dg-final { scan-assembler-not {vsetvli} } } */ diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c index 7625ec5c4b1..86732ef2ce5 100644 --- 
a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c @@ -3,7 +3,7 @@ #include -#define N 40 +#define N 64 int a[N]; @@ -22,7 +22,6 @@ foo (){ return 0; } -/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*8,\s*e32,\s*m2,\s*t[au],\s*m[au]} 1 } } */ /* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*[a-x0-9]+,\s*e32,\s*m8,\s*t[au],\s*m[au]} 1 } } */ -/* { dg-final { scan-assembler-times {vsetivli} 1 } } */ +/* { dg-final { scan-assembler-not {vsetivli} } } */ /* { dg-final { scan-assembler-times {vsetvli} 1 } } */ diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c index 7625ec5c4b1..505c4cd2c40 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -fn
[Committed] RISC-V: Robustify dynamic lmul test
While working on refining the cost model, I notice this test will generate unexpected scalar xor instructions if we don't tune cost model carefully. Add more assembler to avoid future regression. Committed. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c: Add assembler-not check. --- gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c | 1 + 1 file changed, 1 insertion(+) diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c index 87e963edc47..38cbefbe625 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c @@ -22,3 +22,4 @@ x264_pixel_8x8 (unsigned char *pix1, unsigned char *pix2, int i_stride_pix2) } /* { dg-final { scan-assembler {e32,m2} } } */ +/* { dg-final { scan-assembler-not {xor} } } */ -- 2.36.3
[Committed] RISC-V: Fix comments of segment load/store intrinsic
We have supported segment load/store intrinsics. Committed as it is obvious. gcc/ChangeLog: * config/riscv/riscv-vector-builtins-functions.def (vleff): Move comments. (vundefined): Ditto. --- gcc/config/riscv/riscv-vector-builtins-functions.def | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gcc/config/riscv/riscv-vector-builtins-functions.def b/gcc/config/riscv/riscv-vector-builtins-functions.def index 96dd0d95dec..f742c98be8a 100644 --- a/gcc/config/riscv/riscv-vector-builtins-functions.def +++ b/gcc/config/riscv/riscv-vector-builtins-functions.def @@ -79,8 +79,6 @@ DEF_RVV_FUNCTION (vsoxei64, indexed_loadstore, none_m_preds, all_v_scalar_ptr_ee // 7.7. Unit-stride Fault-Only-First Loads DEF_RVV_FUNCTION (vleff, fault_load, full_preds, all_v_scalar_const_ptr_size_ptr_ops) -// TODO: 7.8. Vector Load/Store Segment Instructions - /* 11. Vector Integer Arithmetic Instructions. */ // 11.1. Vector Single-Width Integer Add and Subtract @@ -630,6 +628,8 @@ DEF_RVV_FUNCTION (vset, vset, none_preds, all_v_vset_tuple_ops) DEF_RVV_FUNCTION (vget, vget, none_preds, all_v_vget_tuple_ops) DEF_RVV_FUNCTION (vcreate, vcreate, none_preds, all_v_vcreate_tuple_ops) DEF_RVV_FUNCTION (vundefined, vundefined, none_preds, all_none_void_tuple_ops) + +// 7.8. Vector Load/Store Segment Instructions DEF_RVV_FUNCTION (vlseg, seg_loadstore, full_preds, tuple_v_scalar_const_ptr_ops) DEF_RVV_FUNCTION (vsseg, seg_loadstore, none_m_preds, tuple_v_scalar_ptr_ops) DEF_RVV_FUNCTION (vlsseg, seg_loadstore, full_preds, tuple_v_scalar_const_ptr_ptrdiff_ops) -- 2.36.3
[Committed] RISC-V: Fix comments of segment load/store intrinsic[NFC]
We have supported segment load/store intrinsics. Committed as it is obvious. gcc/ChangeLog: * config/riscv/riscv-vector-builtins-functions.def (vleff): Move comments to real place. (vcreate): Ditto. --- gcc/config/riscv/riscv-vector-builtins-functions.def | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/gcc/config/riscv/riscv-vector-builtins-functions.def b/gcc/config/riscv/riscv-vector-builtins-functions.def index 96dd0d95dec..14560923d11 100644 --- a/gcc/config/riscv/riscv-vector-builtins-functions.def +++ b/gcc/config/riscv/riscv-vector-builtins-functions.def @@ -79,8 +79,6 @@ DEF_RVV_FUNCTION (vsoxei64, indexed_loadstore, none_m_preds, all_v_scalar_ptr_ee // 7.7. Unit-stride Fault-Only-First Loads DEF_RVV_FUNCTION (vleff, fault_load, full_preds, all_v_scalar_const_ptr_size_ptr_ops) -// TODO: 7.8. Vector Load/Store Segment Instructions - /* 11. Vector Integer Arithmetic Instructions. */ // 11.1. Vector Single-Width Integer Add and Subtract @@ -625,7 +623,7 @@ DEF_RVV_FUNCTION (vcreate, vcreate, none_preds, all_v_vcreate_lmul2_x2_ops) DEF_RVV_FUNCTION (vcreate, vcreate, none_preds, all_v_vcreate_lmul2_x4_ops) DEF_RVV_FUNCTION (vcreate, vcreate, none_preds, all_v_vcreate_lmul4_x2_ops) -// Tuple types +// 7.8. Vector Load/Store Segment Instructions DEF_RVV_FUNCTION (vset, vset, none_preds, all_v_vset_tuple_ops) DEF_RVV_FUNCTION (vget, vget, none_preds, all_v_vget_tuple_ops) DEF_RVV_FUNCTION (vcreate, vcreate, none_preds, all_v_vcreate_tuple_ops) -- 2.36.3
[PATCH] RISC-V: Fix loop invariant check
As Robin suggested, remove gimple_uid check which is sufficient for our need. Tested on both RV32/RV64 no regression, ok for trunk ? gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (loop_invariant_op_p): Fix loop invariant check. --- gcc/config/riscv/riscv-vector-costs.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 3bae581d6fd..f4a1a789f23 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -241,7 +241,7 @@ loop_invariant_op_p (class loop *loop, if (SSA_NAME_IS_DEFAULT_DEF (op) || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (op return true; - return gimple_uid (SSA_NAME_DEF_STMT (op)) & 1; + return false; } /* Return true if the variable should be counted into liveness. */ -- 2.36.3
[Committed] RISC-V: Use MAX instead of std::max [VSETVL PASS]
Obvious fix, Committed. gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc: replace std::max by MAX. --- gcc/config/riscv/riscv-vsetvl.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 7d748edc0ef..df7ed149388 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -1668,7 +1668,7 @@ private: } inline void use_max_sew (vsetvl_info &prev, const vsetvl_info &next) { -auto max_sew = std::max (prev.get_sew (), next.get_sew ()); +int max_sew = MAX (prev.get_sew (), next.get_sew ()); prev.set_sew (max_sew); use_min_of_max_sew (prev, next); } @@ -1702,7 +1702,7 @@ private: inline void use_max_sew_and_lmul_with_prev_ratio (vsetvl_info &prev, const vsetvl_info &next) { -auto max_sew = std::max (prev.get_sew (), next.get_sew ()); +int max_sew = MAX (prev.get_sew (), next.get_sew ()); prev.set_vlmul (calculate_vlmul (max_sew, prev.get_ratio ())); prev.set_sew (max_sew); } -- 2.36.3
[Committed] RISC-V: Update MAX_SEW for available vsetvl info [VSETVL PASS]
This patch fixes a bug of VSETVL PASS in this following situation: Ignore curr info since prev info available with it: prev_info: VALID (insn 8, bb 2) Demand fields: demand_ratio_and_ge_sew demand_avl SEW=16, VLMUL=mf4, RATIO=64, MAX_SEW=64 TAIL_POLICY=agnostic, MASK_POLICY=agnostic AVL=(const_int 1 [0x1]) VL=(nil) curr_info: VALID (insn 12, bb 2) Demand fields: demand_ge_sew demand_non_zero_avl SEW=16, VLMUL=m1, RATIO=16, MAX_SEW=32 TAIL_POLICY=agnostic, MASK_POLICY=agnostic AVL=(const_int 1 [0x1]) VL=(nil) We should update prev_info MAX_SEW from 64 into 32. Before this patch: foo: vsetivlizero,1,e64,m1,ta,ma vle64.v v1,0(a1) vmv.s.x v3,a0 vfmv.s.fv2,fa0 vadd.vv v1,v1,v1 ret After this patch: foo: vsetivlizero,1,e16,mf4,ta,ma vle64.v v1,0(a1) vmv.s.x v3,a0 vfmv.s.fv2,fa0 vsetvli zero,zero,e64,m1,ta,ma vadd.vv v1,v1,v1 ret Tested on both RV32 and RV64 no regression. Committed. PR target/113248 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pre_vsetvl::fuse_local_vsetvl_info): gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/pr113248.c: New test. --- gcc/config/riscv/riscv-vsetvl.cc| 17 + .../gcc.target/riscv/rvv/vsetvl/pr113248.c | 15 +++ 2 files changed, 32 insertions(+) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113248.c diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 3a2ea9ad44a..7d748edc0ef 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -2876,6 +2876,23 @@ pre_vsetvl::fuse_local_vsetvl_info () curr_info.dump (dump_file, ""); fprintf (dump_file, "\n"); } + /* Even though prev_info is available with curr_info, +we need to update the MAX_SEW of prev_info since +we don't check MAX_SEW in available_p check. 
+ +prev_info: +Demand fields: demand_ratio_and_ge_sew demand_avl +SEW=16, VLMUL=mf4, RATIO=64, MAX_SEW=64 + +curr_info: +Demand fields: demand_ge_sew demand_non_zero_avl +SEW=16, VLMUL=m1, RATIO=16, MAX_SEW=32 + +In the example above, prev_info is available with +curr_info, we need to update prev_info MAX_SEW from +64 into 32. */ + prev_info.set_max_sew ( + MIN (prev_info.get_max_sew (), curr_info.get_max_sew ())); if (!curr_info.vl_used_by_non_rvv_insn_p () && vsetvl_insn_p (curr_info.get_insn ()->rtl ())) m_delete_list.safe_push (curr_info); diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113248.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113248.c new file mode 100644 index 000..b3b506177df --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr113248.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-mtune=generic-ooo --param=riscv-autovec-preference=scalable -march=rv32gc_zve64f_zvfh -mabi=ilp32d -O3" } */ + +#include "riscv_vector.h" + +void foo(_Float16 y, int64_t *i64p) +{ + vint64m1_t vx =__riscv_vle64_v_i64m1 (i64p, 1); + vx = __riscv_vadd_vv_i64m1 (vx, vx, 1); + vfloat16m1_t vy =__riscv_vfmv_s_f_f16m1 (y, 1); + asm volatile ("# use %0 %1" : : "vr"(vx), "vr" (vy)); +} + +/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*1,\s*e16,\s*mf4,\s*t[au],\s*m[au]} 1 } } */ +/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*zero,\s*e64,\s*m1,\s*t[au],\s*m[au]} 1 } } */ -- 2.36.3
[Committed V2] RISC-V: Teach liveness computation loop invariant shift amount
1). We not only have vashl_optab,vashr_optab,vlshr_optab which vectorize shift with vector shift amount, that is, vectorization of 'a[i] >> x[i]', the shift amount is loop variant. 2). But also, we have ashl_optab, ashr_optab, lshr_optab which can vectorize shift with scalar shift amount, that is, vectorization of 'a[i] >> x', the shift amount is loop invariant. For the 2) case, we don't need to allocate a vector register group for shift amount. So consider this following case: void f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int x, int n) { for (int i = 0; i < n; i++) { int tmp = b[i] >> x; int tmp2 = tmp * b[i]; c[i] = tmp2 * b[i]; d[i] = tmp * tmp2 * b[i] >> x; } } Before this patch, we choose LMUL = 4, now after this patch, we can choose LMUL = 8: f: ble a5,zero,.L5 .L3: vsetvli a0,a5,e32,m8,ta,ma sllia6,a0,2 vle32.v v16,0(a1) vsra.vx v24,v16,a4 vmul.vv v8,v24,v16 vmul.vv v0,v8,v16 vse32.v v0,0(a2) vmul.vv v8,v8,v24 vmul.vv v8,v8,v16 vsra.vx v8,v8,a4 vse32.v v8,0(a3) add a1,a1,a6 add a2,a2,a6 add a3,a3,a6 sub a5,a5,a0 bne a5,zero,.L3 .L5: ret Tested on both RV32/RV64 no regression. Ok for trunk ? Note that we will apply same heuristic for vadd.vx, ... etc when the late-combine pass from Richard Sandiford is committed (Since we need late combine pass to do vv->vx transformation for vadd). gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (loop_invariant_op_p): New function. (variable_vectorized_p): Teach loop invariant. (has_unexpected_spills_p): Ditto. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-12.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-14.c: New test. 
--- gcc/config/riscv/riscv-vector-costs.cc| 31 +++-- .../costmodel/riscv/rvv/dynamic-lmul4-12.c| 40 .../costmodel/riscv/rvv/dynamic-lmul8-14.c| 64 +++ 3 files changed, 131 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-12.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-14.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index ec8156fbaf8..3bae581d6fd 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -230,9 +230,24 @@ get_biggest_mode (machine_mode mode1, machine_mode mode2) return mode1_size >= mode2_size ? mode1 : mode2; } +/* Return true if OP is invariant. */ + +static bool +loop_invariant_op_p (class loop *loop, +tree op) +{ + if (is_gimple_constant (op)) +return true; + if (SSA_NAME_IS_DEFAULT_DEF (op) + || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (op +return true; + return gimple_uid (SSA_NAME_DEF_STMT (op)) & 1; +} + /* Return true if the variable should be counted into liveness. */ static bool -variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p) +variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info, tree var, + bool lhs_p) { if (!var) return false; @@ -275,6 +290,10 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p) || !tree_fits_shwi_p (var) || !IN_RANGE (tree_to_shwi (var), -16, 15) || gimple_assign_rhs1 (stmt) != var; + case LSHIFT_EXPR: + case RSHIFT_EXPR: + return gimple_assign_rhs2 (stmt) != var +|| !loop_invariant_op_p (loop, var); default: break; } @@ -312,10 +331,12 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p) The live range of SSA 2 is [0, 4] in bb 3. 
*/ static machine_mode compute_local_live_ranges ( + loop_vec_info loop_vinfo, const hash_map> &program_points_per_bb, hash_map> &live_ranges_per_bb) { machine_mode biggest_mode = QImode; + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); if (!program_points_per_bb.is_empty ()) { auto_vec visited_vars; @@ -339,7 +360,8 @@ compute_local_live_ranges ( unsigned int point = program_point.point; gimple *stmt = program_point.stmt; tree lhs = gimple_get_lhs (stmt); - if (variable_vectorized_p (program_point.stmt_info, lhs, true)) + if (variable_vectorized_p (loop, program_point.stmt_info, lhs, +true)) { biggest_mode = get_biggest_mode (biggest_mode, TYPE_MODE (TREE_TYPE (lhs))); @@ -356,7 +378,7 @@ compute_local_live_ranges ( for (i = 0; i < gimple_num_args (stmt); i++) {
[Committed V2] RISC-V: Allow simplification of non-vlmax with len = NUNITS reg to reg move
V2: Address comments from Robin. While working on fixing a bug, I notice this following code has redundant move: #include "riscv_vector.h" void f (float x, float y, void *out) { float f[4] = { x, x, x, y }; vfloat32m1_t v = __riscv_vle32_v_f32m1 (f, 4); __riscv_vse32_v_f32m1 (out, v, 4); } Before this patch: f: vsetivlizero,4,e32,m1,ta,ma addisp,sp,-16 vfmv.v.fv1,fa0 vfslide1down.vf v1,v1,fa1 vmv.v.v v1,v1 > redundant move. vse32.v v1,0(a0) addisp,sp,16 jr ra The rootcause is that the complicate vmv.v.v pattern doesn't simplify it into simple (set (reg) (reg)) reg-to-reg move pattern. Currently, we support such simplification for VLMAX. However, the case I found is non-VLMAX but with LEN = NUNITS which should be considered as equivalent to VLMAX. Add a simple fix for such situation. Tested on both RV32/RV64 no regressions. gcc/ChangeLog: * config/riscv/riscv-protos.h (whole_reg_to_reg_move_p): New function. * config/riscv/riscv-v.cc (whole_reg_to_reg_move_p): Ditto. * config/riscv/vector.md: Allow non-vlmax with len = NUNITS simplification. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/base/vf_avl-4.c: New test. 
--- gcc/config/riscv/riscv-protos.h | 1 + gcc/config/riscv/riscv-v.cc | 23 +++ gcc/config/riscv/vector.md| 9 ++-- .../gcc.target/riscv/rvv/base/vf_avl-4.c | 13 +++ 4 files changed, 39 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-4.c diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 0f0337cfb38..00a5b645abe 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -687,6 +687,7 @@ bool imm_avl_p (machine_mode); bool can_be_broadcasted_p (rtx); bool gather_scatter_valid_offset_p (machine_mode); HOST_WIDE_INT estimated_poly_value (poly_int64, unsigned int); +bool whole_reg_to_reg_move_p (rtx *, machine_mode, int); } /* We classify builtin types into two classes: diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index ec859645415..2491522191a 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -5117,4 +5117,27 @@ estimated_poly_value (poly_int64 val, unsigned int kind) return val.coeffs[0] + val.coeffs[1] * over_min_vlen / TARGET_MIN_VLEN; } +/* Return true it is whole register-register move. */ +bool +whole_reg_to_reg_move_p (rtx *ops, machine_mode mode, int avl_type_index) +{ + /* An operation is a whole-register move if either + (1) Its vlmax operand equals VLMAX + (2) Its vl operand equals the number of units of its mode. */ + if (register_operand (ops[0], mode) + && register_operand (ops[3], mode) + && satisfies_constraint_vu (ops[2]) + && satisfies_constraint_Wc1 (ops[1])) +{ + if (INTVAL (ops[avl_type_index]) == VLMAX) + return true; + /* AVL propagation PASS will transform FIXED-VLMAX with NUNITS < 32 +into NON-VLMAX with LEN = NUNITS. 
*/ + else if (CONST_INT_P (ops[4]) + && known_eq (INTVAL (ops[4]), GET_MODE_NUNITS (mode))) + return true; +} + return false; +} + } // namespace riscv_vector diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md index 3d2c1c3ce8f..be5beb5ab64 100644 --- a/gcc/config/riscv/vector.md +++ b/gcc/config/riscv/vector.md @@ -1724,10 +1724,7 @@ vse.v\t%3,%0%p1 vmv.v.v\t%0,%3 vmv.v.v\t%0,%3" - "&& register_operand (operands[0], mode) - && register_operand (operands[3], mode) - && satisfies_constraint_vu (operands[2]) - && INTVAL (operands[7]) == riscv_vector::VLMAX" + "&& riscv_vector::whole_reg_to_reg_move_p (operands, mode, 7)" [(set (match_dup 0) (match_dup 3))] "" [(set_attr "type" "vlde,vlde,vlde,vste,vimov,vimov") @@ -1776,9 +1773,7 @@ vmmv.m\t%0,%3 vmclr.m\t%0 vmset.m\t%0" - "&& register_operand (operands[0], mode) - && register_operand (operands[3], mode) - && INTVAL (operands[5]) == riscv_vector::VLMAX" + "&& riscv_vector::whole_reg_to_reg_move_p (operands, mode, 5)" [(set (match_dup 0) (match_dup 3))] "" [(set_attr "type" "vldm,vstm,vmalu,vmalu,vmalu") diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-4.c b/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-4.c new file mode 100644 index 000..1b4bfd96481 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-4.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=fixed-vlmax" } */ + +#include "riscv_vector.h" +void +f (float x, float y, void *out) +{ + float f[4] = { x, x, x, y }; + vfloat32m1_t v = __riscv_vle32_v_f32m1 (f, 4); + __riscv_vse32_v_f32m1 (out, v, 4); +} + +/* { dg-final { scan-assembler-not {vmv} } } */ --
[PATCH] RISC-V: Allow simplification of non-vlmax with len = NUNITS reg to reg move
While working on fixing a bug, I notice this following code has redundant move: #include "riscv_vector.h" void f (float x, float y, void *out) { float f[4] = { x, x, x, y }; vfloat32m1_t v = __riscv_vle32_v_f32m1 (f, 4); __riscv_vse32_v_f32m1 (out, v, 4); } Before this patch: f: vsetivlizero,4,e32,m1,ta,ma addisp,sp,-16 vfmv.v.fv1,fa0 vfslide1down.vf v1,v1,fa1 vmv.v.v v1,v1 > redundant move. vse32.v v1,0(a0) addisp,sp,16 jr ra The rootcause is that the complicate vmv.v.v pattern doesn't simplify it into simple (set (reg) (reg)) reg-to-reg move pattern. Currently, we support such simplification for VLMAX. However, the case I found is non-VLMAX but with LEN = NUNITS which should be considered as equivalent to VLMAX. Add a simple fix for such situation. Tested on both RV32/RV64 no regressions. Ok for trunk ? gcc/ChangeLog: * config/riscv/riscv-protos.h (whole_reg_to_reg_move_p): New function. * config/riscv/riscv-v.cc (whole_reg_to_reg_move_p): Ditto. * config/riscv/vector.md: Allow non-vlmax with len = NUNITS simplification. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/base/vf_avl-4.c: New test. 
--- gcc/config/riscv/riscv-protos.h | 1 + gcc/config/riscv/riscv-v.cc | 21 +++ gcc/config/riscv/vector.md| 9 ++-- .../gcc.target/riscv/rvv/base/vf_avl-4.c | 13 4 files changed, 37 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-4.c diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h index 0f0337cfb38..064e8f443f3 100644 --- a/gcc/config/riscv/riscv-protos.h +++ b/gcc/config/riscv/riscv-protos.h @@ -687,6 +687,7 @@ bool imm_avl_p (machine_mode); bool can_be_broadcasted_p (rtx); bool gather_scatter_valid_offset_p (machine_mode); HOST_WIDE_INT estimated_poly_value (poly_int64, unsigned int); +bool whole_reg_to_reg_move_p (rtx *, machine_mode); } /* We classify builtin types into two classes: diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index b7727b2b3e6..e5ba28d9078 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -5122,4 +5122,25 @@ estimated_poly_value (poly_int64 val, unsigned int kind) return val.coeffs[0] + val.coeffs[1] * over_min_vlen / TARGET_MIN_VLEN; } +/* Return true it is whole register-register move. */ +bool +whole_reg_to_reg_move_p (rtx *ops, machine_mode mode) +{ + if (register_operand (ops[0], mode) + && register_operand (ops[3], mode) + && satisfies_constraint_vu (ops[2]) + && satisfies_constraint_Wc1 (ops[1])) +{ + int vlmax_index = GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL ? 5 : 7; + if (INTVAL (ops[vlmax_index]) == VLMAX) + return true; + /* AVL propagation PASS will transform FIXED-VLMAX with NUNITS < 32 +into NON-VLMAX with LEN = NUNITS. 
*/ + else if (CONST_INT_P (ops[4]) + && known_eq (INTVAL (ops[4]), GET_MODE_NUNITS (mode))) + return true; +} + return false; +} + } // namespace riscv_vector diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md index 3d2c1c3ce8f..abd293f310c 100644 --- a/gcc/config/riscv/vector.md +++ b/gcc/config/riscv/vector.md @@ -1724,10 +1724,7 @@ vse.v\t%3,%0%p1 vmv.v.v\t%0,%3 vmv.v.v\t%0,%3" - "&& register_operand (operands[0], mode) - && register_operand (operands[3], mode) - && satisfies_constraint_vu (operands[2]) - && INTVAL (operands[7]) == riscv_vector::VLMAX" + "&& riscv_vector::whole_reg_to_reg_move_p (operands, mode)" [(set (match_dup 0) (match_dup 3))] "" [(set_attr "type" "vlde,vlde,vlde,vste,vimov,vimov") @@ -1776,9 +1773,7 @@ vmmv.m\t%0,%3 vmclr.m\t%0 vmset.m\t%0" - "&& register_operand (operands[0], mode) - && register_operand (operands[3], mode) - && INTVAL (operands[5]) == riscv_vector::VLMAX" + "&& riscv_vector::whole_reg_to_reg_move_p (operands, mode)" [(set (match_dup 0) (match_dup 3))] "" [(set_attr "type" "vldm,vstm,vmalu,vmalu,vmalu") diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-4.c b/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-4.c new file mode 100644 index 000..1b4bfd96481 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-4.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=fixed-vlmax" } */ + +#include "riscv_vector.h" +void +f (float x, float y, void *out) +{ + float f[4] = { x, x, x, y }; + vfloat32m1_t v = __riscv_vle32_v_f32m1 (f, 4); + __riscv_vse32_v_f32m1 (out, v, 4); +} + +/* { dg-final { scan-assembler-not {vmv} } } */ -- 2.36.3
[PATCH] RISC-V: Teach liveness computation about loop-invariant shift amounts [Dynamic LMUL]
1). We not only have vashl_optab,vashr_optab,vlshr_optab which vectorize shift with vector shift amount, that is, vectorization of 'a[i] >> x[i]', the shift amount is loop variant. 2). But also, we have ashl_optab, ashr_optab, lshr_optab which can vectorize shift with scalar shift amount, that is, vectorization of 'a[i] >> x', the shift amount is loop invariant. For the 2) case, we don't need to allocate a vector register group for shift amount. So consider this following case: void f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int x, int n) { for (int i = 0; i < n; i++) { int tmp = b[i] >> x; int tmp2 = tmp * b[i]; c[i] = tmp2 * b[i]; d[i] = tmp * tmp2 * b[i] >> x; } } Before this patch, we choose LMUL = 4, now after this patch, we can choose LMUL = 8: f: ble a5,zero,.L5 .L3: vsetvli a0,a5,e32,m8,ta,ma sllia6,a0,2 vle32.v v16,0(a1) vsra.vx v24,v16,a4 vmul.vv v8,v24,v16 vmul.vv v0,v8,v16 vse32.v v0,0(a2) vmul.vv v8,v8,v24 vmul.vv v8,v8,v16 vsra.vx v8,v8,a4 vse32.v v8,0(a3) add a1,a1,a6 add a2,a2,a6 add a3,a3,a6 sub a5,a5,a0 bne a5,zero,.L3 .L5: ret Tested on both RV32/RV64 no regression. Ok for trunk ? Note that we will apply same heuristic for vadd.vx, ... etc when the late-combine pass from Richard Sandiford is committed (Since we need late combine pass to do vv->vx transformation for vadd). gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (loop_invariant_op_p): New function. (variable_vectorized_p): Teach loop invariant. (has_unexpected_spills_p): Ditto. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-12.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-14.c: New test. 
--- gcc/config/riscv/riscv-vector-costs.cc| 31 +++-- .../costmodel/riscv/rvv/dynamic-lmul4-12.c| 40 .../costmodel/riscv/rvv/dynamic-lmul8-14.c| 64 +++ 3 files changed, 131 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-12.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-14.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index ec8156fbaf8..00b0b4d64b9 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -230,9 +230,24 @@ get_biggest_mode (machine_mode mode1, machine_mode mode2) return mode1_size >= mode2_size ? mode1 : mode2; } +/* Return true if OP is invariant. */ + +static bool +loop_invariant_op_p (class loop *loop, +tree op) +{ + if (is_gimple_min_invariant (op)) +return true; + if (SSA_NAME_IS_DEFAULT_DEF (op) + || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (op +return true; + return gimple_uid (SSA_NAME_DEF_STMT (op)) & 1; +} + /* Return true if the variable should be counted into liveness. */ static bool -variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p) +variable_vectorized_p (class loop *loop, stmt_vec_info stmt_info, tree var, + bool lhs_p) { if (!var) return false; @@ -275,6 +290,10 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p) || !tree_fits_shwi_p (var) || !IN_RANGE (tree_to_shwi (var), -16, 15) || gimple_assign_rhs1 (stmt) != var; + case LSHIFT_EXPR: + case RSHIFT_EXPR: + return gimple_assign_rhs2 (stmt) != var +|| !loop_invariant_op_p (loop, var); default: break; } @@ -312,10 +331,12 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p) The live range of SSA 2 is [0, 4] in bb 3. 
*/ static machine_mode compute_local_live_ranges ( + loop_vec_info loop_vinfo, const hash_map> &program_points_per_bb, hash_map> &live_ranges_per_bb) { machine_mode biggest_mode = QImode; + class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); if (!program_points_per_bb.is_empty ()) { auto_vec visited_vars; @@ -339,7 +360,8 @@ compute_local_live_ranges ( unsigned int point = program_point.point; gimple *stmt = program_point.stmt; tree lhs = gimple_get_lhs (stmt); - if (variable_vectorized_p (program_point.stmt_info, lhs, true)) + if (variable_vectorized_p (loop, program_point.stmt_info, lhs, +true)) { biggest_mode = get_biggest_mode (biggest_mode, TYPE_MODE (TREE_TYPE (lhs))); @@ -356,7 +378,7 @@ compute_local_live_ranges ( for (i = 0; i < gimple_num_args (stmt); i++)
[Committed V3] RISC-V: Make liveness estimation aware of the .vi variant
Consider this following case: void f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) { for (int i = 0; i < n; i++) { int tmp = b[i] + 15; int tmp2 = tmp + b[i]; c[i] = tmp2 + b[i]; d[i] = tmp + tmp2 + b[i]; } } Current dynamic LMUL cost model choose LMUL = 4 because we count the "15" as consuming 1 vector register group which is not accurate. We teach the dynamic LMUL cost model be aware of the potential vi variant instructions transformation, so that we can choose LMUL = 8 according to more accurate cost model. After this patch: f: ble a4,zero,.L5 .L3: vsetvli a5,a4,e32,m8,ta,ma sllia0,a5,2 vle32.v v16,0(a1) vadd.vi v24,v16,15 vadd.vv v8,v24,v16 vadd.vv v0,v8,v16 vse32.v v0,0(a2) vadd.vv v8,v8,v24 vadd.vv v8,v8,v16 vse32.v v8,0(a3) add a1,a1,a0 add a2,a2,a0 add a3,a3,a0 sub a4,a4,a5 bne a4,zero,.L3 .L5: ret Tested on both RV32 and RV64 no regression. gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (variable_vectorized_p): Teach vi variant. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c: New test. --- gcc/config/riscv/riscv-vector-costs.cc| 32 ++-- .../costmodel/riscv/rvv/dynamic-lmul8-13.c| 74 +++ 2 files changed, 99 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 21f8a81c89c..ec8156fbaf8 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -255,6 +255,31 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p) return false; } } + else if (is_gimple_assign (stmt)) +{ + tree_code tcode = gimple_assign_rhs_code (stmt); + /* vi variant doesn't need to allocate such statement. +E.g. tmp_15 = _4 + 1; will be transformed into vadd.vi +so the INTEGER_CST '1' doesn't need a vector register. 
*/ + switch (tcode) + { + case PLUS_EXPR: + case BIT_IOR_EXPR: + case BIT_XOR_EXPR: + case BIT_AND_EXPR: + return TREE_CODE (var) != INTEGER_CST +|| !tree_fits_shwi_p (var) +|| !IN_RANGE (tree_to_shwi (var), -16, 15); + case MINUS_EXPR: + return TREE_CODE (var) != INTEGER_CST +|| !tree_fits_shwi_p (var) +|| !IN_RANGE (tree_to_shwi (var), -16, 15) +|| gimple_assign_rhs1 (stmt) != var; + default: + break; + } +} + if (lhs_p) return is_gimple_reg (var) && (!POINTER_TYPE_P (TREE_TYPE (var)) @@ -331,13 +356,6 @@ compute_local_live_ranges ( for (i = 0; i < gimple_num_args (stmt); i++) { tree var = gimple_arg (stmt, i); - /* Both IMM and REG are included since a VECTOR_CST may be -potentially held in a vector register. However, it's not -accurate, since a PLUS_EXPR can be vectorized into vadd.vi -if IMM is -16 ~ 15. - -TODO: We may elide the cases that the unnecessary IMM in -the future. */ if (variable_vectorized_p (program_point.stmt_info, var, false)) { diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c new file mode 100644 index 000..baef4e39014 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c @@ -0,0 +1,74 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param riscv-autovec-lmul=dynamic -fdump-tree-vect-details" } */ + +void +f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = b[i] + 15; + int tmp2 = tmp + b[i]; + c[i] = tmp2 + b[i]; + d[i] = tmp + tmp2 + b[i]; +} +} + +void +f2 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = 15 - b[i]; + int tmp2 = tmp * b[i]; + c[i] = tmp2 * b[i]; + d[i] = tmp * tmp2 * b[i]; +} +} + +void +f3 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int 
tmp = b[i] & 15; + int tmp2 = tmp * b[i]; + c[i] = tmp2 * b[i]; + d[i] = tmp * tmp2 * b[i]; +} +} + +void +f4 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = b[i]
[Committed V2] RISC-V: Make liveness estimation aware of the .vi variant
Consider this following case: void f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) { for (int i = 0; i < n; i++) { int tmp = b[i] + 15; int tmp2 = tmp + b[i]; c[i] = tmp2 + b[i]; d[i] = tmp + tmp2 + b[i]; } } Current dynamic LMUL cost model choose LMUL = 4 because we count the "15" as consuming 1 vector register group which is not accurate. We teach the dynamic LMUL cost model be aware of the potential vi variant instructions transformation, so that we can choose LMUL = 8 according to more accurate cost model. After this patch: f: ble a4,zero,.L5 .L3: vsetvli a5,a4,e32,m8,ta,ma sllia0,a5,2 vle32.v v16,0(a1) vadd.vi v24,v16,15 vadd.vv v8,v24,v16 vadd.vv v0,v8,v16 vse32.v v0,0(a2) vadd.vv v8,v8,v24 vadd.vv v8,v8,v16 vse32.v v8,0(a3) add a1,a1,a0 add a2,a2,a0 add a3,a3,a0 sub a4,a4,a5 bne a4,zero,.L3 .L5: ret Tested on both RV32 and RV64 no regression. Ok for trunk ? gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (variable_vectorized_p): Teach vi variant. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c: New test. --- gcc/config/riscv/riscv-vector-costs.cc| 30 ++-- .../costmodel/riscv/rvv/dynamic-lmul8-13.c| 74 +++ 2 files changed, 97 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 21f8a81c89c..e4435032035 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -255,6 +255,29 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p) return false; } } + else if (is_gimple_assign (stmt)) +{ + tree_code tcode = gimple_assign_rhs_code (stmt); + /* vi variant doesn't need to allocate such statement. +E.g. tmp_15 = _4 + 1; will be transformed into vadd.vi +so the INTEGER_CST '1' doesn't need a vector register. 
*/ + switch (tcode) + { + case PLUS_EXPR: + case BIT_IOR_EXPR: + case BIT_XOR_EXPR: + case BIT_AND_EXPR: + return TREE_CODE (var) != INTEGER_CST +|| !IN_RANGE (tree_to_shwi (var), -16, 15); + case MINUS_EXPR: + return TREE_CODE (var) != INTEGER_CST +|| !IN_RANGE (tree_to_shwi (var), -16, 15) +|| gimple_assign_rhs1 (stmt) != var; + default: + break; + } +} + if (lhs_p) return is_gimple_reg (var) && (!POINTER_TYPE_P (TREE_TYPE (var)) @@ -331,13 +354,6 @@ compute_local_live_ranges ( for (i = 0; i < gimple_num_args (stmt); i++) { tree var = gimple_arg (stmt, i); - /* Both IMM and REG are included since a VECTOR_CST may be -potentially held in a vector register. However, it's not -accurate, since a PLUS_EXPR can be vectorized into vadd.vi -if IMM is -16 ~ 15. - -TODO: We may elide the cases that the unnecessary IMM in -the future. */ if (variable_vectorized_p (program_point.stmt_info, var, false)) { diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c new file mode 100644 index 000..baef4e39014 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c @@ -0,0 +1,74 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param riscv-autovec-lmul=dynamic -fdump-tree-vect-details" } */ + +void +f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = b[i] + 15; + int tmp2 = tmp + b[i]; + c[i] = tmp2 + b[i]; + d[i] = tmp + tmp2 + b[i]; +} +} + +void +f2 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = 15 - b[i]; + int tmp2 = tmp * b[i]; + c[i] = tmp2 * b[i]; + d[i] = tmp * tmp2 * b[i]; +} +} + +void +f3 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = b[i] & 15; + int tmp2 = tmp * b[i]; + c[i] = tmp2 * 
b[i]; + d[i] = tmp * tmp2 * b[i]; +} +} + +void +f4 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = b[i] | 15; + int tmp2 = tmp * b[i]; + c[i] = tmp2 * b[i]; + d[i
[PATCH] RISC-V: Teach liveness estimation to be aware of the .vi variant
Consider this following case: void f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) { for (int i = 0; i < n; i++) { int tmp = b[i] + 15; int tmp2 = tmp + b[i]; c[i] = tmp2 + b[i]; d[i] = tmp + tmp2 + b[i]; } } Current dynamic LMUL cost model choose LMUL = 4 because we count the "15" as consuming 1 vector register group which is not accurate. We teach the dynamic LMUL cost model be aware of the potential vi variant instructions transformation, so that we can choose LMUL = 8 according to more accurate cost model. After this patch: f: ble a4,zero,.L5 .L3: vsetvli a5,a4,e32,m8,ta,ma sllia0,a5,2 vle32.v v16,0(a1) vadd.vi v24,v16,15 vadd.vv v8,v24,v16 vadd.vv v0,v8,v16 vse32.v v0,0(a2) vadd.vv v8,v8,v24 vadd.vv v8,v8,v16 vse32.v v8,0(a3) add a1,a1,a0 add a2,a2,a0 add a3,a3,a0 sub a4,a4,a5 bne a4,zero,.L3 .L5: ret Tested on both RV32 and RV64 no regression. Ok for trunk ? gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (variable_vectorized_p): Teach vi variant. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c: New test. --- gcc/config/riscv/riscv-vector-costs.cc| 30 ++-- .../costmodel/riscv/rvv/dynamic-lmul8-13.c| 74 +++ 2 files changed, 97 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 21f8a81c89c..7f083b04edd 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -255,6 +255,29 @@ variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p) return false; } } + else if (is_gimple_assign (stmt)) +{ + tree_code tcode = gimple_assign_rhs_code (stmt); + /* vi variant doesn't need to allocate such statement. +E.g. tmp_15 = _4 + 1; will be transformed into vadd.vi +so the INTEGER_CST '1' doesn't need vector a register. 
*/ + switch (tcode) + { + case PLUS_EXPR: + case BIT_IOR_EXPR: + case BIT_XOR_EXPR: + case BIT_AND_EXPR: + return TREE_CODE (var) != INTEGER_CST +|| !IN_RANGE (tree_to_shwi (var), -16, 15); + case MINUS_EXPR: + return TREE_CODE (var) != INTEGER_CST +|| !IN_RANGE (tree_to_shwi (var), -16, 15) +|| gimple_assign_rhs1 (stmt) != var; + default: + break; + } +} + if (lhs_p) return is_gimple_reg (var) && (!POINTER_TYPE_P (TREE_TYPE (var)) @@ -331,13 +354,6 @@ compute_local_live_ranges ( for (i = 0; i < gimple_num_args (stmt); i++) { tree var = gimple_arg (stmt, i); - /* Both IMM and REG are included since a VECTOR_CST may be -potentially held in a vector register. However, it's not -accurate, since a PLUS_EXPR can be vectorized into vadd.vi -if IMM is -16 ~ 15. - -TODO: We may elide the cases that the unnecessary IMM in -the future. */ if (variable_vectorized_p (program_point.stmt_info, var, false)) { diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c new file mode 100644 index 000..baef4e39014 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-13.c @@ -0,0 +1,74 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param riscv-autovec-lmul=dynamic -fdump-tree-vect-details" } */ + +void +f (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = b[i] + 15; + int tmp2 = tmp + b[i]; + c[i] = tmp2 + b[i]; + d[i] = tmp + tmp2 + b[i]; +} +} + +void +f2 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = 15 - b[i]; + int tmp2 = tmp * b[i]; + c[i] = tmp2 * b[i]; + d[i] = tmp * tmp2 * b[i]; +} +} + +void +f3 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = b[i] & 15; + int tmp2 = tmp * b[i]; + c[i] = tmp2 * 
b[i]; + d[i] = tmp * tmp2 * b[i]; +} +} + +void +f4 (int *restrict a, int *restrict b, int *restrict c, int *restrict d, int n) +{ + for (int i = 0; i < n; i++) +{ + int tmp = b[i] | 15; + int tmp2 = tmp * b[i]; + c[i] = tmp2 * b[i]; + d[i
[Committed] RISC-V: Refine LMUL computation for MASK_LEN_LOAD/MASK_LEN_STORE IFN
Notice a case has "Maximum lmul = 16" which is incorrect. Correct LMUL estimation for MASK_LEN_LOAD/MASK_LEN_STORE. Committed. gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (variable_vectorized_p): New function. (compute_nregs_for_mode): Refine LMUL. (max_number_of_live_regs): Ditto. (compute_estimated_lmul): Ditto. (has_unexpected_spills_p): Ditto. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-11.c: New test. --- gcc/config/riscv/riscv-vector-costs.cc| 66 +++ .../costmodel/riscv/rvv/dynamic-lmul4-11.c| 16 + 2 files changed, 68 insertions(+), 14 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-11.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index b9fdfdc5e3a..21f8a81c89c 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -230,6 +230,42 @@ get_biggest_mode (machine_mode mode1, machine_mode mode2) return mode1_size >= mode2_size ? mode1 : mode2; } +/* Return true if the variable should be counted into liveness. */ +static bool +variable_vectorized_p (stmt_vec_info stmt_info, tree var, bool lhs_p) +{ + if (!var) +return false; + gimple *stmt = STMT_VINFO_STMT (stmt_info); + enum stmt_vec_info_type type += STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info)); + if (is_gimple_call (stmt) && gimple_call_internal_p (stmt)) +{ + if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE + || gimple_call_internal_fn (stmt) == IFN_MASK_LOAD) + { + /* .MASK_LOAD (_5, 32B, _33) + ^^^ +Only the 3rd argument will be vectorized and consume +a vector register. 
*/ + if (TREE_CODE (TREE_TYPE (var)) == BOOLEAN_TYPE + || (is_gimple_reg (var) && !POINTER_TYPE_P (TREE_TYPE (var + return true; + else + return false; + } +} + if (lhs_p) +return is_gimple_reg (var) + && (!POINTER_TYPE_P (TREE_TYPE (var)) + || type != store_vec_info_type); + else +return poly_int_tree_p (var) + || (is_gimple_val (var) + && (!POINTER_TYPE_P (TREE_TYPE (var)) + || type != load_vec_info_type)); +} + /* Compute local live ranges of each vectorized variable. Note that we only compute local live ranges (within a block) since local live ranges information is accurate enough for us to determine @@ -277,13 +313,8 @@ compute_local_live_ranges ( { unsigned int point = program_point.point; gimple *stmt = program_point.stmt; - stmt_vec_info stmt_info = program_point.stmt_info; tree lhs = gimple_get_lhs (stmt); - enum stmt_vec_info_type type - = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info)); - if (lhs != NULL_TREE && is_gimple_reg (lhs) - && (!POINTER_TYPE_P (TREE_TYPE (lhs)) - || type != store_vec_info_type)) + if (variable_vectorized_p (program_point.stmt_info, lhs, true)) { biggest_mode = get_biggest_mode (biggest_mode, TYPE_MODE (TREE_TYPE (lhs))); @@ -307,10 +338,8 @@ compute_local_live_ranges ( TODO: We may elide the cases that the unnecessary IMM in the future. */ - if (poly_int_tree_p (var) - || (is_gimple_val (var) - && (!POINTER_TYPE_P (TREE_TYPE (var)) - || type != load_vec_info_type))) + if (variable_vectorized_p (program_point.stmt_info, var, +false)) { biggest_mode = get_biggest_mode (biggest_mode, @@ -383,7 +412,9 @@ compute_nregs_for_mode (loop_vec_info loop_vinfo, machine_mode mode, unsigned int biggest_size = GET_MODE_SIZE (biggest_mode).to_constant (); gcc_assert (biggest_size >= mode_size); unsigned int ratio = biggest_size / mode_size; - return MAX (lmul / ratio, 1) * rgroup_size; + /* RVV mask bool modes always consume 1 vector register regardless LMUL. */ + unsigned int nregs = mode == BImode ? 
1 : lmul / ratio; + return MAX (nregs, 1) * rgroup_size; } /* This function helps to determine whether current LMUL will cause @@ -414,7 +445,9 @@ max_number_of_live_regs (loop_vec_info loop_vinfo, const basic_block bb, pair live_range = (*iter).second; for (i = live_range.first + 1; i <= live_range.second; i++) { - machine_mode mode = TYPE_MODE (TREE_TYPE (var)); + machine_mode mode = TREE_CODE (TREE_TYPE (var)) == BOOLEAN_TYPE + ? BImode +
[Committed] RISC-V: Fix indent
Fix indent of some codes to make them 8 spaces align. Committed. gcc/ChangeLog: * config/riscv/vector.md: Fix indent. --- gcc/config/riscv/vector.md | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md index 24f91f058ef..1de656a8ced 100644 --- a/gcc/config/riscv/vector.md +++ b/gcc/config/riscv/vector.md @@ -53,8 +53,8 @@ vslideup,vslidedown,vislide1up,vislide1down,vfslide1up,vfslide1down,\ vgather,vcompress,vlsegde,vssegte,vlsegds,vssegts,vlsegdux,vlsegdox,\ vssegtux,vssegtox,vlsegdff,vandn,vbrev,vbrev8,vrev8,vclz,vctz,vrol,\ - vror,vwsll,vclmul,vclmulh,vghsh,vgmul,vaesef,vaesem,vaesdf,vaesdm,\ - vaeskf1,vaeskf2,vaesz,vsha2ms,vsha2ch,vsha2cl,vsm4k,vsm4r,vsm3me,vsm3c") + vror,vwsll,vclmul,vclmulh,vghsh,vgmul,vaesef,vaesem,vaesdf,vaesdm,\ + vaeskf1,vaeskf2,vaesz,vsha2ms,vsha2ch,vsha2cl,vsm4k,vsm4r,vsm3me,vsm3c") (const_string "true")] (const_string "false"))) @@ -77,8 +77,8 @@ vslideup,vslidedown,vislide1up,vislide1down,vfslide1up,vfslide1down,\ vgather,vcompress,vlsegde,vssegte,vlsegds,vssegts,vlsegdux,vlsegdox,\ vssegtux,vssegtox,vlsegdff,vandn,vbrev,vbrev8,vrev8,vclz,vctz,vrol,\ - vror,vwsll,vclmul,vclmulh,vghsh,vgmul,vaesef,vaesem,vaesdf,vaesdm,\ - vaeskf1,vaeskf2,vaesz,vsha2ms,vsha2ch,vsha2cl,vsm4k,vsm4r,vsm3me,vsm3c") + vror,vwsll,vclmul,vclmulh,vghsh,vgmul,vaesef,vaesem,vaesdf,vaesdm,\ + vaeskf1,vaeskf2,vaesz,vsha2ms,vsha2ch,vsha2cl,vsm4k,vsm4r,vsm3me,vsm3c") (const_string "true")] (const_string "false"))) @@ -707,7 +707,7 @@ vired,viwred,vfredu,vfredo,vfwredu,vfwredo,vimovxv,vfmovfv,\ vslideup,vslidedown,vislide1up,vislide1down,vfslide1up,vfslide1down,\ vgather,vldff,viwmuladd,vfwmuladd,vlsegde,vlsegds,vlsegdux,vlsegdox,vlsegdff,\ - vandn,vbrev,vbrev8,vrev8,vrol,vror,vwsll,vclmul,vclmulh") + vandn,vbrev,vbrev8,vrev8,vrol,vror,vwsll,vclmul,vclmulh") (const_int 2) (eq_attr "type" "vimerge,vfmerge,vcompress,vghsh,vgmul,vaesef,vaesem,vaesdf,vaesdm,\ -- 2.36.3
[Committed V3] RISC-V: Fix bug of earliest fusion for infinite loops [VSETVL PASS]
As PR113206 and PR113209, the bugs happens on the following situation: li a4,32 ... vsetvli zero,a4,e8,m8,ta,ma ... slliw a4,a3,24 sraiw a4,a4,24 bge a3,a1,.L8 sb a4,%lo(e)(a0) vsetvli zero,a4,e8,m8,ta,ma --> a4 is polluted value not the expected "32". ... .L7: j .L7 ---> infinite loop. The root cause is that infinite loop confuse earliest computation and let earliest fusion happens on unexpected place. Disable blocks that belong to infinite loop to fix this bug since applying ealiest LCM fusion on infinite loop seems quite complicated and we don't see any benefits. Note that disabling earliest fusion on infinite loops doesn't hurt the vsetvli performance, instead, it does improve codegen of some cases. Tested on both RV32 and RV64 no regression. PR target/113206 PR target/113209 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (invalid_opt_bb_p): New function. (pre_vsetvl::compute_lcm_local_properties): Disable earliest fusion on blocks belong to infinite loop. (pre_vsetvl::emit_vsetvl): Remove fake edges. * config/riscv/t-riscv: Add a new include file. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/avl_single-23.c: Adapt test. * gcc.target/riscv/rvv/vsetvl/vlmax_call-1.c: Robostify test. * gcc.target/riscv/rvv/vsetvl/vlmax_call-2.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_call-3.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_conflict-5.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-1.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-2.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-3.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-4.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-5.c: Ditto. * gcc.target/riscv/rvv/autovec/pr113206-1.c: New test. * gcc.target/riscv/rvv/autovec/pr113206-2.c: New test. * gcc.target/riscv/rvv/autovec/pr113209.c: New test. 
--- gcc/config/riscv/riscv-vsetvl.cc | 43 +++ gcc/config/riscv/t-riscv | 2 +- .../gcc.target/riscv/rvv/autovec/pr113206-1.c | 29 + .../gcc.target/riscv/rvv/autovec/pr113206-2.c | 29 + .../gcc.target/riscv/rvv/autovec/pr113209.c | 34 +++ .../riscv/rvv/vsetvl/avl_single-23.c | 1 - .../riscv/rvv/vsetvl/vlmax_call-1.c | 15 +++ .../riscv/rvv/vsetvl/vlmax_call-2.c | 12 +++--- .../riscv/rvv/vsetvl/vlmax_call-3.c | 12 +++--- .../riscv/rvv/vsetvl/vlmax_conflict-5.c | 5 +-- .../riscv/rvv/vsetvl/vlmax_single_vtype-1.c | 14 +++--- .../riscv/rvv/vsetvl/vlmax_single_vtype-2.c | 6 +-- .../riscv/rvv/vsetvl/vlmax_single_vtype-3.c | 6 +-- .../riscv/rvv/vsetvl/vlmax_single_vtype-4.c | 4 +- .../riscv/rvv/vsetvl/vlmax_single_vtype-5.c | 4 +- 15 files changed, 166 insertions(+), 50 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113206-1.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113206-2.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113209.c diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index eabaef80f89..d44922feafd 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -85,6 +85,7 @@ along with GCC; see the file COPYING3. If not see #include "predict.h" #include "profile-count.h" #include "gcse.h" +#include "cfgloop.h" using namespace rtl_ssa; using namespace riscv_vector; @@ -648,6 +649,27 @@ has_no_uses (basic_block cfg_bb, rtx_insn *rinsn, int regno) return true; } +/* Return true for the special block that we can't apply LCM optimization. */ +static bool +invalid_opt_bb_p (basic_block cfg_bb) +{ + edge e; + edge_iterator ei; + + /* We don't do LCM optimizations on complex edges. */ + FOR_EACH_EDGE (e, ei, cfg_bb->preds) +if (e->flags & EDGE_COMPLEX) + return true; + + /* We only do LCM optimizations on blocks that are post dominated by + EXIT block, that is, we don't do LCM optimizations on infinite loop. 
*/ + FOR_EACH_EDGE (e, ei, cfg_bb->succs) +if (e->flags & EDGE_FAKE) + return true; + + return false; +} + /* This flags indicates the minimum demand of the vl and vtype values by the RVV instruction. For example, DEMAND_RATIO_P indicates that this RVV instruction only needs the SEW/LMUL ratio to remain the same, and does not @@ -2261,6 +2283,9 @@ public: { /* Initialization of RTL_SSA. */ calculate_dominance_info (CDI_DOMINATORS); +loop_optimizer_init (LOOPS_NORMAL); +/* Create FAKE edges for infinite loops. */ +connect_infinite_loops_to_exit (); df_analyze (); crtl->ssa = new function_info (cfun); m_vector_block_infos.safe_grow_c
[PATCH V2] RISC-V: Fix bug of earliest fusion for infinite loops [VSETVL PASS]
As PR113206 and PR113209, the bugs happens on the following situation: li a4,32 ... vsetvli zero,a4,e8,m8,ta,ma ... slliw a4,a3,24 sraiw a4,a4,24 bge a3,a1,.L8 sb a4,%lo(e)(a0) vsetvli zero,a4,e8,m8,ta,ma --> a4 is polluted value not the expected "32". ... .L7: j .L7 ---> infinite loop. The root cause is that infinite loop confuse earliest computation and let earliest fusion happens on unexpected place. Disable blocks that belong to infinite loop to fix this bug since applying ealiest LCM fusion on infinite loop seems quite complicated and we don't see any benefits. Note that disabling earliest fusion on infinite loops doesn't hurt the vsetvli performance, instead, it does improve codegen of some cases. Tested on both RV32 and RV64 no regression. Ok for trunk ? PR target/113206 PR target/113209 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (invalid_opt_bb_p): New function. (pre_vsetvl::compute_lcm_local_properties): Disable earliest fusion on blocks belong to infinite loop. (pre_vsetvl::emit_vsetvl): Remove fake edges. * config/riscv/t-riscv: Add a new include file. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/avl_single-23.c: Adapt test. * gcc.target/riscv/rvv/vsetvl/vlmax_call-1.c: Robostify test. * gcc.target/riscv/rvv/vsetvl/vlmax_call-2.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_call-3.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_conflict-5.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-1.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-2.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-3.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-4.c: Ditto. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-5.c: Ditto. * gcc.target/riscv/rvv/autovec/pr113206-1.c: New test. * gcc.target/riscv/rvv/autovec/pr113206-2.c: New test. * gcc.target/riscv/rvv/autovec/pr113209.c: New test. 
--- gcc/config/riscv/riscv-vsetvl.cc | 41 +++ gcc/config/riscv/t-riscv | 2 +- .../gcc.target/riscv/rvv/autovec/pr113206-1.c | 29 + .../gcc.target/riscv/rvv/autovec/pr113206-2.c | 29 + .../gcc.target/riscv/rvv/autovec/pr113209.c | 34 +++ .../riscv/rvv/vsetvl/avl_single-23.c | 1 - .../riscv/rvv/vsetvl/vlmax_call-1.c | 15 --- .../riscv/rvv/vsetvl/vlmax_call-2.c | 12 +++--- .../riscv/rvv/vsetvl/vlmax_call-3.c | 12 +++--- .../riscv/rvv/vsetvl/vlmax_conflict-5.c | 5 +-- .../riscv/rvv/vsetvl/vlmax_single_vtype-1.c | 14 +++ .../riscv/rvv/vsetvl/vlmax_single_vtype-2.c | 6 +-- .../riscv/rvv/vsetvl/vlmax_single_vtype-3.c | 6 +-- .../riscv/rvv/vsetvl/vlmax_single_vtype-4.c | 4 +- .../riscv/rvv/vsetvl/vlmax_single_vtype-5.c | 4 +- 15 files changed, 164 insertions(+), 50 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113206-1.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113206-2.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113209.c diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index eabaef80f89..7b1d8376e41 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -85,6 +85,7 @@ along with GCC; see the file COPYING3. If not see #include "predict.h" #include "profile-count.h" #include "gcse.h" +#include "cfgloop.h" using namespace rtl_ssa; using namespace riscv_vector; @@ -648,6 +649,27 @@ has_no_uses (basic_block cfg_bb, rtx_insn *rinsn, int regno) return true; } +/* Return true for the special block that we can't apply LCM optimization. */ +static bool +invalid_opt_bb_p (basic_block cfg_bb) +{ + edge e; + edge_iterator ei; + + /* We don't do LCM optimizations on complex edges. */ + FOR_EACH_EDGE (e, ei, cfg_bb->preds) +if (e->flags & EDGE_COMPLEX) + return true; + + /* We only do LCM optimizations on blocks that are post dominated by + EXIT block, that is, we don't do LCM optimizations on infinite loop. 
*/ + FOR_EACH_EDGE (e, ei, cfg_bb->succs) +if (e->flags & EDGE_FAKE) + return true; + + return false; +} + /* This flags indicates the minimum demand of the vl and vtype values by the RVV instruction. For example, DEMAND_RATIO_P indicates that this RVV instruction only needs the SEW/LMUL ratio to remain the same, and does not @@ -2261,6 +2283,8 @@ public: { /* Initialization of RTL_SSA. */ calculate_dominance_info (CDI_DOMINATORS); +loop_optimizer_init (LOOPS_NORMAL); +connect_infinite_loops_to_exit (); df_analyze (); crtl->ssa = new function_info (cfun); m_vector_block_infos.safe_grow_cleared (last_basic_block_for_fn (
[PATCH] RISC-V: Fix bug of earliest fusion for infinite loop [VSETVL PASS]
As PR113206, the bugs happens on the following situation: li a4,32 ... vsetvli zero,a4,e8,m8,ta,ma ... slliw a4,a3,24 sraiw a4,a4,24 bge a3,a1,.L8 sb a4,%lo(e)(a0) vsetvli zero,a4,e8,m8,ta,ma --> a4 is polluted value not the expected "32". ... .L7: j .L7 ---> infinite loop. The root cause is that infinite loop confuse earliest computation and let earliest fusion happens on unexpected place. Disable blocks that belong to infinite loop to fix this bug since applying ealiest LCM fusion on infinite loop seems quite complicated and we don't see any benefits. Note that disabling earliest fusion on infinite loops doesn't hurt the vsetvli performance, instead, it does improve codegen of some cases. Tested on both RV32 and RV64 no regression. Ok for trunk ? PR target/113206 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (invalid_opt_bb_p): New function. (pre_vsetvl::compute_lcm_local_properties): Disable earliest fusion on blocks belong to infinite loop. * config/riscv/t-riscv: Add a new include file. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/avl_single-23.c: Adapt test. * gcc.target/riscv/rvv/vsetvl/vlmax_call-1.c: Robostify test. * gcc.target/riscv/rvv/vsetvl/vlmax_call-2.c: Robostify test. * gcc.target/riscv/rvv/vsetvl/vlmax_call-3.c: Robostify test. * gcc.target/riscv/rvv/vsetvl/vlmax_conflict-5.c: Robostify test. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-1.c: Robostify test. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-2.c: Robostify test. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-3.c: Robostify test. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-4.c: Robostify test. * gcc.target/riscv/rvv/vsetvl/vlmax_single_vtype-5.c: Robostify test. * gcc.target/riscv/rvv/autovec/pr113206-1.c: New test. * gcc.target/riscv/rvv/autovec/pr113206-2.c: New test. 
--- gcc/config/riscv/riscv-vsetvl.cc | 39 +++ gcc/config/riscv/t-riscv | 2 +- .../gcc.target/riscv/rvv/autovec/pr113206-1.c | 29 ++ .../gcc.target/riscv/rvv/autovec/pr113206-2.c | 29 ++ .../riscv/rvv/vsetvl/avl_single-23.c | 1 - .../riscv/rvv/vsetvl/vlmax_call-1.c | 15 --- .../riscv/rvv/vsetvl/vlmax_call-2.c | 12 +++--- .../riscv/rvv/vsetvl/vlmax_call-3.c | 12 +++--- .../riscv/rvv/vsetvl/vlmax_conflict-5.c | 5 +-- .../riscv/rvv/vsetvl/vlmax_single_vtype-1.c | 14 +++ .../riscv/rvv/vsetvl/vlmax_single_vtype-2.c | 6 +-- .../riscv/rvv/vsetvl/vlmax_single_vtype-3.c | 6 +-- .../riscv/rvv/vsetvl/vlmax_single_vtype-4.c | 4 +- .../riscv/rvv/vsetvl/vlmax_single_vtype-5.c | 4 +- 14 files changed, 128 insertions(+), 50 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113206-1.c create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113206-2.c diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index eabaef80f89..07dcdfd217e 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -85,6 +85,7 @@ along with GCC; see the file COPYING3. If not see #include "predict.h" #include "profile-count.h" #include "gcse.h" +#include "cfgloop.h" using namespace rtl_ssa; using namespace riscv_vector; @@ -648,6 +649,27 @@ has_no_uses (basic_block cfg_bb, rtx_insn *rinsn, int regno) return true; } +/* Return true for the special block that we can't apply LCM optimization. */ +static bool +invalid_opt_bb_p (basic_block cfg_bb) +{ + edge e; + edge_iterator ei; + + /* We don't do LCM optimizations on complex edges. */ + FOR_EACH_EDGE (e, ei, cfg_bb->preds) +if (e->flags & EDGE_COMPLEX) + return true; + + /* We only do LCM optimizations on blocks that are post dominated by + EXIT block, that is, we don't do LCM optimizations on infinite loop. 
*/ + FOR_EACH_EDGE (e, ei, cfg_bb->succs) +if (e->flags & EDGE_FAKE) + return true; + + return false; +} + /* This flags indicates the minimum demand of the vl and vtype values by the RVV instruction. For example, DEMAND_RATIO_P indicates that this RVV instruction only needs the SEW/LMUL ratio to remain the same, and does not @@ -2261,6 +2283,8 @@ public: { /* Initialization of RTL_SSA. */ calculate_dominance_info (CDI_DOMINATORS); +loop_optimizer_init (LOOPS_NORMAL); +connect_infinite_loops_to_exit (); df_analyze (); crtl->ssa = new function_info (cfun); m_vector_block_infos.safe_grow_cleared (last_basic_block_for_fn (cfun)); @@ -2271,6 +2295,8 @@ public: void finish () { free_dominance_info (CDI_DOMINATORS); +remove_fake_exit_edges (); +loop_optimizer_finalize (); if (crtl->ssa->perform_pending_updates ())
[Committed] RISC-V: Add simplification of dummy len and dummy mask COND_LEN_xxx pattern
In https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=d1eacedc6d9ba9f5522f2c8d49ccfdf7939ad72d I optimize COND_LEN_xxx pattern with dummy len and dummy mask with too simply solution which causes redundant vsetvli in the following case: vsetvli a5,a2,e8,m1,ta,ma vle32.v v8,0(a0) vsetivlizero,16,e32,m4,tu,mu > We should apply VLMAX instead of a CONST_INT AVL sllia4,a5,2 vand.vv v0,v8,v16 vand.vv v4,v8,v12 vmseq.viv0,v0,0 sub a2,a2,a5 vneg.v v4,v8,v0.t vsetvli zero,a5,e32,m4,ta,ma The root cause above is the following codes: is_vlmax_len_p (...) return poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode)) && !satisfies_constraint_K (len);---> incorrect check. Actually, we should not elide the VLMAX situation that has AVL in range of [0,31]. After removing the the check above, we will have this following issue: vsetivlizero,4,e32,m1,ta,ma vlseg4e32.v v4,(a5) vlseg4e32.v v12,(a3) vsetvli a5,zero,e32,m1,tu,ma ---> This is redundant since VLMAX AVL = 4 when it is fixed-vlmax vfadd.vfv3,v13,fa0 vfadd.vfv1,v12,fa1 vfmul.vvv17,v3,v5 vfmul.vvv16,v1,v5 Since all the following operations (vfadd.vf ... etc) are COND_LEN_xxx with dummy len and dummy mask, we add the simplification operations dummy len and dummy mask into VLMAX TA and MA policy. So, after this patch. Both cases are optimal codegen now: case 1: vsetvli a5,a2,e32,m1,ta,mu vle32.v v2,0(a0) sllia4,a5,2 vand.vv v1,v2,v3 vand.vv v0,v2,v4 sub a2,a2,a5 vmseq.viv0,v0,0 vneg.v v1,v2,v0.t vse32.v v1,0(a1) case 2: vsetivli zero,4,e32,m1,tu,ma addi a4,a5,400 vlseg4e32.v v12,(a3) vfadd.vf v3,v13,fa0 vfadd.vf v1,v12,fa1 vlseg4e32.v v4,(a4) vfadd.vf v2,v14,fa1 vfmul.vv v17,v3,v5 vfmul.vv v16,v1,v5 This patch is just additional fix of previous approved patch. Tested on both RV32 and RV64 newlib no regression. Committed. gcc/ChangeLog: * config/riscv/riscv-v.cc (is_vlmax_len_p): Remove satisfies_constraint_K. (expand_cond_len_op): Add simplification of dummy len and dummy mask. 
gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/base/vf_avl-3.c: New test. --- gcc/config/riscv/riscv-v.cc| 11 --- gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-3.c | 11 +++ 2 files changed, 19 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-3.c diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index b4c7e0f0126..3c83be35715 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -74,8 +74,7 @@ is_vlmax_len_p (machine_mode mode, rtx len) { poly_int64 value; return poly_int_rtx_p (len, &value) -&& known_eq (value, GET_MODE_NUNITS (mode)) -&& !satisfies_constraint_K (len); +&& known_eq (value, GET_MODE_NUNITS (mode)); } /* Helper functions for insn_flags && insn_types */ @@ -3855,7 +3854,13 @@ expand_cond_len_op (unsigned icode, insn_flags op_type, rtx *ops, rtx len) bool is_vlmax_len = is_vlmax_len_p (mode, len); unsigned insn_flags = HAS_DEST_P | HAS_MASK_P | HAS_MERGE_P | op_type; - if (is_dummy_mask) + /* FIXME: We don't support simplification of COND_LEN_NEG (..., dummy len, + dummy mask) into NEG_EXPR in GIMPLE FOLD yet. So, we do such + simplification in RISC-V backend and may do that in middle-end in the + future. */ + if (is_dummy_mask && is_vlmax_len) +insn_flags |= TDEFAULT_POLICY_P | MDEFAULT_POLICY_P; + else if (is_dummy_mask) insn_flags |= TU_POLICY_P | MDEFAULT_POLICY_P; else if (is_vlmax_len) insn_flags |= TDEFAULT_POLICY_P | MU_POLICY_P; diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-3.c b/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-3.c new file mode 100644 index 000..116b5b538cc --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-3.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d --param riscv-autovec-preference=fixed-vlmax" } */ + +void foo (int *src, int *dst, int size) { + int i; + for (i = 0; i < size; i++) + *dst++ = *src & 0x80 ? 
(*src++ & 0x7f) : -*src++; +} + +/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*[a-x0-9]+,\s*e32,\s*m1,\s*t[au],\s*mu} 1 } } */ +/* { dg-final { scan-assembler-times {vsetvli} 1 } } */ -- 2.36.3
[PATCH] RISC-V: Make liveness be aware of rgroup number of LENS [dynamic LMUL]
This patch fixes the following situation: vl4re16.v v12,0(a5) ... vl4re16.v v16,0(a3) vs4r.v v12,0(a5) ... vl4re16.v v4,0(a0) vs4r.v v16,0(a3) ... vsetvli a3,zero,e16,m4,ta,ma ... vmv.v.x v8,t6 vmsgeu.vv v2,v16,v8 vsub.vv v16,v16,v8 vs4r.v v16,0(a5) ... vs4r.v v4,0(a0) vmsgeu.vv v1,v4,v8 ... vsub.vv v4,v4,v8 sllia6,a4,2 vs4r.v v4,0(a5) ... vsub.vv v4,v12,v8 vmsgeu.vv v3,v12,v8 vs4r.v v4,0(a5) ... There are many spills which are 'vs4r.v'. The root cause is that we don't count vector REG liveness referencing the rgroup controls. _29 = _25->iatom[0]; is transformed into the following vect statement with 4 different loop_len (loop_len_74, loop_len_75, loop_len_76, loop_len_77). vect__29.11_78 = .MASK_LEN_LOAD (vectp_sb.9_72, 32B, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, loop_len_74, 0); vect__29.12_80 = .MASK_LEN_LOAD (vectp_sb.9_79, 32B, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, loop_len_75, 0); vect__29.13_82 = .MASK_LEN_LOAD (vectp_sb.9_81, 32B, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, loop_len_76, 0); vect__29.14_84 = .MASK_LEN_LOAD (vectp_sb.9_83, 32B, { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, loop_len_77, 0); which are the LENS number (LOOP_VINFO_LENS (loop_vinfo).length ()). Count liveness according to LOOP_VINFO_LENS (loop_vinfo).length () to compute liveness more accurately: vsetivlizero,8,e16,m1,ta,ma vmsgeu.vi v19,v14,8 vadd.vi v18,v14,-8 vmsgeu.vi v17,v1,8 vadd.vi v16,v1,-8 vlm.v v15,0(a5) ... Tested no regression, ok for trunk ? PR target/113112 gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (compute_nregs_for_mode): Add rgroup info. (max_number_of_live_regs): Ditto. (has_unexpected_spills_p): Ditto. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/pr113112-5.c: New test. 
--- gcc/config/riscv/riscv-vector-costs.cc| 34 +++ .../vect/costmodel/riscv/rvv/pr113112-5.c | 24 + 2 files changed, 52 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-5.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 1199b3af067..12d3b57aff6 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -373,13 +373,17 @@ compute_local_live_ranges ( E.g. If mode = SImode, biggest_mode = DImode, LMUL = M4. Then return RVVM4SImode (LMUL = 4, element mode = SImode). */ static unsigned int -compute_nregs_for_mode (machine_mode mode, machine_mode biggest_mode, int lmul) +compute_nregs_for_mode (loop_vec_info loop_vinfo, machine_mode mode, + machine_mode biggest_mode, int lmul) { + unsigned int rgroup_size = LOOP_VINFO_LENS (loop_vinfo).is_empty () + ? 1 + : LOOP_VINFO_LENS (loop_vinfo).length (); unsigned int mode_size = GET_MODE_SIZE (mode).to_constant (); unsigned int biggest_size = GET_MODE_SIZE (biggest_mode).to_constant (); gcc_assert (biggest_size >= mode_size); unsigned int ratio = biggest_size / mode_size; - return MAX (lmul / ratio, 1); + return MAX (lmul / ratio, 1) * rgroup_size; } /* This function helps to determine whether current LMUL will cause @@ -393,7 +397,7 @@ compute_nregs_for_mode (machine_mode mode, machine_mode biggest_mode, int lmul) mode. - Third, Return the maximum V_REGs are alive of the loop. 
*/ static unsigned int -max_number_of_live_regs (const basic_block bb, +max_number_of_live_regs (loop_vec_info loop_vinfo, const basic_block bb, const hash_map &live_ranges, unsigned int max_point, machine_mode biggest_mode, int lmul) @@ -412,7 +416,7 @@ max_number_of_live_regs (const basic_block bb, { machine_mode mode = TYPE_MODE (TREE_TYPE (var)); unsigned int nregs - = compute_nregs_for_mode (mode, biggest_mode, lmul); + = compute_nregs_for_mode (loop_vinfo, mode, biggest_mode, lmul); live_vars_vec[i] += nregs; if (live_vars_vec[i] > max_nregs) { @@ -687,6 +691,24 @@ update_local_live_ranges ( dump_printf_loc (MSG_NOTE, vect_location, "Add perm indice %T, start = 0, end = %d\n", sel, max_point); + if (!LOOP_VINFO_LENS (loop_vinfo).is_empty () + && LOOP_VINFO_LENS (loop_vinfo).length () > 1) + { + /* If we are vectorizing a permutation when the rgroup number +> 1, we will need additional mask to shuffle the second +vector. */ + tree mask = build_decl (UNKNOWN_LOCATION, VAR_DECL, +
[Committed] RISC-V: Declare STMT_VINFO_TYPE (...) as local variable
Committed. gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc: Move STMT_VINFO_TYPE (...) to local. --- gcc/config/riscv/riscv-vector-costs.cc | 9 - 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index b41a79429d4..1199b3af067 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -279,10 +279,11 @@ compute_local_live_ranges ( gimple *stmt = program_point.stmt; stmt_vec_info stmt_info = program_point.stmt_info; tree lhs = gimple_get_lhs (stmt); + enum stmt_vec_info_type type + = STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info)); if (lhs != NULL_TREE && is_gimple_reg (lhs) && (!POINTER_TYPE_P (TREE_TYPE (lhs)) - || STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info)) - != store_vec_info_type)) + || type != store_vec_info_type)) { biggest_mode = get_biggest_mode (biggest_mode, TYPE_MODE (TREE_TYPE (lhs))); @@ -309,9 +310,7 @@ compute_local_live_ranges ( if (poly_int_tree_p (var) || (is_gimple_val (var) && (!POINTER_TYPE_P (TREE_TYPE (var)) - || STMT_VINFO_TYPE ( - vect_stmt_to_vectorize (stmt_info)) - != load_vec_info_type))) + || type != load_vec_info_type))) { biggest_mode = get_biggest_mode (biggest_mode, -- 2.36.3
[Committed] RISC-V: Robustify testcase pr113112-1.c
The redudant dump check is fragile and easily changed, not necessary. Tested on both RV32/RV64 no regression. Remove it and committed. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c: Remove redundant checks. --- gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c index 95df7809d49..2dc39ad8e8b 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c @@ -24,6 +24,3 @@ foo (int n){ /* { dg-final { scan-assembler-not {jr} } } */ /* { dg-final { scan-assembler-times {ret} 1 } } */ /* { dg-final { scan-tree-dump-times "Preferring smaller LMUL loop because it has unexpected spills" 1 "vect" } } */ -/* { dg-final { scan-tree-dump "At most 8 number of live V_REG at program point 1 for bb 4" "vect" } } */ -/* { dg-final { scan-tree-dump "At most 40 number of live V_REG at program point 1 for bb 3" "vect" } } */ -/* { dg-final { scan-tree-dump "At most 8 number of live V_REG at program point 1 for bb 5" "vect" } } */ -- 2.36.3
[PATCH] RISC-V: Count pointer type SSA into RVV regs liveness for dynamic LMUL cost model
This patch fixes the following choosing unexpected big LMUL which cause register spillings. Before this patch, choosing LMUL = 4: addisp,sp,-160 addiw t1,a2,-1 li a5,7 bleut1,a5,.L16 vsetivlizero,8,e64,m4,ta,ma vmv.v.x v4,a0 vs4r.v v4,0(sp)---> spill to the stack. vmv.v.x v4,a1 addia5,sp,64 vs4r.v v4,0(a5)---> spill to the stack. The root cause is the following codes: if (poly_int_tree_p (var) || (is_gimple_val (var) && !POINTER_TYPE_P (TREE_TYPE (var We count the variable as consuming a RVV reg group when it is not POINTER_TYPE. It is right for load/store STMT for example: _1 = (MEM)*addr --> addr won't be allocated an RVV vector group. However, we find it is not right for non-load/store STMT: _3 = _1 == x_8(D); _1 is pointer type too but we does allocate a RVV register group for it. So after this patch, we are choosing the perfect LMUL for the testcase in this patch: ble a2,zero,.L17 addiw a7,a2,-1 li a5,3 bleua7,a5,.L15 srliw a5,a7,2 sllia6,a5,1 add a6,a6,a5 lui a5,%hi(replacements) addit1,a5,%lo(replacements) sllia6,a6,5 lui t4,%hi(.LANCHOR0) lui t3,%hi(.LANCHOR0+8) lui a3,%hi(.LANCHOR0+16) lui a4,%hi(.LC1) vsetivlizero,4,e16,mf2,ta,ma addit4,t4,%lo(.LANCHOR0) addit3,t3,%lo(.LANCHOR0+8) addia3,a3,%lo(.LANCHOR0+16) addia4,a4,%lo(.LC1) add a6,t1,a6 addia5,a5,%lo(replacements) vle16.v v18,0(t4) vle16.v v17,0(t3) vle16.v v16,0(a3) vmsgeu.vi v25,v18,4 vadd.vi v24,v18,-4 vmsgeu.vi v23,v17,4 vadd.vi v22,v17,-4 vlm.v v21,0(a4) vmsgeu.vi v20,v16,4 vadd.vi v19,v16,-4 vsetvli zero,zero,e64,m2,ta,mu vmv.v.x v12,a0 vmv.v.x v14,a1 .L4: vlseg3e64.v v6,(a5) vmseq.vvv2,v6,v12 vmseq.vvv0,v8,v12 vmsne.vvv1,v8,v12 vmand.mmv1,v1,v2 vmerge.vvm v2,v8,v14,v0 vmv1r.v v0,v1 addia4,a5,24 vmerge.vvm v6,v6,v14,v0 vmerge.vim v2,v2,0,v0 vrgatherei16.vv v4,v6,v18 vmv1r.v v0,v25 vrgatherei16.vv v4,v2,v24,v0.t vs1r.v v4,0(a5) addia3,a5,48 vmv1r.v v0,v21 vmv2r.v v4,v2 vcompress.vmv4,v6,v0 vs1r.v v4,0(a4) vmv1r.v v0,v23 addia4,a5,72 vrgatherei16.vv v4,v6,v17 vrgatherei16.vv v4,v2,v22,v0.t vs1r.v 
v4,0(a3) vmv1r.v v0,v20 vrgatherei16.vv v4,v6,v16 addia5,a5,96 vrgatherei16.vv v4,v2,v19,v0.t vs1r.v v4,0(a4) bne a6,a5,.L4 No spillings, no "sp" register used. Tested on both RV32 and RV64, no regression. Ok for trunk ? PR target/113112 gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (compute_nregs_for_mode): Fix pointer type liveness count. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c: New test. --- gcc/config/riscv/riscv-vector-costs.cc| 12 ++-- .../vect/costmodel/riscv/rvv/pr113112-4.c | 28 +++ 2 files changed, 37 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-4.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 0c485dc4f29..b41a79429d4 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -277,9 +277,12 @@ compute_local_live_ranges ( { unsigned int point = program_point.point; gimple *stmt = program_point.stmt; + stmt_vec_info stmt_info = program_point.stmt_info; tree lhs = gimple_get_lhs (stmt); if (lhs != NULL_TREE && is_gimple_reg (lhs) - && !POINTER_TYPE_P (TREE_TYPE (lhs))) + && (!POINTER_TYPE_P (TREE_TYPE (lhs)) + || STMT_VINFO_TYPE (vect_stmt_to_vectorize (stmt_info)) + != store_vec_info_type)) { biggest_mode = get_biggest_mode (biggest_mode, TYPE_MODE (TREE_TYPE (lhs))); @@ -305,7 +308,10 @@ compute_local_live_ranges ( the future. */ if (poly_int_tree_p (var) || (is_gimple_val (var) - && !POINTER_TYPE_P (TREE_TYPE (var + && (!POINTER_TYPE_P (TREE_TYPE (var)) + || STMT_VINFO_TYPE ( +
[Committed] RISC-V: Make dynamic LMUL cost model more accurate for conversion codes
Notice current dynamic LMUL is not accurate for conversion codes. Refine for it, there is current case is changed from choosing LMUL = 4 into LMUL = 8. Tested no regression, committed. Before this patch (LMUL = 4): After this patch (LMUL = 8): lw a7,56(sp) lwa7,56(sp) ld t5,0(sp) ldt5,0(sp) ld t1,8(sp) ldt1,8(sp) ld t6,16(sp) ldt6,16(sp) ld t0,24(sp) ldt0,24(sp) ld t3,32(sp) ldt3,32(sp) ld t4,40(sp) ldt4,40(sp) ble a7,zero,.L5 ble a7,zero,.L5 .L3: .L3: vsetvli a4,a7,e32,m2,ta,mavsetvli a4,a7,e32,m4,ta vle8.v v1,0(a2) vle8.vv3,0(a2) vle8.v v4,0(a1) vle8.vv16,0(t0) vsext.vf4 v8,v1 vle8.vv7,0(a1) vsext.vf4 v2,v4 vle8.vv12,0(t6) vsetvli zero,zero,e8,mf2,ta,mavle8.vv2,0(a5) vadd.vv v4,v4,v1 vle8.vv1,0(t5) vsetvli zero,zero,e32,m2,ta,mavsext.vf4 v20,v3 vle8.v v5,0(t0) vsext.vf4 v8,v7 vle8.v v6,0(t6) vadd.vv v8,v8,v20 vadd.vv v2,v2,v8 vadd.vv v8,v8,v8 vadd.vv v2,v2,v2 vadd.vv v8,v8,v20 vadd.vv v2,v2,v8 vsetvli zero,zero,e8,m1 vsetvli zero,zero,e8,mf2,ta,mavadd.vv v15,v12,v16 vadd.vv v6,v6,v5 vsetvli zero,zero,e32,m4 vsetvli zero,zero,e32,m2,ta,mavsext.vf4 v12,v15 vle8.v v8,0(t5) vadd.vv v8,v8,v12 vle8.v v9,0(a5) vsetvli zero,zero,e8,m1 vsext.vf4 v10,v4vadd.vv v7,v7,v3 vsext.vf4 v12,v6vsetvli zero,zero,e32,m4 vadd.vv v2,v2,v12 vsext.vf4 v4,v7 vadd.vv v2,v2,v10 vadd.vv v8,v8,v4 vsetvli zero,zero,e16,m1,ta,mavsetvli zero,zero,e16,m2 vncvt.x.x.w v4,v2 vncvt.x.x.w v4,v8 vsetvli zero,zero,e32,m2,ta,mavsetvli zero,zero,e8,m1 vadd.vv v6,v2,v2 vncvt.x.x.w v4,v4 vsetvli zero,zero,e8,mf2,ta,mavadd.vv v15,v3,v4 vncvt.x.x.w v4,v4 vadd.vv v2,v2,v4 vadd.vv v5,v5,v4 vse8.vv15,0(t4) vadd.vv v9,v9,v4 vadd.vv v3,v16,v4 vadd.vv v1,v1,v4 vse8.vv2,0(a3) vadd.vv v4,v8,v4 vadd.vv v1,v1,v4 vse8.v v1,0(t4) vse8.vv1,0(a6) vse8.v v9,0(a3) vse8.vv3,0(t1) vsetvli zero,zero,e32,m2,ta,mavsetvli zero,zero,e32,m4 vse8.v v4,0(a6) vsext.vf4 v4,v3 vsext.vf4 v8,v5 vadd.vv v4,v4,v8 vse8.v v5,0(t1) vsetvli zero,zero,e64,m8 vadd.vv v2,v8,v2 vsext.vf2 v16,v4 vsetvli zero,zero,e64,m4,ta,mavse64.v v16,0(t3) 
vsext.vf2 v8,v2 vsetvli zero,zero,e32,m4 vsetvli zero,zero,e32,m2,ta,mavadd.vv v8,v8,v8 sllit2,a4,3 vsext.vf4 v4,v15 vse64.v v8,0(t3) slli t2,a4,3 vsext.vf4 v2,v1 vadd.vv v4,v8,v4 sub a7,a7,a4 sub a7,a7,a4 vadd.vv v2,v6,v2 vsetvli zero,zero,e64,m8 vsetvli zero,zero,e64,m4,ta,mavsext.vf2 v8,v4 vsext.vf2 v4,v2 vse64.v v8,0(a0) vse64.v v4,0(a0) add a1,a1,a4 add a2,a2,a4 add a2,a2,a4 add a1,a1,a4 add a5,a5,a4 add t6,t6,a4 add t5,t5,a4 add t0,t0,a4 add t6,t6,a4 add a5,a5,a4 add
[Committed] RISC-V: Make known NITERS loop be aware of dynamic LMUL cost model liveness information
Consider this following case: int f[12][100]; void bad1(int v1, int v2) { for (int r = 0; r < 100; r += 4) { int i = r + 1; f[0][r] = f[1][r] * (f[2][r]) - f[1][i] * (f[2][i]); f[0][i] = f[1][r] * (f[2][i]) + f[1][i] * (f[2][r]); f[0][r+2] = f[1][r+2] * (f[2][r+2]) - f[1][i+2] * (f[2][i+2]); f[0][i+2] = f[1][r+2] * (f[2][i+2]) + f[1][i+2] * (f[2][r+2]); } } Pick up LMUL = 8 VLS blindly: lui a4,%hi(f) addia4,a4,%lo(f) addisp,sp,-592 addia3,a4,800 lui a5,%hi(.LANCHOR0) vl8re32.v v24,0(a3) addia5,a5,%lo(.LANCHOR0) addia1,a4,400 addia3,sp,140 vl8re32.v v16,0(a1) vl4re16.v v4,0(a5) addia7,a5,192 vs4r.v v4,0(a3) addit0,a5,64 addia3,sp,336 li t2,32 addia2,a5,128 vsetvli a5,zero,e32,m8,ta,ma vrgatherei16.vv v8,v16,v4 vmul.vv v8,v8,v24 vl8re32.v v0,0(a7) vs8r.v v8,0(a3) vmsltu.vx v8,v0,t2 addia3,sp,12 addit2,sp,204 vsm.v v8,0(t2) vl4re16.v v4,0(t0) vl4re16.v v0,0(a2) vs4r.v v4,0(a3) addit0,sp,336 vrgatherei16.vv v8,v24,v4 addia3,sp,208 vrgatherei16.vv v24,v16,v0 vs4r.v v0,0(a3) vmul.vv v8,v8,v24 vlm.v v0,0(t2) vl8re32.v v24,0(t0) addia3,sp,208 vsub.vv v16,v24,v8 addit6,a4,528 vadd.vv v8,v24,v8 addit5,a4,928 vmerge.vvm v8,v8,v16,v0 addit3,a4,128 vs8r.v v8,0(a4) addit4,a4,1056 addit1,a4,656 addia0,a4,256 addia6,a4,1184 addia1,a4,784 addia7,a4,384 addia4,sp,140 vl4re16.v v0,0(a3) vl8re32.v v24,0(t6) vl4re16.v v4,0(a4) vrgatherei16.vv v16,v24,v0 addia3,sp,12 vs8r.v v16,0(t0) vl8re32.v v8,0(t5) vrgatherei16.vv v16,v24,v4 vl4re16.v v4,0(a3) vrgatherei16.vv v24,v8,v4 vmul.vv v16,v16,v8 vl8re32.v v8,0(t0) vmul.vv v8,v8,v24 vsub.vv v24,v16,v8 vlm.v v0,0(t2) addia3,sp,208 vadd.vv v8,v8,v16 vl8re32.v v16,0(t4) vmerge.vvm v8,v8,v24,v0 vrgatherei16.vv v24,v16,v4 vs8r.v v24,0(t0) vl4re16.v v28,0(a3) addia3,sp,464 vs8r.v v8,0(t3) vl8re32.v v8,0(t1) vrgatherei16.vv v0,v8,v28 vs8r.v v0,0(a3) addia3,sp,140 vl4re16.v v24,0(a3) addia3,sp,464 vrgatherei16.vv v0,v8,v24 vl8re32.v v24,0(t0) vmv8r.v v8,v0 vl8re32.v v0,0(a3) vmul.vv v8,v8,v16 vmul.vv v24,v24,v0 vsub.vv v16,v8,v24 vadd.vv v8,v8,v24 
vsetivlizero,4,e32,m8,ta,ma vle32.v v24,0(a6) vsetvli a4,zero,e32,m8,ta,ma addia4,sp,12 vlm.v v0,0(t2) vmerge.vvm v8,v8,v16,v0 vl4re16.v v16,0(a4) vrgatherei16.vv v0,v24,v16 vsetivlizero,4,e32,m8,ta,ma vs8r.v v0,0(a4) addia4,sp,208 vl4re16.v v0,0(a4) vs8r.v v8,0(a0) vle32.v v16,0(a1) vsetvli a5,zero,e32,m8,ta,ma vrgatherei16.vv v8,v16,v0 vs8r.v v8,0(a4) addia4,sp,140 vl4re16.v v4,0(a4) addia5,sp,12 vrgatherei16.vv v8,v16,v4 vl8re32.v v0,0(a5) vsetivlizero,4,e32,m8,ta,ma addia5,sp,208 vmv8r.v v16,v8 vl8re32.v v8,0(a5) vmul.vv v24,v24,v16 vmul.vv v8,v0,v8 vsub.vv v16,v24,v8 vadd.vv v8,v8,v24 vsetvli a5,zero,e8,m2,ta,ma vlm.v v0,0(t2) vsetivlizero,4,e32,m8,ta,ma vmerge.vvm v8,v8,v16,v0 vse32.v v8,0(a7) addisp,sp,592 jr ra This patch makes loop with known NITERS be aware of liveness estimation, after this patch, choosing LMUL = 4: lui a5,%hi(f) addia5,a5,%lo(f) addia3,a5,400 addia4,a5,800 vsetivlizero,8,e32,m2,ta,ma vlseg4e32.v v16,(a3) vlseg4e32.v v8,(a4) vmul.vv v2,v8,v16 addia3,a5,528 vmv.v.v v24,v10 vnmsub.vv v24,v18,v2 addia4,a5,928 vmul.vv v2,v12,v22 vmul.vv v6,v8,v18 vmv.v.v v30,v2 vmacc.vvv30,v14,v20 vmv.v.v v26,v6 vmacc.vvv26,v10,v16 vmul.vv v4,v12,v20 vmv.v.v v28,v14 vnmsub.vv v28,v22,v4 vsseg4e32.v v24,(a5) vlseg4e32.v v16,(a3) vlseg4e32.v v8,(a4)
[PATCH V2] RISC-V: Disallow transformation into VLMAX AVL for cond_len_xxx when length is in range [0, 31]
Notice we have this following situation: vsetivlizero,4,e32,m1,ta,ma vlseg4e32.v v4,(a5) vlseg4e32.v v12,(a3) vsetvli a5,zero,e32,m1,tu,ma ---> This is redundant since VLMAX AVL = 4 when it is fixed-vlmax vfadd.vfv3,v13,fa0 vfadd.vfv1,v12,fa1 vfmul.vvv17,v3,v5 vfmul.vvv16,v1,v5 The rootcause is that we transform COND_LEN_xxx into VLMAX AVL when len == NUNITS blindly. However, we don't need to transform all of them since when len is range of [0,31], we don't need to consume scalar registers. After this patch: vsetivlizero,4,e32,m1,tu,ma addia4,a5,400 vlseg4e32.v v12,(a3) vfadd.vfv3,v13,fa0 vfadd.vfv1,v12,fa1 vlseg4e32.v v4,(a4) vfadd.vfv2,v14,fa1 vfmul.vvv17,v3,v5 vfmul.vvv16,v1,v5 Tested on both RV32 and RV64 no regression. Ok for trunk ? gcc/ChangeLog: * config/riscv/riscv-v.cc (is_vlmax_len_p): New function. (expand_load_store): Disallow transformation into VLMAX when len is in range of [0,31] (expand_cond_len_op): Ditto. (expand_gather_scatter): Ditto. (expand_lanes_load_store): Ditto. (expand_fold_extract_last): Ditto. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/post-ra-avl.c: Adapt test. * gcc.target/riscv/rvv/base/vf_avl-2.c: New test. --- gcc/config/riscv/riscv-v.cc | 21 +-- .../riscv/rvv/autovec/post-ra-avl.c | 2 +- .../gcc.target/riscv/rvv/base/vf_avl-2.c | 21 +++ 3 files changed, 37 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-2.c diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index 038ab084a37..0cc7af58da6 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -68,6 +68,16 @@ imm_avl_p (machine_mode mode) : false; } +/* Return true if LEN is equal to NUNITS that outbounds range of [0, 31]. 
*/ +static bool +is_vlmax_len_p (machine_mode mode, rtx len) +{ + poly_int64 value; + return poly_int_rtx_p (len, &value) +&& known_eq (value, GET_MODE_NUNITS (mode)) +&& !satisfies_constraint_K (len); +} + /* Helper functions for insn_flags && insn_types */ /* Return true if caller need pass mask operand for insn pattern with @@ -3776,7 +3786,7 @@ expand_load_store (rtx *ops, bool is_load) rtx len = ops[3]; machine_mode mode = GET_MODE (ops[0]); - if (poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode))) + if (is_vlmax_len_p (mode, len)) { /* If the length operand is equal to VF, it is VLMAX load/store. */ if (is_load) @@ -3842,8 +3852,7 @@ expand_cond_len_op (unsigned icode, insn_flags op_type, rtx *ops, rtx len) machine_mode mask_mode = GET_MODE (mask); poly_int64 value; bool is_dummy_mask = rtx_equal_p (mask, CONSTM1_RTX (mask_mode)); - bool is_vlmax_len -= poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode)); + bool is_vlmax_len = is_vlmax_len_p (mode, len); unsigned insn_flags = HAS_DEST_P | HAS_MASK_P | HAS_MERGE_P | op_type; if (is_dummy_mask) @@ -4012,7 +4021,7 @@ expand_gather_scatter (rtx *ops, bool is_load) unsigned inner_offsize = GET_MODE_BITSIZE (inner_idx_mode); poly_int64 nunits = GET_MODE_NUNITS (vec_mode); poly_int64 value; - bool is_vlmax = poly_int_rtx_p (len, &value) && known_eq (value, nunits); + bool is_vlmax = is_vlmax_len_p (vec_mode, len); /* Extend the offset element to address width. */ if (inner_offsize < BITS_PER_WORD) @@ -4199,7 +4208,7 @@ expand_lanes_load_store (rtx *ops, bool is_load) rtx reg = is_load ? ops[0] : ops[1]; machine_mode mode = GET_MODE (ops[0]); - if (poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode))) + if (is_vlmax_len_p (mode, len)) { /* If the length operand is equal to VF, it is VLMAX load/store. 
*/ if (is_load) @@ -4252,7 +4261,7 @@ expand_fold_extract_last (rtx *ops) rtx slide_vect = gen_reg_rtx (mode); insn_code icode; - if (poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode))) + if (is_vlmax_len_p (mode, len)) len = NULL_RTX; /* Calculate the number of 1-bit in mask. */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/post-ra-avl.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/post-ra-avl.c index f3d12bac7cd..bff6dcb1c38 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/post-ra-avl.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/post-ra-avl.c @@ -13,4 +13,4 @@ int foo() { return a; } -/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero} 1 } } */ +/* { dg-final { scan-assembler-not {vsetvli\s+[a-x0-9]+,\s*zero} } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/vf_
[PATCH] RISC-V: Disallow transformation into VLMAX AVL for cond_len_xxx when length is in range [0, 31]
Notice we have this following situation: vsetivlizero,4,e32,m1,ta,ma vlseg4e32.v v4,(a5) vlseg4e32.v v12,(a3) vsetvli a5,zero,e32,m1,tu,ma ---> This is redundant since VLMAX AVL = 4 when it is fixed-vlmax vfadd.vfv3,v13,fa0 vfadd.vfv1,v12,fa1 vfmul.vvv17,v3,v5 vfmul.vvv16,v1,v5 The rootcause is that we transform COND_LEN_xxx into VLMAX AVL when len == NUNITS blindly. However, we don't need to transform all of them since when len is range of [0,31], we don't need to consume scalar registers. After this patch: vsetivlizero,4,e32,m1,tu,ma addia4,a5,400 vlseg4e32.v v12,(a3) vfadd.vfv3,v13,fa0 vfadd.vfv1,v12,fa1 vlseg4e32.v v4,(a4) vfadd.vfv2,v14,fa1 vfmul.vvv17,v3,v5 vfmul.vvv16,v1,v5 Tested on both RV32 and RV64 no regression. Ok for trunk ? gcc/ChangeLog: * config/riscv/riscv-v.cc (is_vlmax_len_p): New function. (expand_load_store): Disallow transformation into VLMAX when len is in range of [0,31] (expand_cond_len_op): Ditto. (expand_gather_scatter): Ditto. (expand_lanes_load_store): Ditto. (expand_fold_extract_last): Ditto. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/post-ra-avl.c: Adapt test. * gcc.target/riscv/rvv/base/vf_avl-2.c: New test. --- gcc/config/riscv/riscv-v.cc | 21 +-- .../riscv/rvv/autovec/post-ra-avl.c | 2 +- .../gcc.target/riscv/rvv/base/vf_avl-2.c | 21 +++ 3 files changed, 37 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-2.c diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index 038ab084a37..0cc7af58da6 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -68,6 +68,16 @@ imm_avl_p (machine_mode mode) : false; } +/* Return true if LEN is equal to NUNITS that outbounds range of [0, 31]. 
*/ +static bool +is_vlmax_len_p (machine_mode mode, rtx len) +{ + poly_int64 value; + return poly_int_rtx_p (len, &value) +&& known_eq (value, GET_MODE_NUNITS (mode)) +&& !satisfies_constraint_K (len); +} + /* Helper functions for insn_flags && insn_types */ /* Return true if caller need pass mask operand for insn pattern with @@ -3776,7 +3786,7 @@ expand_load_store (rtx *ops, bool is_load) rtx len = ops[3]; machine_mode mode = GET_MODE (ops[0]); - if (poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode))) + if (is_vlmax_len_p (mode, len)) { /* If the length operand is equal to VF, it is VLMAX load/store. */ if (is_load) @@ -3842,8 +3852,7 @@ expand_cond_len_op (unsigned icode, insn_flags op_type, rtx *ops, rtx len) machine_mode mask_mode = GET_MODE (mask); poly_int64 value; bool is_dummy_mask = rtx_equal_p (mask, CONSTM1_RTX (mask_mode)); - bool is_vlmax_len -= poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode)); + bool is_vlmax_len = is_vlmax_len_p (mode, len); unsigned insn_flags = HAS_DEST_P | HAS_MASK_P | HAS_MERGE_P | op_type; if (is_dummy_mask) @@ -4012,7 +4021,7 @@ expand_gather_scatter (rtx *ops, bool is_load) unsigned inner_offsize = GET_MODE_BITSIZE (inner_idx_mode); poly_int64 nunits = GET_MODE_NUNITS (vec_mode); poly_int64 value; - bool is_vlmax = poly_int_rtx_p (len, &value) && known_eq (value, nunits); + bool is_vlmax = is_vlmax_len_p (vec_mode, len); /* Extend the offset element to address width. */ if (inner_offsize < BITS_PER_WORD) @@ -4199,7 +4208,7 @@ expand_lanes_load_store (rtx *ops, bool is_load) rtx reg = is_load ? ops[0] : ops[1]; machine_mode mode = GET_MODE (ops[0]); - if (poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode))) + if (is_vlmax_len_p (mode, len)) { /* If the length operand is equal to VF, it is VLMAX load/store. 
*/ if (is_load) @@ -4252,7 +4261,7 @@ expand_fold_extract_last (rtx *ops) rtx slide_vect = gen_reg_rtx (mode); insn_code icode; - if (poly_int_rtx_p (len, &value) && known_eq (value, GET_MODE_NUNITS (mode))) + if (is_vlmax_len_p (mode, len)) len = NULL_RTX; /* Calculate the number of 1-bit in mask. */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/post-ra-avl.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/post-ra-avl.c index f3d12bac7cd..c77b2d187fe 100644 --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/post-ra-avl.c +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/post-ra-avl.c @@ -13,4 +13,4 @@ int foo() { return a; } -/* { dg-final { scan-assembler-times {vsetvli\s+[a-x0-9]+,\s*zero} 1 } } */ +/* { dg-final { scan-assembler-not {vsetvli} } } */ diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/vf_avl-2.c b/gcc/tests
[Committed] RISC-V: Fix typo
gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-10.c: Fix typo. --- .../gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-10.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-10.c index f3c2315c2c5..e47af25aa9b 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-10.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-10.c @@ -19,5 +19,5 @@ bar (int *x, int a, int b, int n) /* { dg-final { scan-assembler {e32,m4} } } */ /* { dg-final { scan-assembler-not {jr} } } */ -/* { dg-final { scan-assembler-times {ret} 2 } } * +/* { dg-final { scan-assembler-times {ret} 2 } } */ /* { dg-final { scan-tree-dump-times "Preferring smaller LMUL loop because it has unexpected spills" 1 "vect" } } */ -- 2.36.3
[Committed] RISC-V: Some minor tweaks on dynamic LMUL cost model
Tweak some codes of dynamic LMUL cost model to make computation more predictable and accurate. Tested on both RV32 and RV64 no regression. Committed. PR target/113112 gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (compute_estimated_lmul): Tweak LMUL estimation. (has_unexpected_spills_p): Ditto. (costs::record_potential_unexpected_spills): Ditto. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-1.c: Add more checks. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-2.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-3.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-4.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-5.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-6.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-7.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-1.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-2.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-3.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-4.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-5.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-1.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-2.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-3.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-5.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-6.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-7.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-8.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-1.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-10.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-11.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-2.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-3.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-4.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-5.c: Ditto. 
* gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-6.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-7.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-8.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-9.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-12.c: New test. * gcc.dg/vect/costmodel/riscv/rvv/pr113112-2.c: New test. --- gcc/config/riscv/riscv-vector-costs.cc| 42 +-- .../costmodel/riscv/rvv/dynamic-lmul1-1.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul1-2.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul1-3.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul1-4.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul1-5.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul1-6.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul1-7.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul2-1.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul2-2.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul2-3.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul2-4.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul2-5.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul4-1.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul4-2.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul4-3.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul4-5.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul4-6.c | 5 ++- .../costmodel/riscv/rvv/dynamic-lmul4-7.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul4-8.c | 5 ++- .../costmodel/riscv/rvv/dynamic-lmul8-1.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul8-10.c| 3 ++ .../costmodel/riscv/rvv/dynamic-lmul8-11.c| 3 ++ .../costmodel/riscv/rvv/dynamic-lmul8-12.c| 25 +++ .../costmodel/riscv/rvv/dynamic-lmul8-2.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul8-3.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul8-4.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul8-5.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul8-6.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul8-7.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul8-8.c | 3 ++ .../costmodel/riscv/rvv/dynamic-lmul8-9.c | 3 ++ .../vect/costmodel/riscv/rvv/pr113112-2.c | 20 + 33 files 
changed, 166 insertions(+), 15 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-12.c create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-2.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index 7b837b08f9e..74b8e86a5e1 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -394,21 +394,32 @@ compute_estimated_lmul (loop_vec_info loop_vinfo, machine_mode mode)
[PATCH] RISC-V: Move RVV V_REGS liveness computation into analyze_loop_vinfo
Currently, we compute RVV V_REGS liveness during better_main_loop_than_p which is not appropriate time to do that since we for example, when have the codes will finally pick LMUL = 8 vectorization factor, we compute liveness for LMUL = 8 multiple times which are redundant. Since we have leverage the current ARM SVE COST model: /* Do one-time initialization based on the vinfo. */ loop_vec_info loop_vinfo = dyn_cast (m_vinfo); if (!m_analyzed_vinfo) { if (loop_vinfo) analyze_loop_vinfo (loop_vinfo); m_analyzed_vinfo = true; } Analyze COST model only once for each cost model. So here we move dynamic LMUL liveness information into analyze_loop_vinfo. /* Do one-time initialization of the costs given that we're costing the loop vectorization described by LOOP_VINFO. */ void costs::analyze_loop_vinfo (loop_vec_info loop_vinfo) { ... /* Detect whether the LOOP has unexpected spills. */ record_potential_unexpected_spills (loop_vinfo); } So that we can avoid redundant computations and the current dynamic LMUL cost model flow is much more reasonable and consistent with others. Tested on RV32 and RV64 no regressions. gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (compute_estimated_lmul): Allow fractional vecrtor. (preferred_new_lmul_p): Move RVV V_REGS liveness computation into analyze_loop_vinfo. (has_unexpected_spills_p): New function. (costs::record_potential_unexpected_spills): Ditto. (costs::better_main_loop_than_p): Move RVV V_REGS liveness computation into analyze_loop_vinfo. * config/riscv/riscv-vector-costs.h: New functions and variables. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul-mixed-1.c: Robostify test. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-1.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-2.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-3.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-4.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-5.c: Ditto. 
* gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-6.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul1-7.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-1.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-2.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-3.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-4.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-5.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-6.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-1.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-10.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-2.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-3.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-5.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-6.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-7.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul4-8.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-1.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-10.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-11.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-2.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-3.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-4.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-5.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-6.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-7.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-8.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul8-9.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/no-dynamic-lmul-1.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/pr111848.c: Ditto. * gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c: Ditto. 
--- gcc/config/riscv/riscv-vector-costs.cc| 110 +++--- gcc/config/riscv/riscv-vector-costs.h | 8 ++ .../riscv/rvv/dynamic-lmul-mixed-1.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul1-1.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul1-2.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul1-3.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul1-4.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul1-5.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul1-6.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul1-7.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul2-1.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul2-2.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul2-3.c | 5 +- .../costmodel/riscv/rvv/dynamic-lmul2-4.c | 5 +- .../costmodel/riscv/rvv/dynamic
[Committed] RISC-V: Add one more ASM check in PR113112-1.c
gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c: Add one more ASM check. --- gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c | 1 + 1 file changed, 1 insertion(+) diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c index a44a1c041af..31b41ba707e 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c @@ -20,6 +20,7 @@ foo (int n){ return 0; } +/* { dg-final { scan-assembler {e32,m4} } } */ /* { dg-final { scan-assembler-not {jr} } } */ /* { dg-final { scan-assembler-times {ret} 1 } } */ /* { dg-final { scan-tree-dump "Maximum lmul = 8" "vect" } } */ -- 2.36.3
[Committed] RISC-V: Make PHI initial value occupy live V_REG in dynamic LMUL cost model analysis
Consider this following case: foo: ble a0,zero,.L11 lui a2,%hi(.LANCHOR0) addisp,sp,-128 addia2,a2,%lo(.LANCHOR0) mv a1,a0 vsetvli a6,zero,e32,m8,ta,ma vid.v v8 vs8r.v v8,0(sp) ---> spill .L3: vl8re32.v v16,0(sp)---> reload vsetvli a4,a1,e8,m2,ta,ma li a3,0 vsetvli a5,zero,e32,m8,ta,ma vmv8r.v v0,v16 vmv.v.x v8,a4 vmv.v.i v24,0 vadd.vv v8,v16,v8 vmv8r.v v16,v24 vs8r.v v8,0(sp)---> spill .L4: addiw a3,a3,1 vadd.vv v8,v0,v16 vadd.vi v16,v16,1 vadd.vv v24,v24,v8 bne a0,a3,.L4 vsetvli zero,a4,e32,m8,ta,ma sub a1,a1,a4 vse32.v v24,0(a2) sllia4,a4,2 add a2,a2,a4 bne a1,zero,.L3 li a0,0 addisp,sp,128 jr ra .L11: li a0,0 ret Pick unexpected LMUL = 8. The root cause is we didn't involve PHI initial value in the dynamic LMUL calculation: # j_17 = PHI---> # vect_vec_iv_.8_24 = PHI <_25(9), { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }(5)> We didn't count { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } in consuming vector register but it does allocate an vector register group for it. This patch fixes this missing count. Then after this patch we pick up perfect LMUL (LMUL = M4) foo: ble a0,zero,.L9 lui a4,%hi(.LANCHOR0) addia4,a4,%lo(.LANCHOR0) mv a2,a0 vsetivlizero,16,e32,m4,ta,ma vid.v v20 .L3: vsetvli a3,a2,e8,m1,ta,ma li a5,0 vsetivlizero,16,e32,m4,ta,ma vmv4r.v v16,v20 vmv.v.i v12,0 vmv.v.x v4,a3 vmv4r.v v8,v12 vadd.vv v20,v20,v4 .L4: addiw a5,a5,1 vmv4r.v v4,v8 vadd.vi v8,v8,1 vadd.vv v4,v16,v4 vadd.vv v12,v12,v4 bne a0,a5,.L4 sllia5,a3,2 vsetvli zero,a3,e32,m4,ta,ma sub a2,a2,a3 vse32.v v12,0(a4) add a4,a4,a5 bne a2,zero,.L3 .L9: li a0,0 ret Tested on --with-arch=gcv no regression. PR target/113112 gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (max_number_of_live_regs): Refine dump information. (preferred_new_lmul_p): Make PHI initial value into live regs calculation. gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c: New test. 
--- gcc/config/riscv/riscv-vector-costs.cc| 45 --- .../vect/costmodel/riscv/rvv/pr113112-1.c | 31 + 2 files changed, 71 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index a316603e207..946eb4a9fc6 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -355,10 +355,11 @@ max_number_of_live_regs (const basic_block bb, } if (dump_enabled_p ()) -dump_printf_loc (MSG_NOTE, vect_location, -"Maximum lmul = %d, %d number of live V_REG at program " -"point %d for bb %d\n", -lmul, max_nregs, live_point, bb->index); +dump_printf_loc ( + MSG_NOTE, vect_location, + "Maximum lmul = %d, At most %d number of live V_REG at program " + "point %d for bb %d\n", + lmul, max_nregs, live_point, bb->index); return max_nregs; } @@ -472,6 +473,41 @@ update_local_live_ranges ( tree def = gimple_phi_arg_def (phi, j); auto *live_ranges = live_ranges_per_bb.get (bb); auto *live_range = live_ranges->get (def); + if (poly_int_tree_p (def)) + { + /* Insert live range of INTEGER_CST or POLY_CST since we will +need to allocate a vector register for it. + +E.g. # j_17 = PHI will be transformed +into # vect_vec_iv_.8_24 = PHI <_25(9), { 0, ... }(5)> + +The live range for such value is short which only lives +from program point 0 to 1. */ + if (live_range) + { + unsigned int start = (*live_range).first; + (*live_range).first = 0; + if (dump_enabled_p ()) + dump_printf_loc ( + MSG_NOTE, vect_location, + "Update %T start point from %d to 0:\n", def, start); + } + else + { + live_range
[PATCH] RISC-V: Make PHI initial value occupy live V_REG in dynamic LMUL cost model analysis
Consider this following case: foo: ble a0,zero,.L11 lui a2,%hi(.LANCHOR0) addisp,sp,-128 addia2,a2,%lo(.LANCHOR0) mv a1,a0 vsetvli a6,zero,e32,m8,ta,ma vid.v v8 vs8r.v v8,0(sp) ---> spill .L3: vl8re32.v v16,0(sp)---> reload vsetvli a4,a1,e8,m2,ta,ma li a3,0 vsetvli a5,zero,e32,m8,ta,ma vmv8r.v v0,v16 vmv.v.x v8,a4 vmv.v.i v24,0 vadd.vv v8,v16,v8 vmv8r.v v16,v24 vs8r.v v8,0(sp)---> spill .L4: addiw a3,a3,1 vadd.vv v8,v0,v16 vadd.vi v16,v16,1 vadd.vv v24,v24,v8 bne a0,a3,.L4 vsetvli zero,a4,e32,m8,ta,ma sub a1,a1,a4 vse32.v v24,0(a2) sllia4,a4,2 add a2,a2,a4 bne a1,zero,.L3 li a0,0 addisp,sp,128 jr ra .L11: li a0,0 ret Pick unexpected LMUL = 8. The root cause is we didn't involve PHI initial value in the dynamic LMUL calculation: # j_17 = PHI---> # vect_vec_iv_.8_24 = PHI <_25(9), { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }(5)> We didn't count { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } in consuming vector register but it does allocate an vector register group for it. This patch fixes this missing count. Then after this patch we pick up perfect LMUL (LMUL = M4) foo: ble a0,zero,.L9 lui a4,%hi(.LANCHOR0) addia4,a4,%lo(.LANCHOR0) mv a2,a0 vsetivlizero,16,e32,m4,ta,ma vid.v v20 .L3: vsetvli a3,a2,e8,m1,ta,ma li a5,0 vsetivlizero,16,e32,m4,ta,ma vmv4r.v v16,v20 vmv.v.i v12,0 vmv.v.x v4,a3 vmv4r.v v8,v12 vadd.vv v20,v20,v4 .L4: addiw a5,a5,1 vmv4r.v v4,v8 vadd.vi v8,v8,1 vadd.vv v4,v16,v4 vadd.vv v12,v12,v4 bne a0,a5,.L4 sllia5,a3,2 vsetvli zero,a3,e32,m4,ta,ma sub a2,a2,a3 vse32.v v12,0(a4) add a4,a4,a5 bne a2,zero,.L3 .L9: li a0,0 ret Tested on --with-arch=gcv no regression. Ok for trunk ? PR target/113112 gcc/ChangeLog: * config/riscv/riscv-vector-costs.cc (max_number_of_live_regs): Refine dump information. (preferred_new_lmul_p): Make PHI initial value into live regs calculation. 
gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c: New test. --- gcc/config/riscv/riscv-vector-costs.cc| 45 --- .../vect/costmodel/riscv/rvv/pr113112-1.c | 31 + 2 files changed, 71 insertions(+), 5 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc index a316603e207..2d4b82a643a 100644 --- a/gcc/config/riscv/riscv-vector-costs.cc +++ b/gcc/config/riscv/riscv-vector-costs.cc @@ -355,10 +355,11 @@ max_number_of_live_regs (const basic_block bb, } if (dump_enabled_p ()) -dump_printf_loc (MSG_NOTE, vect_location, -"Maximum lmul = %d, %d number of live V_REG at program " -"point %d for bb %d\n", -lmul, max_nregs, live_point, bb->index); +dump_printf_loc ( + MSG_NOTE, vect_location, + "Maximum lmul = %d, At most %d number of live V_REG at program " + "point %d for bb %d\n", + lmul, max_nregs, live_point, bb->index); return max_nregs; } @@ -472,6 +473,41 @@ update_local_live_ranges ( tree def = gimple_phi_arg_def (phi, j); auto *live_ranges = live_ranges_per_bb.get (bb); auto *live_range = live_ranges->get (def); + if (poly_int_tree_p (def)) + { + /* Insert live range of INTEGER_CST since we will need to +allocate a vector register for it. + +E.g. # j_17 = PHI will be transformed +into # vect_vec_iv_.8_24 = PHI <_25(9), { 0, ... }(5)> + +The live range for such value is short which only lives +at program point 0. */ + if (live_range) + { + unsigned int start = (*live_range).first; + (*live_range).first = 0; + if (dump_enabled_p ()) + dump_printf_loc ( + MSG_NOTE, vect_location, + "Update %T start point from %d to 0:\n", def, start); + } + else + { + live_ranges->p
[Committed] RISC-V: Add dynamic LMUL test for x264
When working on evaluating x264 performance, I notice the best LMUL for such case with -march=rv64gcv is LMUL = 2 LMUL = 1: x264_pixel_8x8: add a4,a1,a2 addia6,a0,16 vsetivlizero,4,e8,mf4,ta,ma add a5,a4,a2 vle8.v v12,0(a6) vle8.v v2,0(a4) addia6,a0,4 addia4,a4,4 vle8.v v11,0(a6) vle8.v v9,0(a4) addia6,a1,4 addia4,a0,32 vle8.v v13,0(a0) vle8.v v1,0(a1) vle8.v v4,0(a6) vle8.v v8,0(a4) vle8.v v7,0(a5) vwsubu.vv v3,v13,v1 add a3,a5,a2 addia6,a0,20 addia4,a0,36 vle8.v v10,0(a6) vle8.v v6,0(a4) addia5,a5,4 vle8.v v5,0(a5) vsetvli zero,zero,e16,mf2,ta,mu vmslt.viv0,v3,0 vneg.v v3,v3,v0.t vsetvli zero,zero,e8,mf4,ta,ma vwsubu.vv v1,v12,v2 vsetvli zero,zero,e16,mf2,ta,mu vmslt.viv0,v1,0 vneg.v v1,v1,v0.t vmv1r.v v2,v1 vwadd.vvv1,v3,v2 vsetvli zero,zero,e8,mf4,ta,ma vwsubu.vv v2,v11,v4 vsetvli zero,zero,e16,mf2,ta,mu vmslt.viv0,v2,0 vneg.v v2,v2,v0.t vsetvli zero,zero,e8,mf4,ta,ma vwsubu.vv v3,v10,v9 vsetvli zero,zero,e16,mf2,ta,mu vmv1r.v v4,v2 vmslt.viv0,v3,0 vneg.v v3,v3,v0.t vwadd.vvv2,v4,v3 vsetvli zero,zero,e8,mf4,ta,ma vwsubu.vv v3,v8,v7 vsetvli zero,zero,e16,mf2,ta,mu add a4,a3,a2 vmslt.viv0,v3,0 vneg.v v3,v3,v0.t vwadd.wvv1,v1,v3 vsetvli zero,zero,e8,mf4,ta,ma add a5,a4,a2 vwsubu.vv v3,v6,v5 addia6,a0,48 vsetvli zero,zero,e16,mf2,ta,mu vle8.v v16,0(a3) vle8.v v12,0(a4) addia3,a3,4 addia4,a4,4 vle8.v v17,0(a6) vle8.v v14,0(a3) vle8.v v10,0(a4) vle8.v v8,0(a5) add a6,a5,a2 addia3,a0,64 addia4,a0,80 addia5,a5,4 vle8.v v13,0(a3) vle8.v v4,0(a5) vle8.v v9,0(a4) vle8.v v6,0(a6) vmslt.viv0,v3,0 addia7,a0,52 vneg.v v3,v3,v0.t vle8.v v15,0(a7) vwadd.wvv2,v2,v3 addia3,a0,68 addia4,a0,84 vle8.v v11,0(a3) vle8.v v5,0(a4) addia5,a0,96 vle8.v v7,0(a5) vsetvli zero,zero,e8,mf4,ta,ma vwsubu.vv v3,v17,v16 vsetvli zero,zero,e16,mf2,ta,mu vmslt.viv0,v3,0 vneg.v v3,v3,v0.t vwadd.wvv1,v1,v3 vsetvli zero,zero,e8,mf4,ta,ma vwsubu.vv v3,v15,v14 vsetvli zero,zero,e16,mf2,ta,mu vmslt.viv0,v3,0 vneg.v v3,v3,v0.t vwadd.wvv2,v2,v3 vsetvli zero,zero,e8,mf4,ta,ma vwsubu.vv v3,v13,v12 vsetvli 
zero,zero,e16,mf2,ta,mu sllia4,a2,3 vmslt.viv0,v3,0 vneg.v v3,v3,v0.t vwadd.wvv1,v1,v3 vsetvli zero,zero,e8,mf4,ta,ma sub a4,a4,a2 vwsubu.vv v3,v11,v10 vsetvli zero,zero,e16,mf2,ta,mu add a1,a1,a4 vmslt.viv0,v3,0 vneg.v v3,v3,v0.t vwadd.wvv2,v2,v3 vsetvli zero,zero,e8,mf4,ta,ma lbu a7,0(a1) vwsubu.vv v3,v9,v8 lbu a5,112(a0) vsetvli zero,zero,e16,mf2,ta,mu subwa5,a5,a7 vmslt.viv0,v3,0 lbu a3,113(a0) vneg.v v3,v3,v0.t lbu a4,1(a1) vwadd.wvv1,v1,v3 addia6,a6,4 vsetvli zero,zero,e8,mf4,ta,ma subwa3,a3,a4 vwsubu.vv v3,v5,v4 addia2,a0,100 vsetvli zero,zero,e16,mf2,ta,mu vle8.v v4,0(a6) sraiw a6,a5,31 vle8.v v5,0(a2) sraiw a7,a3,31 vmslt.viv0,v3,0 xor a2,a5,a6 vneg.v v3,v3,v0.t vwadd.wvv2,v2,v3 vsetvli zero,zero,e8,mf4,ta,ma lbu a4,114(a0) vwsubu.vv v3,v7,v6 lbu t1,2(a1) vsetvli zero,zero,e16,mf2,ta,mu subwa2,a2,a6 xor a6,a3,a7 vmslt.viv0,v3,0 subwa4,a4,t1 vneg.v v3,v3,v0.t lbu t1,3(a1) vwadd.wvv1,v1,v3 lbu a5,115(a0) subwa6,a6,a7 vsetvli zero,zero,e8,mf4,ta,ma li a7,0 vwsubu.vv v3,v5,v4 sraiw t3,a4,31 vsetvli zero,zero,e16,mf2,ta,mu subwa5,a5,t1 vmslt.viv0,v3,0 vneg.v v3,v3,v0.t vwadd.wvv2,v2,v3 sraiw t1,a5,31 vsetvli zero,zero,e32,m1,ta,ma
[Committed] RISC-V: Fix ICE when moving a SUBREG of vector mode to a DImode scalar register on RV32 systems.
This patch fixes following ICE on full coverage testing of RV32. Running target riscv-sim/-march=rv32gc_zve32f/-mabi=ilp32d/-mcmodel=medlow/--param=riscv-autovec-lmul=dynamic FAIL: gcc.c-torture/compile/930120-1.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/930120-1.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/930120-1.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/pr42196-1.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/pr42196-1.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -O1 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -Os (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-3.c -O1 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-3.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-3.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-3.c -Os (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/bswap-1.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/bswap-1.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) Running target 
riscv-sim/-march=rv32gc_zve32f/-mabi=ilp32d/-mcmodel=medlow/--param=riscv-autovec-lmul=dynamic/--param=riscv-autovec-preference=fixed-vlmax FAIL: gcc.c-torture/compile/930120-1.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/930120-1.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/930120-1.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/pr42196-1.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/pr42196-1.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -O1 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -Os (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-3.c -O1 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-3.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-3.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-3.c -Os (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/bswap-1.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/bswap-1.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) Running target riscv-sim/-march=rv32gc_zve32f/-mabi=ilp32d/-mcmodel=medlow/--param=riscv-autovec-lmul=m2 FAIL: gcc.c-torture/compile/930120-1.c -O2 (internal compiler error: in 
emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/930120-1.c -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/930120-1.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/pr42196-1.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/compile/pr42196-1.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -O1 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -O2 (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -O3 -g (internal compiler error: in emit_move_insn, at expr.cc:4606) FAIL: gcc.c-torture/execute/20050316-2.c -Os (internal compiler error: in emit_move_insn, at expr.cc:
[PATCH] RISC-V: Optimize SELECT_VL codegen when the length is known to be smaller than VF
While trying to fix bugs of PR113097, notice this following situation we generate redundant vsetvli _255 = SELECT_VL (3, POLY_INT_CST [4, 4]); COND_LEN (..., _255) Before this patch: vsetivli a5, 3... ... vadd.vv (use a5) After this patch: ... vadd.vv (use AVL = 3) The reason we can do this is because known_ge (3, [4,4]) is true. It's safe to apply such optimization Tested on both RV32 and RV64 full coverage testing, no regression. PR target/113087 gcc/ChangeLog: * config/riscv/riscv-v.cc (expand_select_vl): Optimize SELECT_VL. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/pr113087-2.c: New test. --- gcc/config/riscv/riscv-v.cc | 10 +++ .../gcc.target/riscv/rvv/autovec/pr113087-2.c | 61 +++ 2 files changed, 71 insertions(+) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113087-2.c diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index 486f5deb296..fc9825f168a 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -3671,6 +3671,16 @@ void expand_select_vl (rtx *ops) { poly_int64 nunits = rtx_to_poly_int64 (ops[2]); + if (CONST_INT_P (ops[1]) && known_le (INTVAL (ops[1]), nunits)) +{ + /* If length is known <= VF, we just use the length directly instead +of using vsetvli. + +E.g. _255 = .SELECT_VL (3, POLY_INT_CST [4, 4]); +We move 3 into _255 intead of using explicit vsetvl. */ + emit_move_insn (ops[0], ops[1]); + return; +} /* We arbitrary picked QImode as inner scalar mode to get vector mode. since vsetvl only demand ratio. We let VSETVL PASS to optimize it. 
*/ scalar_int_mode mode = QImode; diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113087-2.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113087-2.c new file mode 100644 index 000..836260fe911 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113087-2.c @@ -0,0 +1,61 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */ + +#include +int (e) (int g, int h) { return h > 0x10 || g > 0x >> h ? g : g << h; } +struct i +{ + int j; + int l : 1; +}; +struct m +{ + char k; + int n; +}; +char o; +char p; +short s; +int q; +struct m r; +int v; +int t; +short z; +long ac; +int ad; +int ae; + +static void +ai (struct i bf) +{ + for (; v; v++) +r.k = 0; + do +ac ^= bf.j; + while (bf.j < 0); + s = 0; + if (bf.l) +q |= 0x800; +} + +int +main () +{ + struct i aw = {0xE00, 1}; + o = 4; + s = p; + ai (aw); + t = 1; + ++p; + for (; t <= 7; t++) +{ + ad &= 1; + (o &= 1 - e (0x4012, ++ae)) & (z |= 1); +} + for (; r.n;) +; + assert (o == 4); + return 0; +} + +/* { dg-final { scan-assembler-not {vsetivli\s+[a-x0-9]+,\s*3} } } */ -- 2.36.3
[PATCH] RISC-V: Fix bug of VSETVL fusion
This patch fixes bugs in the fusion of this following case: li a5,-1 vmv.s.x v0,a5 -> demand any non-zero AVL vsetvli a5, ... Incorrect fusion after VSETVL PASS: li a5,-1 vsetvli a5... vmv.s.x v0, a5 --> a5 is modified as incorrect value. We disallow this incorrect fusion above. Full coverage testing of RV64 and RV32 no regression. PR target/113087 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc: Disallow fusion when VL modification pollutes non AVL use. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/autovec/pr113087-1.c: New test. --- gcc/config/riscv/riscv-vsetvl.cc | 41 - .../gcc.target/riscv/rvv/autovec/pr113087-1.c | 60 +++ 2 files changed, 99 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113087-1.c diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 6af8d8429ab..eabaef80f89 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -1128,6 +1128,27 @@ public: return gen_vsetvl_discard_result (Pmode, avl, sew, vlmul, ta, ma); } + /* Return true that the non-AVL operands of THIS will be modified + if we fuse the VL modification from OTHER into THIS. */ + bool vl_modify_non_avl_op_p (const vsetvl_info &other) const + { +/* We don't need to worry about any operands from THIS be + modified by OTHER vsetvl since we OTHER vsetvl doesn't + modify any operand. */ +if (!other.has_vl ()) + return false; + +/* THIS VL operand always preempt OTHER VL operand. */ +if (this->has_vl ()) + return false; + +/* If THIS has non IMM AVL and THIS is AVL compatible with + OTHER, the AVL value of THIS is same as VL value of OTHER. 
*/ +if (!this->has_imm_avl ()) + return false; +return find_access (this->get_insn ()->uses (), REGNO (other.get_vl ())); + } + bool operator== (const vsetvl_info &other) const { gcc_assert (!uninit_p () && !other.uninit_p () @@ -1896,6 +1917,20 @@ public: gcc_unreachable (); } + bool vl_not_in_conflict_p (const vsetvl_info &prev, const vsetvl_info &next) + { +/* We don't fuse this following case: + + li a5, -1 + vmv.s.x v0, a5 -- PREV + vsetvli a5, ...-- NEXT + + Don't fuse NEXT into PREV. +*/ +return !prev.vl_modify_non_avl_op_p (next) + && !next.vl_modify_non_avl_op_p (prev); + } + bool avl_compatible_p (const vsetvl_info &prev, const vsetvl_info &next) { gcc_assert (prev.valid_p () && next.valid_p ()); @@ -1953,7 +1988,8 @@ public: { bool compatible_p = sew_lmul_compatible_p (prev, next) && policy_compatible_p (prev, next) - && avl_compatible_p (prev, next); + && avl_compatible_p (prev, next) + && vl_not_in_conflict_p (prev, next); return compatible_p; } @@ -1961,7 +1997,8 @@ public: { bool available_p = sew_lmul_available_p (prev, next) && policy_available_p (prev, next) - && avl_available_p (prev, next); + && avl_available_p (prev, next) + && vl_not_in_conflict_p (prev, next); gcc_assert (!available_p || compatible_p (prev, next)); return available_p; } diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113087-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113087-1.c new file mode 100644 index 000..7b743effc79 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr113087-1.c @@ -0,0 +1,60 @@ +/* { dg-do run } */ +/* { dg-options "-O3" } */ +/* { dg-require-effective-target riscv_v } */ + +#include +int (e) (int g, int h) { return h > 0x10 || g > 0x >> h ? 
g : g << h; } +struct i +{ + int j; + int l : 1; +}; +struct m +{ + char k; + int n; +}; +char o; +char p; +short s; +int q; +struct m r; +int v; +int t; +short z; +long ac; +int ad; +int ae; + +static void +ai (struct i bf) +{ + for (; v; v++) +r.k = 0; + do +ac ^= bf.j; + while (bf.j < 0); + s = 0; + if (bf.l) +q |= 0x800; +} + +int +main () +{ + struct i aw = {0xE00, 1}; + o = 4; + s = p; + ai (aw); + t = 1; + ++p; + for (; t <= 7; t++) +{ + ad &= 1; + (o &= 1 - e (0x4012, ++ae)) & (z |= 1); +} + for (; r.n;) +; + assert (o == 4); + return 0; +} -- 2.36.3
[PATCH] RISC-V: Fix FAIL of bb-slp-cond-1.c for RVV
Due to recent VLSmode changes (the change fixing an ICE and a run-time FAIL), the dump check is now the same as for ARM SVE, so adapt the test for RISC-V. gcc/testsuite/ChangeLog: * gcc.dg/vect/bb-slp-cond-1.c: Adapt for RISC-V. --- gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c b/gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c index 4089eb51b2e..8faf6b6e3ac 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-cond-1.c @@ -47,6 +47,6 @@ int main () } /* { dg-final { scan-tree-dump {(no need for alias check [^\n]* when VF is 1|no alias between [^\n]* when [^\n]* is outside \(-16, 16\))} "vect" { target vect_element_align } } } */ -/* { dg-final { scan-tree-dump-times "loop vectorized" 1 "vect" { target { vect_element_align && { ! { amdgcn-*-* riscv*-*-* } } } } } } */ -/* { dg-final { scan-tree-dump-times "loop vectorized" 2 "vect" { target { amdgcn-*-* riscv*-*-* } } } } */ +/* { dg-final { scan-tree-dump-times "loop vectorized" 1 "vect" { target { vect_element_align && { ! { amdgcn-*-* } } } } } } */ +/* { dg-final { scan-tree-dump-times "loop vectorized" 2 "vect" { target { amdgcn-*-* } } } } */ -- 2.36.3
[PATCH] Regression FIX: Remove vect_variable_length XFAIL from some tests
Hi, this patch fixes these following regression FAILs on RVV: XPASS: gcc.dg/tree-ssa/pr84512.c scan-tree-dump optimized "return 285;" XPASS: gcc.dg/vect/bb-slp-43.c -flto -ffat-lto-objects scan-tree-dump-not slp2 "vector operands from scalars" XPASS: gcc.dg/vect/bb-slp-43.c scan-tree-dump-not slp2 "vector operands from scalars" XPASS: gcc.dg/vect/bb-slp-subgroups-3.c -flto -ffat-lto-objects scan-tree-dump-times slp2 "optimized: basic block" 2 XPASS: gcc.dg/vect/bb-slp-subgroups-3.c scan-tree-dump-times slp2 "optimized: basic block" 2 Since vect_variable_length are available for ARM SVE and RVV, I just use compiler explorer to confirm ARM SVE same as RVV. Hi, @Tamar. Could you double check whether this patch fix is reasonable to you ? And. Hi, @Richard. Is this patch Ok for trunk if this patch fixes regression for both RVV and ARM SVE. gcc/testsuite/ChangeLog: * gcc.dg/tree-ssa/pr84512.c: Remove vect_variable_length XFAIL. * gcc.dg/vect/bb-slp-43.c: Ditto. * gcc.dg/vect/bb-slp-subgroups-3.c: Ditto. --- gcc/testsuite/gcc.dg/tree-ssa/pr84512.c| 2 +- gcc/testsuite/gcc.dg/vect/bb-slp-43.c | 2 +- gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr84512.c b/gcc/testsuite/gcc.dg/tree-ssa/pr84512.c index 496c78b28dc..3c027012670 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/pr84512.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr84512.c @@ -13,4 +13,4 @@ int foo() } /* Listed targets xfailed due to PR84958. 
*/ -/* { dg-final { scan-tree-dump "return 285;" "optimized" { xfail { amdgcn*-*-* || vect_variable_length } } } } */ +/* { dg-final { scan-tree-dump "return 285;" "optimized" { xfail { amdgcn*-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-43.c b/gcc/testsuite/gcc.dg/vect/bb-slp-43.c index dad2d24262d..40bd2e0dfbf 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-43.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-43.c @@ -14,4 +14,4 @@ f (int *restrict x, short *restrict y) } /* { dg-final { scan-tree-dump-not "mixed mask and nonmask" "slp2" } } */ -/* { dg-final { scan-tree-dump-not "vector operands from scalars" "slp2" { target { { vect_int && vect_bool_cmp } && { vect_unpack && vect_hw_misalign } } xfail { vect_variable_length && { ! vect256 } } } } } */ +/* { dg-final { scan-tree-dump-not "vector operands from scalars" "slp2" { target { { vect_int && vect_bool_cmp } && { vect_unpack && vect_hw_misalign } } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c index fb719915db7..3f0d45ce4a1 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-subgroups-3.c @@ -42,7 +42,7 @@ main (int argc, char **argv) /* Because we disable the cost model, targets with variable-length vectors can end up vectorizing the store to a[0..7] on its own. With the cost model we do something sensible. */ -/* { dg-final { scan-tree-dump-times "optimized: basic block" 2 "slp2" { target { ! amdgcn-*-* } xfail vect_variable_length } } } */ +/* { dg-final { scan-tree-dump-times "optimized: basic block" 2 "slp2" { target { ! amdgcn-*-* } } } } */ /* amdgcn can do this in one vector. */ /* { dg-final { scan-tree-dump-times "optimized: basic block" 1 "slp2" { target amdgcn-*-* } } } */ -- 2.36.3
[Committed] RISC-V: Refine some codes of expand_const_vector [NFC]
gcc/ChangeLog: * config/riscv/riscv-v.cc (expand_const_vector): Use builder.inner_mode (). --- gcc/config/riscv/riscv-v.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc index d1eb7a0a9a5..486f5deb296 100644 --- a/gcc/config/riscv/riscv-v.cc +++ b/gcc/config/riscv/riscv-v.cc @@ -1380,15 +1380,15 @@ expand_const_vector (rtx target, rtx src) rtx base1 = builder.elt (1); rtx base2 = builder.elt (2); - scalar_mode elem_mode = GET_MODE_INNER (mode); - rtx step = simplify_binary_operation (MINUS, elem_mode, base2, base1); + rtx step = simplify_binary_operation (MINUS, builder.inner_mode (), + base2, base1); /* Step 1 - { base1, base1 + step, base1 + step * 2, ... } */ rtx tmp = gen_reg_rtx (mode); expand_vec_series (tmp, base1, step); /* Step 2 - { base0, base1, base1 + step, base1 + step * 2, ... } */ if (!rtx_equal_p (base0, const0_rtx)) - base0 = force_reg (elem_mode, base0); + base0 = force_reg (builder.inner_mode (), base0); insn_code icode = optab_handler (vec_shl_insert_optab, mode); gcc_assert (icode != CODE_FOR_nothing); -- 2.36.3
[Committed] RISC-V: Fix FAIL of dynamic-lmul2-7.c
Fix this FAIL: FAIL: gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c scan-tree-dump-times vect "Maximum lmul = 2" 1 gcc/testsuite/ChangeLog: * gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c: Adapt test. --- gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c index 0e6d5fe5f62..636332dbb62 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c @@ -22,5 +22,5 @@ bar (int *x, int a, int b, int n) /* { dg-final { scan-assembler-times {ret} 2 } } */ /* { dg-final { scan-tree-dump-times "Maximum lmul = 8" 1 "vect" } } */ /* { dg-final { scan-tree-dump-times "Maximum lmul = 4" 1 "vect" } } */ -/* { dg-final { scan-tree-dump-times "Maximum lmul = 2" 1 "vect" } } */ +/* { dg-final { scan-tree-dump "Maximum lmul = 2" "vect" } } */ /* { dg-final { scan-tree-dump-not "Maximum lmul = 1" "vect" } } */ -- 2.36.3
[Committed] RISC-V: Remove 256/512/1024 VLS vectors
Since https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=2e7abd09621a4401d44f4513adf126bce4b4828b we only allow VLSmodes with size <= TARGET_MIN_VLEN * TARGET_MAX_LMUL. So when -march=rv64gcv default LMUL = 1, we don't have VLS modes of 256/512/1024 vectors. Disable them in vect test which fixes the following FAILs in the regression: FAIL: gcc.dg/vect/bb-slp-pr65935.c -flto -ffat-lto-objects scan-tree-dump-times slp1 "optimized: basic block" 11 FAIL: gcc.dg/vect/bb-slp-pr65935.c scan-tree-dump-times slp1 "optimized: basic block" 11 FAIL: gcc.dg/vect/bb-slp-subgroups-2.c -flto -ffat-lto-objects scan-tree-dump-times slp2 "optimized: basic block" 1 FAIL: gcc.dg/vect/bb-slp-subgroups-2.c scan-tree-dump-times slp2 "optimized: basic block" 1 gcc/testsuite/ChangeLog: * lib/target-supports.exp: Remove 256/512/1024 vectors. --- gcc/testsuite/lib/target-supports.exp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 370df10978d..7f13ff0ca56 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -9033,7 +9033,7 @@ proc available_vector_sizes { } { lappend result 4096 2048 1024 512 256 128 64 32 16 8 4 2 } elseif { [istarget riscv*-*-*] } { if { [check_effective_target_riscv_v] } { - lappend result 0 32 64 128 256 512 1024 + lappend result 0 32 64 128 } lappend result 128 } else { -- 2.36.3