[PATCH 1/2] doloop: Add support for predicated vectorized loops

2024-05-23 Thread Andre Vieira

This patch adds support in the target-agnostic doloop pass for the detection of
predicated vectorized hardware loops.  Arm is currently the only target that
will make use of this feature.

gcc/ChangeLog:

* df-core.cc (df_bb_regno_only_def_find): New helper function.
* df.h (df_bb_regno_only_def_find): Declare new function.
* loop-doloop.cc (doloop_condition_get): Add support for detecting
predicated vectorized hardware loops.
(doloop_modify): Add support for GTU condition checks.
(doloop_optimize): Update costing computation to support alterations to
desc->niter_expr by the backend.

Co-authored-by: Stam Markianos-Wright 
---
 gcc/df-core.cc |  15 +
 gcc/df.h   |   1 +
 gcc/loop-doloop.cc | 164 +++--
 3 files changed, 113 insertions(+), 67 deletions(-)

diff --git a/gcc/df-core.cc b/gcc/df-core.cc
index f0eb4c93957..b0e8a88d433 100644
--- a/gcc/df-core.cc
+++ b/gcc/df-core.cc
@@ -1964,6 +1964,21 @@ df_bb_regno_last_def_find (basic_block bb, unsigned int regno)
   return NULL;
 }
 
+/* Return the one and only def of REGNO within BB.  If there is no def or
+   there are multiple defs, return NULL.  */
+
+df_ref
+df_bb_regno_only_def_find (basic_block bb, unsigned int regno)
+{
+  df_ref temp = df_bb_regno_first_def_find (bb, regno);
+  if (!temp)
+return NULL;
+  else if (temp == df_bb_regno_last_def_find (bb, regno))
+return temp;
+  else
+return NULL;
+}
+
 /* Finds the reference corresponding to the definition of REG in INSN.
DF is the dataflow object.  */
 
diff --git a/gcc/df.h b/gcc/df.h
index 84e5aa8b524..c4e690b40cf 100644
--- a/gcc/df.h
+++ b/gcc/df.h
@@ -987,6 +987,7 @@ extern void df_check_cfg_clean (void);
 #endif
 extern df_ref df_bb_regno_first_def_find (basic_block, unsigned int);
 extern df_ref df_bb_regno_last_def_find (basic_block, unsigned int);
+extern df_ref df_bb_regno_only_def_find (basic_block, unsigned int);
 extern df_ref df_find_def (rtx_insn *, rtx);
 extern bool df_reg_defined (rtx_insn *, rtx);
 extern df_ref df_find_use (rtx_insn *, rtx);
diff --git a/gcc/loop-doloop.cc b/gcc/loop-doloop.cc
index 529e810e530..8953e1de960 100644
--- a/gcc/loop-doloop.cc
+++ b/gcc/loop-doloop.cc
@@ -85,10 +85,10 @@ doloop_condition_get (rtx_insn *doloop_pat)
  forms:
 
  1)  (parallel [(set (pc) (if_then_else (condition)
-	  			(label_ref (label))
-(pc)))
-	 (set (reg) (plus (reg) (const_int -1)))
-	 (additional clobbers and uses)])
+	(label_ref (label))
+	(pc)))
+		 (set (reg) (plus (reg) (const_int -1)))
+		 (additional clobbers and uses)])
 
  The branch must be the first entry of the parallel (also required
  by jump.cc), and the second entry of the parallel must be a set of
@@ -96,19 +96,33 @@ doloop_condition_get (rtx_insn *doloop_pat)
  the loop counter in an if_then_else too.
 
  2)  (set (reg) (plus (reg) (const_int -1))
- (set (pc) (if_then_else (reg != 0)
-	 (label_ref (label))
-			 (pc))).  
+	 (set (pc) (if_then_else (reg != 0)
+ (label_ref (label))
+ (pc))).
 
- Some targets (ARM) do the comparison before the branch, as in the
+ 3) Some targets (Arm) do the comparison before the branch, as in the
  following form:
 
- 3) (parallel [(set (cc) (compare ((plus (reg) (const_int -1), 0)))
-   (set (reg) (plus (reg) (const_int -1)))])
-(set (pc) (if_then_else (cc == NE)
-(label_ref (label))
-(pc))) */
-
+ (parallel [(set (cc) (compare (plus (reg) (const_int -1)) 0))
+		(set (reg) (plus (reg) (const_int -1)))])
+ (set (pc) (if_then_else (cc == NE)
+			 (label_ref (label))
+			 (pc)))
+
+  4) This form supports a construct that is used to represent a vectorized
+  do loop with predication; however, we do not need to care about the
+  details of the predication here.
+  Arm uses this construct to support MVE tail predication.
+
+  (parallel
+   [(set (pc)
+	 (if_then_else (gtu (plus (reg) (const_int -n))
+(const_int n-1))
+			   (label_ref)
+			   (pc)))
+	(set (reg) (plus (reg) (const_int -n)))
+	(additional clobbers and uses)])
+ */
   pattern = PATTERN (doloop_pat);
 
   if (GET_CODE (pattern) != PARALLEL)
@@ -173,15 +187,17 @@ doloop_condition_get (rtx_insn *doloop_pat)
   if (! REG_P (reg))
 return 0;
 
-  /* Check if something = (plus (reg) (const_int -1)).
+  /* Check if something = (plus (reg) (const_int -n)).
  On IA-64, this decrement is wrapped in an if_then_else.  */
   inc_src = SET_SRC (inc);
   if (GET_CODE (inc_src) == IF_THEN_ELSE)
 inc_src = XEXP (inc_src, 1);
   if (GET_CODE (inc_src) != PLUS
-  || XEXP (inc_src, 0) != reg
-  || XEXP (inc_src, 1) != constm1_rtx)
+  || !rtx_equal_p (XEXP (inc_src, 0), reg)
+  || 

[PATCH 1/2] doloop: Add support for predicated vectorized loops

2024-05-23 Thread Andre Vieira

This patch adds support in the target-agnostic doloop pass for the detection of
predicated vectorized hardware loops.  Arm is currently the only target that
will make use of this feature.

gcc/ChangeLog:

* df-core.cc (df_bb_regno_only_def_find): New helper function.
* df.h (df_bb_regno_only_def_find): Declare new function.
* loop-doloop.cc (doloop_condition_get): Add support for detecting
predicated vectorized hardware loops.
(doloop_modify): Add support for GTU condition checks.
(doloop_optimize): Update costing computation to support alterations to
desc->niter_expr by the backend.

Co-authored-by: Stam Markianos-Wright 
---
 gcc/df-core.cc |  15 +
 gcc/df.h   |   1 +
 gcc/loop-doloop.cc | 164 +++--
 3 files changed, 113 insertions(+), 67 deletions(-)

diff --git a/gcc/df-core.cc b/gcc/df-core.cc
index f0eb4c93957..b0e8a88d433 100644
--- a/gcc/df-core.cc
+++ b/gcc/df-core.cc
@@ -1964,6 +1964,21 @@ df_bb_regno_last_def_find (basic_block bb, unsigned int regno)
   return NULL;
 }
 
+/* Return the one and only def of REGNO within BB.  If there is no def or
+   there are multiple defs, return NULL.  */
+
+df_ref
+df_bb_regno_only_def_find (basic_block bb, unsigned int regno)
+{
+  df_ref temp = df_bb_regno_first_def_find (bb, regno);
+  if (!temp)
+return NULL;
+  else if (temp == df_bb_regno_last_def_find (bb, regno))
+return temp;
+  else
+return NULL;
+}
+
 /* Finds the reference corresponding to the definition of REG in INSN.
DF is the dataflow object.  */
 
diff --git a/gcc/df.h b/gcc/df.h
index 84e5aa8b524..c4e690b40cf 100644
--- a/gcc/df.h
+++ b/gcc/df.h
@@ -987,6 +987,7 @@ extern void df_check_cfg_clean (void);
 #endif
 extern df_ref df_bb_regno_first_def_find (basic_block, unsigned int);
 extern df_ref df_bb_regno_last_def_find (basic_block, unsigned int);
+extern df_ref df_bb_regno_only_def_find (basic_block, unsigned int);
 extern df_ref df_find_def (rtx_insn *, rtx);
 extern bool df_reg_defined (rtx_insn *, rtx);
 extern df_ref df_find_use (rtx_insn *, rtx);
diff --git a/gcc/loop-doloop.cc b/gcc/loop-doloop.cc
index 529e810e530..8953e1de960 100644
--- a/gcc/loop-doloop.cc
+++ b/gcc/loop-doloop.cc
@@ -85,10 +85,10 @@ doloop_condition_get (rtx_insn *doloop_pat)
  forms:
 
  1)  (parallel [(set (pc) (if_then_else (condition)
-	  			(label_ref (label))
-(pc)))
-	 (set (reg) (plus (reg) (const_int -1)))
-	 (additional clobbers and uses)])
+	(label_ref (label))
+	(pc)))
+		 (set (reg) (plus (reg) (const_int -1)))
+		 (additional clobbers and uses)])
 
  The branch must be the first entry of the parallel (also required
  by jump.cc), and the second entry of the parallel must be a set of
@@ -96,19 +96,33 @@ doloop_condition_get (rtx_insn *doloop_pat)
  the loop counter in an if_then_else too.
 
  2)  (set (reg) (plus (reg) (const_int -1))
- (set (pc) (if_then_else (reg != 0)
-	 (label_ref (label))
-			 (pc))).  
+	 (set (pc) (if_then_else (reg != 0)
+ (label_ref (label))
+ (pc))).
 
- Some targets (ARM) do the comparison before the branch, as in the
+ 3) Some targets (Arm) do the comparison before the branch, as in the
  following form:
 
- 3) (parallel [(set (cc) (compare ((plus (reg) (const_int -1), 0)))
-   (set (reg) (plus (reg) (const_int -1)))])
-(set (pc) (if_then_else (cc == NE)
-(label_ref (label))
-(pc))) */
-
+ (parallel [(set (cc) (compare (plus (reg) (const_int -1)) 0))
+		(set (reg) (plus (reg) (const_int -1)))])
+ (set (pc) (if_then_else (cc == NE)
+			 (label_ref (label))
+			 (pc)))
+
+  4) This form supports a construct that is used to represent a vectorized
+  do loop with predication; however, we do not need to care about the
+  details of the predication here.
+  Arm uses this construct to support MVE tail predication.
+
+  (parallel
+   [(set (pc)
+	 (if_then_else (gtu (plus (reg) (const_int -n))
+(const_int n-1))
+			   (label_ref)
+			   (pc)))
+	(set (reg) (plus (reg) (const_int -n)))
+	(additional clobbers and uses)])
+ */
   pattern = PATTERN (doloop_pat);
 
   if (GET_CODE (pattern) != PARALLEL)
@@ -173,15 +187,17 @@ doloop_condition_get (rtx_insn *doloop_pat)
   if (! REG_P (reg))
 return 0;
 
-  /* Check if something = (plus (reg) (const_int -1)).
+  /* Check if something = (plus (reg) (const_int -n)).
  On IA-64, this decrement is wrapped in an if_then_else.  */
   inc_src = SET_SRC (inc);
   if (GET_CODE (inc_src) == IF_THEN_ELSE)
 inc_src = XEXP (inc_src, 1);
   if (GET_CODE (inc_src) != PLUS
-  || XEXP (inc_src, 0) != reg
-  || XEXP (inc_src, 1) != constm1_rtx)
+  || !rtx_equal_p (XEXP (inc_src, 0), reg)
+  ||