Re: [Patch] PR rtl-optimization/87763 - generate more bfi instructions on aarch64

Steve Ellcey Mon, 01 Apr 2019 10:23:33 -0700

This is a ping**3 for a patch to fix one of the test failures in PR 877763.
It fixes the gcc.target/aarch64/combine_bfi_1.c failure, but not the other
ones.


Could one of the Aarch64 maintainers take a look at it?  This version of
the patch was originally submitted on February 11 after incorporating some
changes suggested by Wilco Dijkstra but I have not gotten any comments
since then despite pinging it.  I updated it to apply cleanly on ToT but
did not make any other changes since the February 11th version.

If we want to encourage people to fix bugs in the run up to a release it
would help to get feedback on bugfix patches.

Steve Ellcey
sell...@marvell.com


2018-04-01  Steve Ellcey  <sell...@marvell.com>

        PR rtl-optimization/87763
        * config/aarch64/aarch64-protos.h (aarch64_masks_and_shift_for_bfi_p):
        New prototype.
        * config/aarch64/aarch64.c (aarch64_masks_and_shift_for_bfi_p):
        New function.
        * config/aarch64/aarch64.md (*aarch64_bfi<GPI:mode>5_shift):
        New instruction.
        (*aarch64_bfi<GPI:mode>4_noand): Ditto.
        (*aarch64_bfi<GPI:mode>4_noshift): Ditto.
        (*aarch64_bfi<GPI:mode>4_noshift_alt): Ditto.


2018-04-01  Steve Ellcey  <sell...@marvell.com>

        PR rtl-optimization/87763
        * gcc.target/aarch64/combine_bfxil.c: Change some bfxil checks
        to bfi.
        * gcc.target/aarch64/combine_bfi_2.c: New test.

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index b035e35..b6c0d0a 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -429,6 +429,9 @@ bool aarch64_label_mentioned_p (rtx);
 void aarch64_declare_function_name (FILE *, const char*, tree);
 bool aarch64_legitimate_pic_operand_p (rtx);
 bool aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode, rtx, rtx);
+bool aarch64_masks_and_shift_for_bfi_p (scalar_int_mode, unsigned HOST_WIDE_INT,
+					unsigned HOST_WIDE_INT,
+					unsigned HOST_WIDE_INT);
 bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx);
 bool aarch64_move_imm (HOST_WIDE_INT, machine_mode);
 opt_machine_mode aarch64_sve_pred_mode (unsigned int);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b38505b..3017e99 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -9336,6 +9336,35 @@ aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
 	     & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
 }
 
+/* Return true if the masks and a shift amount from an RTX of the form
+   ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
+   a BFI instruction of mode MODE.  See *arch64_bfi patterns.  */
+
+bool
+aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
+				   unsigned HOST_WIDE_INT mask1,
+				   unsigned HOST_WIDE_INT shft_amnt,
+				   unsigned HOST_WIDE_INT mask2)
+{
+  unsigned HOST_WIDE_INT t;
+
+  /* Verify that there is no overlap in what bits are set in the two masks.  */
+  if (mask1 != ~mask2)
+    return false;
+
+  /* Verify that mask2 is not all zeros or ones.  */
+  if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
+    return false;
+
+  /* The shift amount should always be less than the mode size.  */
+  gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
+
+  /* Verify that the mask being shifted is contiguous and would be in the
+     least significant bits after shifting by shft_amnt.  */
+  t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
+  return (t == (t & -t));
+}
+
 /* Calculate the cost of calculating X, storing it in *COST.  Result
    is true if the total cost of the operation has now been calculated.  */
 static bool
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 70f0418..baa8983 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5490,6 +5490,76 @@
   [(set_attr "type" "bfm")]
 )
 
+;;  Match a bfi instruction where the shift of OP3 means that we are
+;;  actually copying the least significant bits of OP3 into OP0 by way
+;;  of the AND masks and the IOR instruction.  A similar instruction
+;;  with the two parts of the IOR swapped around was never triggered
+;;  in a bootstrap build and test of GCC so it was not included.
+
+(define_insn "*aarch64_bfi<GPI:mode>5_shift"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+        (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "0")
+                          (match_operand:GPI 2 "const_int_operand" "n"))
+                 (and:GPI (ashift:GPI
+                           (match_operand:GPI 3 "register_operand" "r")
+                           (match_operand:GPI 4 "aarch64_simd_shift_imm_<mode>" "n"))
+                          (match_operand:GPI 5 "const_int_operand" "n"))))]
+  "aarch64_masks_and_shift_for_bfi_p (<MODE>mode, UINTVAL (operands[2]),
+				      UINTVAL (operands[4]),
+				      UINTVAL(operands[5]))"
+  "bfi\t%<GPI:w>0, %<GPI:w>3, %4, %P5"
+  [(set_attr "type" "bfm")]
+)
+
+;; Like *aarch64_bfi<GPI:mode>5_shift but with no and of the ashift because
+;; the shift is large enough to remove the need for an AND instruction.
+
+(define_insn "*aarch64_bfi<GPI:mode>4_noand"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+        (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "0")
+                          (match_operand:GPI 2 "const_int_operand" "n"))
+                 (ashift:GPI
+                          (match_operand:GPI 3 "register_operand" "r")
+                          (match_operand:GPI 4 "aarch64_simd_shift_imm_<mode>" "n"))))]
+  "aarch64_masks_and_shift_for_bfi_p (<MODE>mode, UINTVAL (operands[2]),
+				      UINTVAL (operands[4]),
+				      HOST_WIDE_INT_M1U << UINTVAL (operands[4]) )"
+{
+  operands[5] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode) - UINTVAL (operands[4]));
+  return "bfi\t%<GPI:w>0, %<GPI:w>3, %4, %5";
+}
+  [(set_attr "type" "bfm")]
+)
+
+;; Like *aarch64_bfi<GPI:mode>5_shift but with no shifting, we are just
+;; copying the least significant bits of OP3 to OP0.  In this case we do
+;; need two versions of the instruction to handle different checks on the
+;; constant values.
+
+(define_insn "*aarch64_bfi<GPI:mode>4_noshift"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+        (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "0")
+                          (match_operand:GPI 2 "const_int_operand" "n"))
+                 (and:GPI (match_operand:GPI 3 "register_operand" "r")
+                          (match_operand:GPI 4 "const_int_operand" "n"))))]
+  "aarch64_masks_and_shift_for_bfi_p (<MODE>mode, UINTVAL (operands[2]), 0,
+				      UINTVAL (operands[4]))"
+  "bfi\t%<GPI:w>0, %<GPI:w>3, 0, %P4"
+  [(set_attr "type" "bfm")]
+)
+
+(define_insn "*aarch64_bfi<GPI:mode>4_noshift_alt"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+        (ior:GPI (and:GPI (match_operand:GPI 1 "register_operand" "r")
+                          (match_operand:GPI 2 "const_int_operand" "n"))
+                 (and:GPI (match_operand:GPI 3 "register_operand" "0")
+                          (match_operand:GPI 4 "const_int_operand" "n"))))]
+  "aarch64_masks_and_shift_for_bfi_p (<MODE>mode, UINTVAL (operands[4]), 0,
+				      UINTVAL (operands[2]))"
+  "bfi\t%<GPI:w>0, %<GPI:w>1, 0, %P2"
+  [(set_attr "type" "bfm")]
+)
+
 (define_insn "*extr_insv_lower_reg<mode>"
   [(set (zero_extract:GPI (match_operand:GPI 0 "register_operand" "+r")
 			  (match_operand 1 "const_int_operand" "n")

diff --git a/gcc/testsuite/gcc.target/aarch64/combine_bfi_2.c b/gcc/testsuite/gcc.target/aarch64/combine_bfi_2.c
index e69de29..145282d 100644
--- a/gcc/testsuite/gcc.target/aarch64/combine_bfi_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/combine_bfi_2.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+int f1(int x, int y)
+{
+  return (y & 0xfffffff) | (((x <<28) & 0xf0000000));
+}
+
+
+int f2(int x, int y)
+{
+  return (((x <<28) & 0xf0000000)) | (y & 0xfffffff);
+}
+
+/* { dg-final { scan-assembler-times {\tbfi\t} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c b/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
index 109f989..c3b5fc5 100644
--- a/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
+++ b/gcc/testsuite/gcc.target/aarch64/combine_bfxil.c
@@ -114,4 +114,5 @@ main (void)
   return 0;
 }
 
-/* { dg-final { scan-assembler-times "bfxil\\t" 18 } } */
+/* { dg-final { scan-assembler-times "bfxil\\t" 3 } } */
+/* { dg-final { scan-assembler-times "bfi\\t" 15 } } */

Re: [Patch] PR rtl-optimization/87763 - generate more bfi instructions on aarch64

Reply via email to