在 2023/12/13 上午2:27, Xi Ruoyao 写道:

        fld.s   $f1,$r4,0
        fld.s   $f0,$r4,4
        fld.s   $f3,$r4,8
        fld.s   $f2,$r4,12
        fcmp.slt.s      $fcc1,$f0,$f3
        fcmp.sgt.s      $fcc0,$f1,$f2
        movcf2gr        $r13,$fcc1
        movcf2gr        $r12,$fcc0
        or      $r12,$r12,$r13
        bnez    $r12,.L3
        fld.s   $f4,$r4,16
        fld.s   $f5,$r4,20
        or      $r4,$r0,$r0
        fcmp.sgt.s      $fcc1,$f1,$f5
        fcmp.slt.s      $fcc0,$f0,$f4
        movcf2gr        $r12,$fcc1
        movcf2gr        $r13,$fcc0
        or      $r12,$r12,$r13
        bnez    $r12,.L2
        fcmp.sgt.s      $fcc1,$f3,$f5
        fcmp.slt.s      $fcc0,$f2,$f4
        movcf2gr        $r4,$fcc1
        movcf2gr        $r12,$fcc0
        or      $r4,$r4,$r12
        xori    $r4,$r4,1
        slli.w  $r4,$r4,0
        jr      $r1
        .align  4
.L3:
        or      $r4,$r0,$r0
        .align  4
.L2:
        jr      $r1

Per my micro-benchmark this is much faster than
LOGICAL_OP_NON_SHORT_CIRCUIT = 0 for randomly generated inputs (i.e.
when the branches are not predictable).

Note that there is a redundant slli.w instruction in the compiled code
and I couldn't find a way to remove it (my trick in the TARGET_64BIT
branch only works for simple examples).  We may be able to handle it via
the ext_dce pass [1] in the future.

The attached patch removes the remaining sign-extension instructions from
the assembly.

[1]:https://gcc.gnu.org/pipermail/gcc-patches/2023-November/637320.html

>From 01eea237e13056fad9839219ed1aa70037cd3b60 Mon Sep 17 00:00:00 2001
From: Lulu Cheng <chengl...@loongson.cn>
Date: Fri, 8 Dec 2023 10:16:48 +0800
Subject: [PATCH v1] LoongArch: Optimize away some of the sign-extension
 instructions generated during bitwise operations

There are two mode iterators defined in loongarch.md:
	(define_mode_iterator GPR [SI (DI "TARGET_64BIT")])
  and
	(define_mode_iterator X [(SI "!TARGET_64BIT") (DI "TARGET_64BIT")])
Change the mode iterator used by the bitwise operation patterns from GPR to X.

Since the bitwise instructions do not distinguish between 64-bit, 32-bit,
etc. operands, a sign extension is necessary when the bitwise operation is
narrower than 64 bits.
The original definitions generated a lot of redundant sign-extension
instructions.  This patch optimizes them away, following the implementation
in the RISC-V port.

gcc/ChangeLog:

	* config/loongarch/loongarch.md (one_cmpl<mode>2): Replace GPR with X.
	(*nor<mode>3): Likewise.
	(nor<mode>3): Likewise.
	(*branch_on_bit<X:mode>): Likewise.
	(*branch_on_bit_range<X:mode>): Likewise.
	(*negsi2_extended): New template.
	(*<optab>si3_internal): Likewise.
	(*one_cmplsi2_internal): Likewise.
	(*norsi3_internal): Likewise.
	(*<optab>nsi_internal): Likewise.
	(bytepick_w_<bytepick_imm>_extend): Modify this template according to the
	modified bit operation to make the optimization work.
	* config/loongarch/predicates.md (branch_on_bit_operand): New predicate.

gcc/testsuite/ChangeLog:

	* gcc.target/loongarch/sign-extend-1.c: New test.
---
 gcc/config/loongarch/loongarch.md             | 148 +++++++++++++++---
 gcc/config/loongarch/predicates.md            |   5 +
 .../gcc.target/loongarch/sign-extend-1.c      |  21 +++
 3 files changed, 151 insertions(+), 23 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/loongarch/sign-extend-1.c

diff --git a/gcc/config/loongarch/loongarch.md b/gcc/config/loongarch/loongarch.md
index 7a101dd64b7..35788deafc7 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -721,7 +721,7 @@ (define_insn "sub<mode>3"
 
 (define_insn "sub<mode>3"
   [(set (match_operand:GPR 0 "register_operand" "=r")
-	(minus:GPR (match_operand:GPR 1 "register_operand" "rJ")
+	(minus:GPR (match_operand:GPR 1 "register_operand" "r")
 		   (match_operand:GPR 2 "register_operand" "r")))]
   ""
   "sub.<d>\t%0,%z1,%2"
@@ -1327,13 +1327,13 @@ (define_insn "neg<mode>2"
   [(set_attr "alu_type"	"sub")
    (set_attr "mode" "<MODE>")])
 
-(define_insn "one_cmpl<mode>2"
-  [(set (match_operand:GPR 0 "register_operand" "=r")
-	(not:GPR (match_operand:GPR 1 "register_operand" "r")))]
-  ""
-  "nor\t%0,%.,%1"
-  [(set_attr "alu_type" "not")
-   (set_attr "mode" "<MODE>")])
+(define_insn "*negsi2_extended"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(sign_extend:DI (neg:SI (match_operand:SI 1 "register_operand" "r"))))]
+  "TARGET_64BIT"
+  "sub.w\t%0,%.,%1"
+  [(set_attr "alu_type"	"sub")
+   (set_attr "mode" "SI")])
 
 (define_insn "neg<mode>2"
   [(set (match_operand:ANYF 0 "register_operand" "=f")
@@ -1353,14 +1353,39 @@ (define_insn "neg<mode>2"
 ;;
 
 (define_insn "<optab><mode>3"
-  [(set (match_operand:GPR 0 "register_operand" "=r,r")
-	(any_bitwise:GPR (match_operand:GPR 1 "register_operand" "%r,r")
-			 (match_operand:GPR 2 "uns_arith_operand" "r,K")))]
+  [(set (match_operand:X 0 "register_operand" "=r,r")
+	(any_bitwise:X (match_operand:X 1 "register_operand" "%r,r")
+		       (match_operand:X 2 "uns_arith_operand" "r,K")))]
   ""
   "<insn>%i2\t%0,%1,%2"
   [(set_attr "type" "logical")
    (set_attr "mode" "<MODE>")])
 
+(define_insn "*<optab>si3_internal"
+  [(set (match_operand:SI                 0 "register_operand" "=r,r")
+	(any_bitwise:SI (match_operand:SI 1 "register_operand" "%r,r")
+			(match_operand:SI 2 "uns_arith_operand"    " r,K")))]
+  "TARGET_64BIT"
+  "<insn>%i2\t%0,%1,%2"
+  [(set_attr "type" "logical")
+   (set_attr "mode" "SI")])
+
+(define_insn "one_cmpl<mode>2"
+  [(set (match_operand:X 0 "register_operand" "=r")
+	(not:X (match_operand:X 1 "register_operand" "r")))]
+  ""
+  "nor\t%0,%.,%1"
+  [(set_attr "alu_type" "not")
+   (set_attr "mode" "<MODE>")])
+
+(define_insn "*one_cmplsi2_internal"
+  [(set (match_operand:SI         0 "register_operand" "=r")
+	(not:SI (match_operand:SI 1 "register_operand" " r")))]
+  "TARGET_64BIT"
+  "nor\t%0,%.,%1"
+  [(set_attr "type" "logical")
+   (set_attr "mode" "SI")])
+
 (define_insn "and<mode>3_extended"
   [(set (match_operand:GPR 0 "register_operand" "=r")
 	(and:GPR (match_operand:GPR 1 "nonimmediate_operand" "r")
@@ -1476,25 +1501,43 @@ (define_insn "*iorhi3"
   [(set_attr "type" "logical")
    (set_attr "mode" "HI")])
 
-(define_insn "*nor<mode>3"
-  [(set (match_operand:GPR 0 "register_operand" "=r")
-	(and:GPR (not:GPR (match_operand:GPR 1 "register_operand" "%r"))
-		 (not:GPR (match_operand:GPR 2 "register_operand" "r"))))]
+(define_insn "nor<mode>3"
+  [(set (match_operand:X 0 "register_operand" "=r")
+	(and:X (not:X (match_operand:X 1 "register_operand" "%r"))
+		 (not:X (match_operand:X 2 "register_operand" "r"))))]
   ""
   "nor\t%0,%1,%2"
   [(set_attr "type" "logical")
    (set_attr "mode" "<MODE>")])
 
+(define_insn "*norsi3_internal"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(and:SI (not:SI (match_operand:SI 1 "register_operand" "%r"))
+		 (not:SI (match_operand:SI 2 "register_operand" "r"))))]
+  "TARGET_64BIT"
+  "nor\t%0,%1,%2"
+  [(set_attr "type" "logical")
+   (set_attr "mode" "SI")])
+
 (define_insn "<optab>n<mode>"
-  [(set (match_operand:GPR 0 "register_operand" "=r")
-	(neg_bitwise:GPR
-	    (not:GPR (match_operand:GPR 1 "register_operand" "r"))
-	    (match_operand:GPR 2 "register_operand" "r")))]
+  [(set (match_operand:X 0 "register_operand" "=r")
+	(neg_bitwise:X
+	    (not:X (match_operand:X 1 "register_operand" "r"))
+	    (match_operand:X 2 "register_operand" "r")))]
   ""
   "<insn>n\t%0,%2,%1"
   [(set_attr "type" "logical")
    (set_attr "mode" "<MODE>")])
 
+(define_insn "*<optab>nsi_internal"
+  [(set (match_operand:SI 0 "register_operand" "=r")
+	(neg_bitwise:SI
+	    (not:SI (match_operand:SI 1 "register_operand" "r"))
+	    (match_operand:SI 2 "register_operand" "r")))]
+  "TARGET_64BIT"
+  "<insn>n\t%0,%2,%1"
+  [(set_attr "type" "logical")
+   (set_attr "mode" "SI")])
 
 ;;
 ;;  ....................
@@ -2976,6 +3019,62 @@ (define_expand "condjump"
 		      (label_ref (match_operand 1))
 		      (pc)))])
 
+(define_insn_and_split "*branch_on_bit<X:mode>"
+  [(set (pc)
+	(if_then_else
+	    (match_operator 0 "equality_operator"
+	        [(zero_extract:X (match_operand:X 2 "register_operand" "r")
+				 (const_int 1)
+				 (match_operand 3 "branch_on_bit_operand"))
+				 (const_int 0)])
+	    (label_ref (match_operand 1))
+	    (pc)))
+   (clobber (match_scratch:X 4 "=&r"))]
+  ""
+  "#"
+  "reload_completed"
+  [(set (match_dup 4)
+	(ashift:X (match_dup 2) (match_dup 3)))
+   (set (pc)
+	(if_then_else
+	    (match_op_dup 0 [(match_dup 4) (const_int 0)])
+	    (label_ref (match_operand 1))
+	    (pc)))]
+{
+  int shift = GET_MODE_BITSIZE (<MODE>mode) - 1 - INTVAL (operands[3]);
+  operands[3] = GEN_INT (shift);
+
+  if (GET_CODE (operands[0]) == EQ)
+    operands[0] = gen_rtx_GE (<MODE>mode, operands[4], const0_rtx);
+  else
+    operands[0] = gen_rtx_LT (<MODE>mode, operands[4], const0_rtx);
+})
+
+(define_insn_and_split "*branch_on_bit_range<X:mode>"
+  [(set (pc)
+	(if_then_else
+	    (match_operator 0 "equality_operator"
+		[(zero_extract:X (match_operand:X 2 "register_operand" "r")
+				 (match_operand 3 "branch_on_bit_operand")
+				 (const_int 0))
+				 (const_int 0)])
+	    (label_ref (match_operand 1))
+	    (pc)))
+   (clobber (match_scratch:X 4 "=&r"))]
+  ""
+  "#"
+  "reload_completed"
+  [(set (match_dup 4)
+	(ashift:X (match_dup 2) (match_dup 3)))
+   (set (pc)
+	(if_then_else
+	    (match_op_dup 0 [(match_dup 4) (const_int 0)])
+	    (label_ref (match_operand 1))
+	    (pc)))]
+{
+  operands[3] = GEN_INT (GET_MODE_BITSIZE (<MODE>mode) - INTVAL (operands[3]));
+})
+
 
 
 ;;
@@ -3762,10 +3861,13 @@ (define_insn "bytepick_w_<bytepick_imm>"
 (define_insn "bytepick_w_<bytepick_imm>_extend"
   [(set (match_operand:DI 0 "register_operand" "=r")
 	(sign_extend:DI
-	  (ior:SI (lshiftrt (match_operand:SI 1 "register_operand" "r")
-			    (const_int <bytepick_w_lshiftrt_amount>))
-		  (ashift (match_operand:SI 2 "register_operand" "r")
-			  (const_int bytepick_w_ashift_amount)))))]
+	 (subreg:SI
+	  (ior:DI (subreg:DI (lshiftrt
+			      (match_operand:SI 1 "register_operand" "r")
+			      (const_int <bytepick_w_lshiftrt_amount>)) 0)
+		  (subreg:DI (ashift
+			      (match_operand:SI 2 "register_operand" "r")
+			      (const_int bytepick_w_ashift_amount)) 0)) 0)))]
   "TARGET_64BIT"
   "bytepick.w\t%0,%1,%2,<bytepick_imm>"
   [(set_attr "mode" "SI")])
diff --git a/gcc/config/loongarch/predicates.md b/gcc/config/loongarch/predicates.md
index d02e846cb12..5084752171a 100644
--- a/gcc/config/loongarch/predicates.md
+++ b/gcc/config/loongarch/predicates.md
@@ -67,6 +67,11 @@ (define_predicate "arith_operand"
   (ior (match_operand 0 "const_arith_operand")
        (match_operand 0 "register_operand")))
 
+;; Only use branch-on-bit sequences when the mask is not an ANDI immediate.
+(define_predicate "branch_on_bit_operand"
+  (and (match_code "const_int")
+       (match_test "INTVAL (op) >= IMM_BITS - 1")))
+
 (define_predicate "plus_di_operand"
   (ior (match_operand 0 "arith_operand")
        (match_operand 0 "const_dual_imm12_operand")
diff --git a/gcc/testsuite/gcc.target/loongarch/sign-extend-1.c b/gcc/testsuite/gcc.target/loongarch/sign-extend-1.c
new file mode 100644
index 00000000000..c294ba6c407
--- /dev/null
+++ b/gcc/testsuite/gcc.target/loongarch/sign-extend-1.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-mabi=lp64d -O2" } */
+/* { dg-final { scan-assembler-not "slli.w" } } */
+
+struct pmop
+{
+  unsigned int op_pmflags;
+  unsigned int op_pmpermflags;
+};
+unsigned int PL_hints;
+
+struct pmop *pmop;
+void
+Perl_newPMOP (int type, int flags)
+{
+  if (PL_hints & 0x00100000)
+    pmop->op_pmpermflags |= 0x0001;
+  if (PL_hints & 0x00000004)
+    pmop->op_pmpermflags |= 0x0800;
+  pmop->op_pmflags = pmop->op_pmpermflags;
+}
-- 
2.39.3

Reply via email to