Re: [PATCHv4, rs6000] Generate mfvsrwz for all subtargets and remove redundant zero extend [PR106769]

2023-08-16 Thread HAO CHEN GUI via Gcc-patches
Committed after fixing the comments.

https://gcc.gnu.org/g:a79cf858b39e01c80537bc5d47a5e9004418c267

Thanks
Gui Haochen

在 2023/8/14 15:47, Kewen.Lin 写道:
> Hi Haochen,
> 
> on 2023/8/14 10:18, HAO CHEN GUI wrote:
>> Hi,
>>   This patch modifies vsx extract expand and generates mfvsrwz/stxsiwx
>> for all sub targets when the mode is V4SI and the extracted element is word
>> 1 from BE order. Also this patch adds a insn pattern for mfvsrwz which
>> helps eliminate redundant zero extend.
>>
>>   Compared to last version, the main change is to put the word index
>> checking in the split condition of "*vsx_extract_v4si_w023". Also modified
>> some comments.
>> https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625380.html
>>
>>   Bootstrapped and tested on powerpc64-linux BE and LE with no regressions.
>>
>> Thanks
>> Gui Haochen
>>
>> ChangeLog
>> rs6000: Generate mfvsrwz for all platform and remove redundant zero extend
>>
>> mfvsrwz has lower latency than xxextractuw or vextuw[lr]x.  So it should be
>> generated even with p9 vector enabled.  Also the instruction is already
>> zero extended.  A combine pattern is needed to eliminate redundant zero
>> extend instructions.
>>
>> gcc/
>>  PR target/106769
>>  * config/rs6000/vsx.md (expand vsx_extract_): Set it only
>>  for V8HI and V16QI.
>>  (vsx_extract_v4si): New expand for V4SI extraction.
>>  (vsx_extract_v4si_w1): New insn pattern for V4SI extraction on
>>  word 1 from BE order.   
>>  (*mfvsrwz): New insn pattern for mfvsrwz.
>>  (*vsx_extract__di_p9): Assert that it won't be generated on
>>  word 1 from BE order.
>>  (*vsx_extract_si): Remove.
>>  (*vsx_extract_v4si_w023): New insn and split pattern on word 0, 2,
>>  3 from BE order.
>>
>> gcc/testsuite/
>>  PR target/106769
>>  * gcc.target/powerpc/pr106769.h: New.
>>  * gcc.target/powerpc/pr106769-p8.c: New.
>>  * gcc.target/powerpc/pr106769-p9.c: New.
>>
>> patch.diff
>> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
>> index 0a34ceebeb5..1cbdc2f1c01 100644
>> --- a/gcc/config/rs6000/vsx.md
>> +++ b/gcc/config/rs6000/vsx.md
>> @@ -3722,9 +3722,9 @@ (define_insn "vsx_xxpermdi2__1"
>>  (define_expand  "vsx_extract_"
>>[(parallel [(set (match_operand: 0 "gpc_reg_operand")
>> (vec_select:
>> -(match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand")
>> +(match_operand:VSX_EXTRACT_I2 1 "gpc_reg_operand")
>>  (parallel [(match_operand:QI 2 "const_int_operand")])))
>> -  (clobber (match_scratch:VSX_EXTRACT_I 3))])]
>> +  (clobber (match_scratch:VSX_EXTRACT_I2 3))])]
>>"VECTOR_MEM_VSX_P (mode) && TARGET_DIRECT_MOVE_64BIT"
>>  {
>>/* If we have ISA 3.0, we can do a xxextractuw/vextractu{b,h}.  */
>> @@ -3736,6 +3736,63 @@ (define_expand  "vsx_extract_"
>>  }
>>  })
>>
>> +(define_expand  "vsx_extract_v4si"
>> +  [(parallel [(set (match_operand:SI 0 "gpc_reg_operand")
>> +   (vec_select:SI
>> +(match_operand:V4SI 1 "gpc_reg_operand")
>> +(parallel [(match_operand:QI 2 "const_0_to_3_operand")])))
>> +  (clobber (match_scratch:V4SI 3))])]
>> +  "TARGET_DIRECT_MOVE_64BIT"
>> +{
>> +  /* The word 1 (BE order) can be extracted by mfvsrwz/stxsiwx.  So just
>> + fall through to vsx_extract_v4si_w1.  */
>> +  if (TARGET_P9_VECTOR
>> +  && INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2))
>> +{
>> +  emit_insn (gen_vsx_extract_v4si_p9 (operands[0], operands[1],
>> +  operands[2]));
>> +  DONE;
>> +}
>> +})
>> +
>> +/* Extract from word 1 (BE order);  */
> 
> Nit: I guessed I requested this before, please use ";" instead of
> "/* ... */" for the comments, to align with the existing ones.
> 
>> +(define_insn "vsx_extract_v4si_w1"
>> +  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,wa,Z,wa")
>> +(vec_select:SI
>> + (match_operand:V4SI 1 "gpc_reg_operand" "v,v,v,0")
>> + (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n,n")])))
>> +   (clobber (match_scratch:V4SI 3 "=v,v,v,v"))]
>> +  "TARGET_DIRECT_MOVE_64BIT
>> +   && INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2)"
>> +{
>> +   if (which_alternative == 0)
>> + return "mfvsrwz %0,%x1";
>> +
>> +   if (which_alternative == 1)
>> + return "xxlor %x0,%x1,%x1";
>> +
>> +   if (which_alternative == 2)
>> + return "stxsiwx %x1,%y0";
>> +
>> +   return ASM_COMMENT_START " vec_extract to same register";
>> +}
>> +  [(set_attr "type" "mfvsr,veclogical,fpstore,*")
>> +   (set_attr "length" "4,4,4,0")
>> +   (set_attr "isa" "p8v,*,p8v,*")])
>> +
>> +(define_insn "*mfvsrwz"
>> +  [(set (match_operand:DI 0 "register_operand" "=r")
>> +(zero_extend:DI
>> +  (vec_select:SI
>> +(match_operand:V4SI 1 "vsx_register_operand" "wa")
>> +(parallel [(match_operand:QI 2 "const_int_operand" "n")]
>> +   (clobber (match_scratch:V4SI 3 "=v"))]

Re: [PATCHv4, rs6000] Generate mfvsrwz for all subtargets and remove redundant zero extend [PR106769]

2023-08-14 Thread Kewen.Lin via Gcc-patches
Hi Haochen,

on 2023/8/14 10:18, HAO CHEN GUI wrote:
> Hi,
>   This patch modifies vsx extract expand and generates mfvsrwz/stxsiwx
> for all sub targets when the mode is V4SI and the extracted element is word
> 1 from BE order. Also this patch adds a insn pattern for mfvsrwz which
> helps eliminate redundant zero extend.
> 
>   Compared to last version, the main change is to put the word index
> checking in the split condition of "*vsx_extract_v4si_w023". Also modified
> some comments.
> https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625380.html
> 
>   Bootstrapped and tested on powerpc64-linux BE and LE with no regressions.
> 
> Thanks
> Gui Haochen
> 
> ChangeLog
> rs6000: Generate mfvsrwz for all platform and remove redundant zero extend
> 
> mfvsrwz has lower latency than xxextractuw or vextuw[lr]x.  So it should be
> generated even with p9 vector enabled.  Also the instruction is already
> zero extended.  A combine pattern is needed to eliminate redundant zero
> extend instructions.
> 
> gcc/
>   PR target/106769
>   * config/rs6000/vsx.md (expand vsx_extract_): Set it only
>   for V8HI and V16QI.
>   (vsx_extract_v4si): New expand for V4SI extraction.
>   (vsx_extract_v4si_w1): New insn pattern for V4SI extraction on
>   word 1 from BE order.   
>   (*mfvsrwz): New insn pattern for mfvsrwz.
>   (*vsx_extract__di_p9): Assert that it won't be generated on
>   word 1 from BE order.
>   (*vsx_extract_si): Remove.
>   (*vsx_extract_v4si_w023): New insn and split pattern on word 0, 2,
>   3 from BE order.
> 
> gcc/testsuite/
>   PR target/106769
>   * gcc.target/powerpc/pr106769.h: New.
>   * gcc.target/powerpc/pr106769-p8.c: New.
>   * gcc.target/powerpc/pr106769-p9.c: New.
> 
> patch.diff
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index 0a34ceebeb5..1cbdc2f1c01 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -3722,9 +3722,9 @@ (define_insn "vsx_xxpermdi2__1"
>  (define_expand  "vsx_extract_"
>[(parallel [(set (match_operand: 0 "gpc_reg_operand")
>  (vec_select:
> - (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand")
> + (match_operand:VSX_EXTRACT_I2 1 "gpc_reg_operand")
>   (parallel [(match_operand:QI 2 "const_int_operand")])))
> -   (clobber (match_scratch:VSX_EXTRACT_I 3))])]
> +   (clobber (match_scratch:VSX_EXTRACT_I2 3))])]
>"VECTOR_MEM_VSX_P (mode) && TARGET_DIRECT_MOVE_64BIT"
>  {
>/* If we have ISA 3.0, we can do a xxextractuw/vextractu{b,h}.  */
> @@ -3736,6 +3736,63 @@ (define_expand  "vsx_extract_"
>  }
>  })
> 
> +(define_expand  "vsx_extract_v4si"
> +  [(parallel [(set (match_operand:SI 0 "gpc_reg_operand")
> +(vec_select:SI
> + (match_operand:V4SI 1 "gpc_reg_operand")
> + (parallel [(match_operand:QI 2 "const_0_to_3_operand")])))
> +   (clobber (match_scratch:V4SI 3))])]
> +  "TARGET_DIRECT_MOVE_64BIT"
> +{
> +  /* The word 1 (BE order) can be extracted by mfvsrwz/stxsiwx.  So just
> + fall through to vsx_extract_v4si_w1.  */
> +  if (TARGET_P9_VECTOR
> +  && INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2))
> +{
> +  emit_insn (gen_vsx_extract_v4si_p9 (operands[0], operands[1],
> +   operands[2]));
> +  DONE;
> +}
> +})
> +
> +/* Extract from word 1 (BE order);  */

Nit: I guessed I requested this before, please use ";" instead of
"/* ... */" for the comments, to align with the existing ones.

> +(define_insn "vsx_extract_v4si_w1"
> +  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,wa,Z,wa")
> + (vec_select:SI
> +  (match_operand:V4SI 1 "gpc_reg_operand" "v,v,v,0")
> +  (parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n,n")])))
> +   (clobber (match_scratch:V4SI 3 "=v,v,v,v"))]
> +  "TARGET_DIRECT_MOVE_64BIT
> +   && INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2)"
> +{
> +   if (which_alternative == 0)
> + return "mfvsrwz %0,%x1";
> +
> +   if (which_alternative == 1)
> + return "xxlor %x0,%x1,%x1";
> +
> +   if (which_alternative == 2)
> + return "stxsiwx %x1,%y0";
> +
> +   return ASM_COMMENT_START " vec_extract to same register";
> +}
> +  [(set_attr "type" "mfvsr,veclogical,fpstore,*")
> +   (set_attr "length" "4,4,4,0")
> +   (set_attr "isa" "p8v,*,p8v,*")])
> +
> +(define_insn "*mfvsrwz"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> + (zero_extend:DI
> +   (vec_select:SI
> + (match_operand:V4SI 1 "vsx_register_operand" "wa")
> + (parallel [(match_operand:QI 2 "const_int_operand" "n")]
> +   (clobber (match_scratch:V4SI 3 "=v"))]
> +  "TARGET_DIRECT_MOVE_64BIT
> +   && INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2)"
> +  "mfvsrwz %0,%x1"
> +  [(set_attr "type" "mfvsr")
> +   (set_attr "isa" "p8v")])
> +
>  (define_insn "vsx_extract__p9"
>[(set (match_operand: 0 

[PATCHv4, rs6000] Generate mfvsrwz for all subtargets and remove redundant zero extend [PR106769]

2023-08-13 Thread HAO CHEN GUI via Gcc-patches
Hi,
  This patch modifies vsx extract expand and generates mfvsrwz/stxsiwx
for all sub targets when the mode is V4SI and the extracted element is word
1 from BE order. Also this patch adds a insn pattern for mfvsrwz which
helps eliminate redundant zero extend.

  Compared to last version, the main change is to put the word index
checking in the split condition of "*vsx_extract_v4si_w023". Also modified
some comments.
https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625380.html

  Bootstrapped and tested on powerpc64-linux BE and LE with no regressions.

Thanks
Gui Haochen

ChangeLog
rs6000: Generate mfvsrwz for all platform and remove redundant zero extend

mfvsrwz has lower latency than xxextractuw or vextuw[lr]x.  So it should be
generated even with p9 vector enabled.  Also the instruction is already
zero extended.  A combine pattern is needed to eliminate redundant zero
extend instructions.

gcc/
PR target/106769
* config/rs6000/vsx.md (expand vsx_extract_): Set it only
for V8HI and V16QI.
(vsx_extract_v4si): New expand for V4SI extraction.
(vsx_extract_v4si_w1): New insn pattern for V4SI extraction on
word 1 from BE order.   
(*mfvsrwz): New insn pattern for mfvsrwz.
(*vsx_extract__di_p9): Assert that it won't be generated on
word 1 from BE order.
(*vsx_extract_si): Remove.
(*vsx_extract_v4si_w023): New insn and split pattern on word 0, 2,
3 from BE order.

gcc/testsuite/
PR target/106769
* gcc.target/powerpc/pr106769.h: New.
* gcc.target/powerpc/pr106769-p8.c: New.
* gcc.target/powerpc/pr106769-p9.c: New.

patch.diff
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 0a34ceebeb5..1cbdc2f1c01 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -3722,9 +3722,9 @@ (define_insn "vsx_xxpermdi2__1"
 (define_expand  "vsx_extract_"
   [(parallel [(set (match_operand: 0 "gpc_reg_operand")
   (vec_select:
-   (match_operand:VSX_EXTRACT_I 1 "gpc_reg_operand")
+   (match_operand:VSX_EXTRACT_I2 1 "gpc_reg_operand")
(parallel [(match_operand:QI 2 "const_int_operand")])))
- (clobber (match_scratch:VSX_EXTRACT_I 3))])]
+ (clobber (match_scratch:VSX_EXTRACT_I2 3))])]
   "VECTOR_MEM_VSX_P (mode) && TARGET_DIRECT_MOVE_64BIT"
 {
   /* If we have ISA 3.0, we can do a xxextractuw/vextractu{b,h}.  */
@@ -3736,6 +3736,63 @@ (define_expand  "vsx_extract_"
 }
 })

+(define_expand  "vsx_extract_v4si"
+  [(parallel [(set (match_operand:SI 0 "gpc_reg_operand")
+  (vec_select:SI
+   (match_operand:V4SI 1 "gpc_reg_operand")
+   (parallel [(match_operand:QI 2 "const_0_to_3_operand")])))
+ (clobber (match_scratch:V4SI 3))])]
+  "TARGET_DIRECT_MOVE_64BIT"
+{
+  /* The word 1 (BE order) can be extracted by mfvsrwz/stxsiwx.  So just
+ fall through to vsx_extract_v4si_w1.  */
+  if (TARGET_P9_VECTOR
+  && INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2))
+{
+  emit_insn (gen_vsx_extract_v4si_p9 (operands[0], operands[1],
+ operands[2]));
+  DONE;
+}
+})
+
+/* Extract from word 1 (BE order);  */
+(define_insn "vsx_extract_v4si_w1"
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,wa,Z,wa")
+   (vec_select:SI
+(match_operand:V4SI 1 "gpc_reg_operand" "v,v,v,0")
+(parallel [(match_operand:QI 2 "const_0_to_3_operand" "n,n,n,n")])))
+   (clobber (match_scratch:V4SI 3 "=v,v,v,v"))]
+  "TARGET_DIRECT_MOVE_64BIT
+   && INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2)"
+{
+   if (which_alternative == 0)
+ return "mfvsrwz %0,%x1";
+
+   if (which_alternative == 1)
+ return "xxlor %x0,%x1,%x1";
+
+   if (which_alternative == 2)
+ return "stxsiwx %x1,%y0";
+
+   return ASM_COMMENT_START " vec_extract to same register";
+}
+  [(set_attr "type" "mfvsr,veclogical,fpstore,*")
+   (set_attr "length" "4,4,4,0")
+   (set_attr "isa" "p8v,*,p8v,*")])
+
+(define_insn "*mfvsrwz"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+   (zero_extend:DI
+ (vec_select:SI
+   (match_operand:V4SI 1 "vsx_register_operand" "wa")
+   (parallel [(match_operand:QI 2 "const_int_operand" "n")]
+   (clobber (match_scratch:V4SI 3 "=v"))]
+  "TARGET_DIRECT_MOVE_64BIT
+   && INTVAL (operands[2]) == (BYTES_BIG_ENDIAN ? 1 : 2)"
+  "mfvsrwz %0,%x1"
+  [(set_attr "type" "mfvsr")
+   (set_attr "isa" "p8v")])
+
 (define_insn "vsx_extract__p9"
   [(set (match_operand: 0 "gpc_reg_operand" "=r,")
(vec_select:
@@ -3807,6 +3864,9 @@ (define_insn_and_split "*vsx_extract__di_p9"
(parallel [(match_dup 2)])))
  (clobber (match_dup 3))])]
 {
+  gcc_assert (mode != V4SImode
+ || INTVAL (operands[2]) != (BYTES_BIG_ENDIAN ? 1 : 2));
+
   operands[4] = gen_rtx_REG (mode, REGNO