[PATCH] rs6000: Enhance lowpart/highpart DI->SF by mtvsrws/mtvsrd

2023-02-16 Thread Jiufu Guo via Gcc-patches
Hi,

Compare with previous version:
https://gcc.gnu.org/pipermail/gcc-patches/2023-February/611823.html
This patch does not define new insn for mtvsrws, but use exit one.

As mentioned in PR108338, on p9, we could use mtvsrws to implement
the bitcast from SI#0 to SF (or lowpart DI to SF).

For code:
  *(long long*)buff = di;
  float f = *(float*)(buff);

We generate "sldi 9,3,32 ; mtvsrd 1,9 ; xscvspdpn 1,1" instead of
"mtvsrws 1,3 ; xscvspdpn 1,1".

This patch update this, and also enhance the bitcast from highpart
DI to SF.

Bootstrap and regtests pass on ppc64{,le}.
Is this ok for trunk?

BR,
Jeff (Jiufu)


PR target/108338

gcc/ChangeLog:

* config/rs6000/predicates.md (lowpart_subreg_operator): New
define_predicate.
* config/rs6000/rs6000.md (any_rshift): New code_iterator.
(movsf_from_si): Update to generate mtvsrws.
(movsf_from_si2): Rename to...
(movsf_from_si2_): ... this.

gcc/testsuite/ChangeLog:

* gcc.target/powerpc/pr108338.c: New test.

---
 gcc/config/rs6000/predicates.md |  5 +++
 gcc/config/rs6000/rs6000.md | 34 +++--
 gcc/testsuite/gcc.target/powerpc/pr108338.c | 42 +
 3 files changed, 70 insertions(+), 11 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108338.c

diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
index 52c65534e51..e57c9d99c6b 100644
--- a/gcc/config/rs6000/predicates.md
+++ b/gcc/config/rs6000/predicates.md
@@ -2064,3 +2064,8 @@ (define_predicate "macho_pic_address"
   else
 return false;
 })
+
+(define_predicate "lowpart_subreg_operator"
+  (and (match_code "subreg")
+   (match_test "subreg_lowpart_offset (mode, GET_MODE (SUBREG_REG (op)))
+   == SUBREG_BYTE (op)")))
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 4a7812fa592..74b1c9cee6a 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -8200,13 +8200,24 @@ (define_insn_and_split "movsf_from_si"
 {
   rtx op0 = operands[0];
   rtx op1 = operands[1];
-  rtx op2 = operands[2];
-  rtx op1_di = gen_rtx_REG (DImode, REGNO (op1));
 
-  /* Move SF value to upper 32-bits for xscvspdpn.  */
-  emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
-  emit_insn (gen_p8_mtvsrd_sf (op0, op2));
-  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
+  if (TARGET_P9_VECTOR)
+{
+  rtx op0_v = gen_rtx_REG (V4SImode, REGNO (op0));
+  emit_insn (gen_vsx_splat_v4si (op0_v, op1));
+  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
+}
+  else
+{
+  rtx op2 = operands[2];
+  rtx op1_di = gen_rtx_REG (DImode, REGNO (op1));
+
+  /* Move SF value to upper 32-bits for xscvspdpn.  */
+  emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
+  emit_insn (gen_p8_mtvsrd_sf (op0, op2));
+  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
+}
+
   DONE;
 }
   [(set_attr "length"
@@ -8219,18 +8230,19 @@ (define_insn_and_split "movsf_from_si"
"*,  *, p9v,   p8v,   *, *,
 p8v,p8v,   p8v,   *")])
 
+(define_code_iterator any_rshift [ashiftrt lshiftrt])
+
 ;; For extracting high part element from DImode register like:
 ;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
 ;; split it before reload with "and mask" to avoid generating shift right
 ;; 32 bit then shift left 32 bit.
-(define_insn_and_split "movsf_from_si2"
+(define_insn_and_split "movsf_from_si2_"
   [(set (match_operand:SF 0 "gpc_reg_operand" "=wa")
(unspec:SF
-[(subreg:SI
-  (ashiftrt:DI
+[(match_operator:SI 3 "lowpart_subreg_operator"
+  [(any_rshift:DI
(match_operand:DI 1 "input_operand" "r")
-   (const_int 32))
-  0)]
+   (const_int 32))])]
 UNSPEC_SF_FROM_SI))
   (clobber (match_scratch:DI 2 "=r"))]
   "TARGET_NO_SF_SUBREG"
diff --git a/gcc/testsuite/gcc.target/powerpc/pr108338.c 
b/gcc/testsuite/gcc.target/powerpc/pr108338.c
new file mode 100644
index 000..2438dc13f41
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr108338.c
@@ -0,0 +1,42 @@
+// { dg-do run }
+// { dg-options "-O2 -save-temps" }
+
+float __attribute__ ((noipa)) sf_from_di_off0 (long long l)
+{
+  char buff[16];
+  *(long long*)buff = l;
+  float f = *(float*)(buff);
+  return f;
+}
+
+float  __attribute__ ((noipa)) sf_from_di_off4 (long long l)
+{
+  char buff[16];
+  *(long long*)buff = l;
+  float f = *(float*)(buff + 4);
+  return f; 
+}
+
+/* Under lp64, 'l' is in one DI reg, then check sub DI to SF. */
+/* { dg-final { scan-assembler-times {\mrldicr\M} 1 { target { lp64 && 
has_arch_pwr8 } } } } */
+/* { dg-final { scan-assembler-times {\mxscvspdpn\M} 2 { target { lp64 && 
has_arch_pwr8 } } } } */
+
+/* { dg-final { scan-assembler-times {\mmtvsrd\M} 2 { target { lp64 && { 
has_arch_pwr8 && { 

Re: [PATCH] rs6000: Enhance lowpart/highpart DI->SF by mtvsrws/mtvsrd

2023-01-10 Thread Jiufu Guo via Gcc-patches
Hi Segher,

Thanks for your help to review!

Segher Boessenkool  writes:

> Hi!
>
> On Tue, Jan 10, 2023 at 09:45:27PM +0800, Jiufu Guo wrote:
>> As mentioned in PR108338, on p9, we could use mtvsrws to implement
>> the conversion from SI#0 to SF (or lowpart DI to SF).  And we find
>> we can also enhance the conversion from highpart DI to SF (as the
>> case in this patch).
>> 
>> This patch enhances these conversions accordingly.
>
> Those aren't conversions, they are just bitcasts, reinterpreting the
> same datum as something else, but keeping all bits the same.
Yeap, bitcast is accurate. 

>
> The mtvsrws insn moves a SImode value from a GPR to a VSR, splatting it
> in all four lanes.  You'll typically want a xscvspdpn or similar after
> that -- but with the value splat in all lanes it will surely be in the
> lane that later instruction needs the data to be in :-)
Right.  xscvspdpn is needed typically.

>
>> --- a/gcc/config/rs6000/rs6000.md
>> +++ b/gcc/config/rs6000/rs6000.md
>> @@ -158,6 +158,7 @@ (define_c_enum "unspec"
>> UNSPEC_HASHCHK
>> UNSPEC_XXSPLTIDP_CONST
>> UNSPEC_XXSPLTIW_CONST
>> +   UNSPEC_P9V_MTVSRWS
>>])
>
> Is it hard to decribe this without unspec?  Unspecs prevent the compiler
> from optimising things (unless you very carefully implement all of that
> manually -- but if you just write it as plain RTL most things fall into
> place automatically).
>
> There are many existing patterns that needlessly and detrimentally use
> unspecs, but we should improve on that, not make it worse :-)
Thanks for pointing out this!
I also notice this.  Some patterns for bitcast int->float are also using
unspecs. 
For example, in expand pass:
(set (reg:SF 117 [  ])
(unspec:SF [
(reg:SI 121)
] UNSPEC_SF_FROM_SI)) 
it is "movsf_from_si" which is generated for "BIT_FIELD_REF ".  "movsf_from_si2" is also using unspec. And clobbers is used in
"movsf_from_si".
While they may be not needed for some cases.

I'm wondering if "TARGET_NO_SF_SUBREG" is accurated for those patterns.
/* Whether we should avoid (SUBREG:SI (REG:SF) and (SUBREG:SF (REG:SI).  */
#define TARGET_NO_SF_SUBREG TARGET_DIRECT_MOVE_64BIT
#define TARGET_ALLOW_SF_SUBREG  (!TARGET_DIRECT_MOVE_64BIT)

We may allow (SUBREG:SF (REG:SI) at early passes, and keep it untill
later passes (RA/reload, or splitter) at least.


In this patch, to avoid risk and make it straightforward, I define a new
insn 'mtvsrws' with unspec.

I would try another ways to avoid using unspec. Maybe keep to use subreg
pattern: "(set (reg:SF 117) (subreg:SF (reg/v:SI 118) 0))"?

>
>> @@ -8203,10 +8204,19 @@ (define_insn_and_split "movsf_from_si"
>>rtx op2 = operands[2];
>>rtx op1_di = gen_rtx_REG (DImode, REGNO (op1));
>>  
>> -  /* Move SF value to upper 32-bits for xscvspdpn.  */
>> -  emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
>> -  emit_insn (gen_p8_mtvsrd_sf (op0, op2));
>> -  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
>> +  if (TARGET_P9_VECTOR && TARGET_POWERPC64 && TARGET_DIRECT_MOVE)
>> +{
>> +   emit_insn (gen_p9_mtvsrws (op0, op1_di));
>> +   emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
>> +}
>
> This does not require TARGET_POWERPC64?
Oh, accurately speaking: 'mtvsrws' is using 32bit only. :)
>
> P9 implies we have direct moves (those are implied by P8 already).  We
> also do not need to test for vector I think?

Adding TARGET_P9_VECTOR, because I think, 'mtvsrws' operates on vector
register. (Or just treat them as FP regiters? But I feel it seems more
accurate vector registers.)

We have TARGET_DIRECT_MOVE_128 defined for P9:
"(TARGET_P9_VECTOR && TARGET_DIRECT_MOVE && TARGET_POWERPC64)"
like "TARGET_DIRECT_MOVE_64BIT" for P8.
So, we still need TARGET_DIRECT_MOVE, right?

>
>> +(define_code_iterator any_rshift [ashiftrt lshiftrt])
>> +
>>  ;; For extracting high part element from DImode register like:
>>  ;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
>>  ;; split it before reload with "and mask" to avoid generating shift right
>>  ;; 32 bit then shift left 32 bit.
>> -(define_insn_and_split "movsf_from_si2"
>> +(define_insn_and_split "movsf_from_si2_"
>>[(set (match_operand:SF 0 "gpc_reg_operand" "=wa")
>>  (unspec:SF
>>   [(subreg:SI
>> -   (ashiftrt:DI
>> +   (any_rshift:DI
>>  (match_operand:DI 1 "input_operand" "r")
>>  (const_int 32))
>> 0)]
>
> Hrm.  You can write this with just a subreg, no shift is needed at all.
> Can you please try that instead?  That is nastiness for endianness, but
> that is still preferable over introducing new complexities like this.
Currently, this define_insn_and_split would be used to combine
"shift;subreg"; because "highpart subreg DI->SF" is expanded to
"rightshift:DI 32 ; lowpart subreg:DI->SI; SI#0->SF". It seems, we are
not in favor of generating highpart subreg. :)

While I agree with you: this is just a subreg (lo

Re: [PATCH] rs6000: Enhance lowpart/highpart DI->SF by mtvsrws/mtvsrd

2023-01-10 Thread Segher Boessenkool
Hi!

On Tue, Jan 10, 2023 at 09:45:27PM +0800, Jiufu Guo wrote:
> As mentioned in PR108338, on p9, we could use mtvsrws to implement
> the conversion from SI#0 to SF (or lowpart DI to SF).  And we find
> we can also enhance the conversion from highpart DI to SF (as the
> case in this patch).
> 
> This patch enhances these conversions accordingly.

Those aren't conversions, they are just bitcasts, reinterpreting the
same datum as something else, but keeping all bits the same.

The mtvsrws insn moves a SImode value from a GPR to a VSR, splatting it
in all four lanes.  You'll typically want a xscvspdpn or similar after
that -- but with the value splat in all lanes it will surely be in the
lane that later instruction needs the data to be in :-)

> --- a/gcc/config/rs6000/rs6000.md
> +++ b/gcc/config/rs6000/rs6000.md
> @@ -158,6 +158,7 @@ (define_c_enum "unspec"
> UNSPEC_HASHCHK
> UNSPEC_XXSPLTIDP_CONST
> UNSPEC_XXSPLTIW_CONST
> +   UNSPEC_P9V_MTVSRWS
>])

Is it hard to decribe this without unspec?  Unspecs prevent the compiler
from optimising things (unless you very carefully implement all of that
manually -- but if you just write it as plain RTL most things fall into
place automatically).

There are many existing patterns that needlessly and detrimentally use
unspecs, but we should improve on that, not make it worse :-)

> @@ -8203,10 +8204,19 @@ (define_insn_and_split "movsf_from_si"
>rtx op2 = operands[2];
>rtx op1_di = gen_rtx_REG (DImode, REGNO (op1));
>  
> -  /* Move SF value to upper 32-bits for xscvspdpn.  */
> -  emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
> -  emit_insn (gen_p8_mtvsrd_sf (op0, op2));
> -  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
> +  if (TARGET_P9_VECTOR && TARGET_POWERPC64 && TARGET_DIRECT_MOVE)
> +{
> +   emit_insn (gen_p9_mtvsrws (op0, op1_di));
> +   emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
> +}

This does not require TARGET_POWERPC64?

P9 implies we have direct moves (those are implied by P8 already).  We
also do not need to test for vector I think?

> +(define_code_iterator any_rshift [ashiftrt lshiftrt])
> +
>  ;; For extracting high part element from DImode register like:
>  ;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
>  ;; split it before reload with "and mask" to avoid generating shift right
>  ;; 32 bit then shift left 32 bit.
> -(define_insn_and_split "movsf_from_si2"
> +(define_insn_and_split "movsf_from_si2_"
>[(set (match_operand:SF 0 "gpc_reg_operand" "=wa")
>   (unspec:SF
>[(subreg:SI
> -(ashiftrt:DI
> +(any_rshift:DI
>   (match_operand:DI 1 "input_operand" "r")
>   (const_int 32))
>  0)]

Hrm.  You can write this with just a subreg, no shift is needed at all.
Can you please try that instead?  That is nastiness for endianness, but
that is still preferable over introducing new complexities like this.

> +(define_insn "p9_mtvsrws"
> +  [(set (match_operand:SF 0 "register_operand" "=wa")
> + (unspec:SF [(match_operand:DI 1 "register_operand" "r")]
> +UNSPEC_P9V_MTVSRWS))]
> +  "TARGET_P9_VECTOR && TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
> +  "mtvsrws %x0,%1"
> +  [(set_attr "type" "mtvsr")])

(Same issues here as above).

> +/* { dg-final { scan-assembler-times {\mmtvsrws\M} 1 { target { 
> has_arch_ppc64 && has_arch_pwr9 } } } } */

mtvsrws does not need ppc64.

> +/* { dg-final { scan-assembler-times {\mmtvsrd\M} 1 { target { 
> has_arch_ppc64 && has_arch_pwr9 } } } } */
> +/* { dg-final { scan-assembler-times {\mrldicr\M} 1 { target { 
> has_arch_ppc64 && has_arch_pwr9 } } } } */

These two do of course.

> +/* { dg-final { scan-assembler-times {\mxscvspdpn\M} 2 { target { 
> has_arch_pwr8 && has_arch_ppc64 } } } } */

But this doesn't.


Segher


[PATCH] rs6000: Enhance lowpart/highpart DI->SF by mtvsrws/mtvsrd

2023-01-10 Thread Jiufu Guo via Gcc-patches
Hi,

As mentioned in PR108338, on p9, we could use mtvsrws to implement
the conversion from SI#0 to SF (or lowpart DI to SF).  And we find
we can also enhance the conversion from highpart DI to SF (as the
case in this patch).

This patch enhances these conversions accordingly.

Bootstrap and regtests pass on ppc64{,le}.
Is this ok for trunk?

BR,
Jeff (Jiufu)

PR target/108338

gcc/ChangeLog:

* config/rs6000/rs6000.md (any_rshift): New code_iterator.
(movsf_from_si2): Rename to...
(movsf_from_si2_): ... this.
(p9_mtvsrws): New define_insn.

gcc/testsuite/ChangeLog:

* gcc.target/powerpc/pr108338.c: New test.
---
 gcc/config/rs6000/rs6000.md | 32 +---
 gcc/testsuite/gcc.target/powerpc/pr108338.c | 41 +
 2 files changed, 67 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108338.c

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 3cae64a264a..9025a912141 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -158,6 +158,7 @@ (define_c_enum "unspec"
UNSPEC_HASHCHK
UNSPEC_XXSPLTIDP_CONST
UNSPEC_XXSPLTIW_CONST
+   UNSPEC_P9V_MTVSRWS
   ])
 
 ;;
@@ -8203,10 +8204,19 @@ (define_insn_and_split "movsf_from_si"
   rtx op2 = operands[2];
   rtx op1_di = gen_rtx_REG (DImode, REGNO (op1));
 
-  /* Move SF value to upper 32-bits for xscvspdpn.  */
-  emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
-  emit_insn (gen_p8_mtvsrd_sf (op0, op2));
-  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
+  if (TARGET_P9_VECTOR && TARGET_POWERPC64 && TARGET_DIRECT_MOVE)
+{
+   emit_insn (gen_p9_mtvsrws (op0, op1_di));
+   emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
+}
+  else
+{
+  /* Move SF value to upper 32-bits for xscvspdpn.  */
+  emit_insn (gen_ashldi3 (op2, op1_di, GEN_INT (32)));
+  emit_insn (gen_p8_mtvsrd_sf (op0, op2));
+  emit_insn (gen_vsx_xscvspdpn_directmove (op0, op0));
+}
+
   DONE;
 }
   [(set_attr "length"
@@ -8219,15 +8229,17 @@ (define_insn_and_split "movsf_from_si"
"*,  *, p9v,   p8v,   *, *,
 p8v,p8v,   p8v,   *")])
 
+(define_code_iterator any_rshift [ashiftrt lshiftrt])
+
 ;; For extracting high part element from DImode register like:
 ;; {%1:SF=unspec[r122:DI>>0x20#0] 86;clobber scratch;}
 ;; split it before reload with "and mask" to avoid generating shift right
 ;; 32 bit then shift left 32 bit.
-(define_insn_and_split "movsf_from_si2"
+(define_insn_and_split "movsf_from_si2_"
   [(set (match_operand:SF 0 "gpc_reg_operand" "=wa")
(unspec:SF
 [(subreg:SI
-  (ashiftrt:DI
+  (any_rshift:DI
(match_operand:DI 1 "input_operand" "r")
(const_int 32))
   0)]
@@ -9475,6 +9487,14 @@ (define_insn "p8_mtvsrd_sf"
   "mtvsrd %x0,%1"
   [(set_attr "type" "mtvsr")])
 
+(define_insn "p9_mtvsrws"
+  [(set (match_operand:SF 0 "register_operand" "=wa")
+   (unspec:SF [(match_operand:DI 1 "register_operand" "r")]
+  UNSPEC_P9V_MTVSRWS))]
+  "TARGET_P9_VECTOR && TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+  "mtvsrws %x0,%1"
+  [(set_attr "type" "mtvsr")])
+
 (define_insn_and_split "reload_vsx_from_gprsf"
   [(set (match_operand:SF 0 "register_operand" "=wa")
(unspec:SF [(match_operand:SF 1 "register_operand" "r")]
diff --git a/gcc/testsuite/gcc.target/powerpc/pr108338.c 
b/gcc/testsuite/gcc.target/powerpc/pr108338.c
new file mode 100644
index 000..2afac79ea4f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr108338.c
@@ -0,0 +1,41 @@
+// { dg-do run }
+// { dg-options "-O2 -save-temps" }
+
+float __attribute__ ((noipa)) sf_from_di_off0 (long long l)
+{
+  char buff[16];
+  *(long long*)buff = l;
+  float f = *(float*)(buff);
+  return f;
+}
+
+float  __attribute__ ((noipa)) sf_from_di_off4 (long long l)
+{
+  char buff[16];
+  *(long long*)buff = l;
+  float f = *(float*)(buff + 4);
+  return f; 
+}
+
+/* { dg-final { scan-assembler-times {\mmtvsrws\M} 1 { target { has_arch_ppc64 
&& has_arch_pwr9 } } } } */
+/* { dg-final { scan-assembler-times {\mmtvsrd\M} 1 { target { has_arch_ppc64 
&& has_arch_pwr9 } } } } */
+/* { dg-final { scan-assembler-times {\mrldicr\M} 1 { target { has_arch_ppc64 
&& has_arch_pwr9 } } } } */
+/* { dg-final { scan-assembler-times {\mxscvspdpn\M} 2 { target { 
has_arch_pwr8 && has_arch_ppc64 } } } } */
+
+/* { dg-final { scan-assembler-times {\mmtvsrd\M} 2 { target { has_arch_pwr8 
&& { has_arch_ppc64 && { ! has_arch_pwr9 } } } } } } */
+
+union di_sf_sf
+{
+  struct {float f1; float f2;};
+  long long l;
+};
+
+int main()
+{
+  union di_sf_sf v;
+  v.f1 = 1.0f;
+  v.f2 = 2.0f;
+  if (sf_from_di_off0 (v.l) != 1.0f || sf_from_di_off4 (v.l) != 2.0f )
+__builtin_abort ();
+  return 0;
+}
-- 
2.17.1