https://gcc.gnu.org/g:08c64787513c2cc944b98f14a72ded893f568287

commit 08c64787513c2cc944b98f14a72ded893f568287
Author: Michael Meissner <meiss...@linux.ibm.com>
Date:   Wed Jul 2 19:35:03 2025 -0400

    PR target/108958 -- simplify mtvsrdd to zero extend GPR DImode to VSX TImode
    
    Before this patch GCC would zero extend a DImode GPR value to TImode by
    first zero extending the DImode value into a GPR TImode register pair,
    and then do a MTVSRDD to move this value to a VSX register.
    
    For example, consider the following code:
    
            #ifndef TYPE
            #define TYPE unsigned long long
            #endif
    
            void
            gpr_to_vsx (TYPE x, __uint128_t *p)
            {
              __uint128_t y = x;
              __asm__ (" # %x0" : "+wa" (y));
              *p = y;
            }
    
    Currently GCC generates:
    
            gpr_to_vsx:
                    mr 10,3
                    li 11,0
                    mtvsrdd 0,11,10
            #APP
                     # 0
            #NO_APP
                    stxv 0,0(4)
                    blr
    
    I.e. the mr and li instructions create the zero extended TImode value
    in a GPR, and then the mtvsrdd instruction moves both registers into a
    single vector register.
    
    Instead, GCC should generate the following code, since the mtvsrdd
    instruction will clear the upper 64 bits if the 2nd argument is 0
    (a non-zero value names the GPR to put in the upper 64 bits):
    
            gpr_to_vsx:
                    mtvsrdd 0,0,3
            #APP
                     # 0
            #NO_APP
                    stxv 0,0(4)
                    blr
    
    Originally, I posted a patch that added the zero_extendsiti2 insn.  I
    got some pushback about using reload_completed in the split portion of
    the define_insn_and_split.  However, this is a case where you
    absolutely have to use the reload_completed test, because if you split
    the code before register allocation to handle the normal case, the split
    insns will not be compiled to generate the appropriate mtvsrdd without
    creating the TImode value in the GPR register.  I can imagine there
    might be concern about favoring generating code using the vector
    registers instead of using the GPR registers if the code does not
    require the TImode value to be in a vector register.
    
    I completely rewrote the patch.  This patch creates a peephole2 to
    catch this case, and it eliminates creating the TImode variable.
    Instead it just does the MTVSRDD instruction directly.  That way it
    will not influence register allocation, and the code will only be
    generated in the specific case where we need the TImode value in a
    vector register.
    
    I have built GCC with the patches in this patch set applied on both
    little and big endian PowerPC systems and there were no regressions.
    Can I apply this patch to GCC 16?
    
    2025-07-02  Michael Meissner  <meiss...@linux.ibm.com>
    
    gcc/
    
            PR target/108958
            * config/rs6000/rs6000.md (UNSPEC_ZERO_EXTEND): New unspec.
            (zero_extendsiti2 peephole2): Add a peephole2 to simplify zero
            extend between DImode value in a GPR to a TImode target in a
            vector register.
            (zero_extendsiti2_vsx): New insn.
    
    gcc/testsuite/
    
            PR target/108958
            * gcc.target/powerpc/pr108958.c: New test.

Diff:
---
 gcc/config/rs6000/rs6000.md                 | 26 ++++++++++++++++
 gcc/testsuite/gcc.target/powerpc/pr108958.c | 47 +++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+)

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index c13101eb4318..1f87587ef91b 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -173,6 +173,7 @@
    UNSPEC_XXSPLTIW_CONST
    UNSPEC_FMAX
    UNSPEC_FMIN
+   UNSPEC_ZERO_EXTEND
   ])
 
 ;;
@@ -969,6 +970,31 @@
    (set_attr "dot" "yes")
    (set_attr "length" "4,8")])
 
+;; Optimize zero extending a DImode GPR to a TImode GPR pair that is then moved
+;; to a VSX register: emit one mtvsrdd with a zero upper doubleword instead.
+(define_peephole2
+  [(set (match_operand:DI 0 "int_reg_operand")
+       (match_operand:DI 1 "int_reg_operand"))
+   (set (match_operand:DI 2 "int_reg_operand")
+       (const_int 0))
+   (set (match_operand:TI 3 "vsx_register_operand")
+       (match_operand:TI 4 "int_reg_operand"))]
+  "TARGET_DIRECT_MOVE_64BIT
+   && (reg_or_subregno (operands[0])
+       == reg_or_subregno (operands[4]) + !!WORDS_BIG_ENDIAN)
+   && (reg_or_subregno (operands[2])
+       == reg_or_subregno (operands[4]) + !WORDS_BIG_ENDIAN)
+   && peep2_reg_dead_p (3, operands[4])"
+  [(set (match_dup 3)
+       (unspec:TI [(match_dup 1)] UNSPEC_ZERO_EXTEND))])
+
+(define_insn "*zero_extendsiti2_vsx"
+  [(set (match_operand:TI 0 "vsx_register_operand" "=wa")
+       (unspec:TI [(match_operand:DI 1 "int_reg_operand" "r")]
+                  UNSPEC_ZERO_EXTEND))]
+  "TARGET_DIRECT_MOVE_64BIT"
+  "mtvsrdd %x0,0,%1"
+  [(set_attr "type" "mtvsr")])
 
 (define_insn "zero_extendsi<mode>2"
   [(set (match_operand:EXTSI 0 "gpc_reg_operand" "=r,r,d,wa,wa,r,wa")
diff --git a/gcc/testsuite/gcc.target/powerpc/pr108958.c b/gcc/testsuite/gcc.target/powerpc/pr108958.c
new file mode 100644
index 000000000000..21b3f2766918
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr108958.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target int128 } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-mdejagnu-cpu=power9 -O2" } */
+
+#ifndef TYPE
+#define TYPE unsigned long long
+#endif
+
+/* PR target/108958, when zero extending a DImode to a TImode, and the TImode variable is in a VSX register, generate:
+
+       mtvsrdd vreg,0,gpr
+
+   instead of:
+
+       mr tmp,gpr
+       li tmp+1,0
+       mtvsrdd vreg,tmp+1,tmp.  */
+
+void
+gpr_to_vsx (TYPE x, __uint128_t *p)
+{
+  /* mtvsrdd 0,0,3
+     stxv 0,0(4)  */
+
+  __uint128_t y = x;
+  __asm__ (" # %x0" : "+wa" (y));
+  *p = y;
+}
+
+void
+gpr_to_gpr (TYPE x, __uint128_t *p)
+{
+  /* mr 2,3
+     li 3,0
+     std 2,0(4)
+     std 3,8(4)   (GPR only; no mtvsrdd expected here)  */
+
+  __uint128_t y = x;
+  __asm__ (" # %0" : "+r" (y));
+  *p = y;
+}
+
+/* { dg-final { scan-assembler-times {\mli\M}              1 } } */
+/* { dg-final { scan-assembler-times {\mmtvsrdd .*,0,.*\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mstd\M}             2 } } */
+/* { dg-final { scan-assembler-times {\mstxv\M}            1 } } */

Reply via email to