Before this patch, GCC would zero extend a DImode GPR value to TImode by first
zero extending the DImode value into a TImode GPR register pair, and then doing
a MTVSRDD to move this value to a VSX register.
For example, consider the following code:
#ifndef TYPE
#define TYPE unsigned long long
#endif
/* Widen X to 128 bits, pass it through an asm that requires a VSX register
   ("wa" constraint), and store the result through P.  */
void
gpr_to_vsx (TYPE x, __uint128_t *p)
{
__uint128_t y = x;
__asm__ (" # %x0" : "+wa" (y));
*p = y;
}
Currently GCC generates:
gpr_to_vsx:
mr 10,3
li 11,0
mtvsrdd 0,11,10
#APP
# 0
#NO_APP
stxv 0,0(4)
blr
I.e. the mr and li instructions create the zero extended TImode value
in a GPR pair, and then the mtvsrdd instruction moves both registers into a
single vector register.
Instead, GCC should generate the following code, because the mtvsrdd
instruction clears the upper 64 bits when its 2nd argument is 0 (a
non-zero 2nd argument names the GPR to put in the upper 64 bits):
gpr_to_vsx:
mtvsrdd 0,0,3
#APP
# 0
#NO_APP
stxv 0,0(4)
blr
Originally, I posted a patch that added the zero_extendsiti2 insn. I
got some pushback about using reload_completed in the split portion of
the define_insn_and_split. However, this is a case where you
absolutely have to use the reload_completed test, because if you split
the code before register allocation to handle the normal case, the split
insns will not be compiled to generate the appropriate mtvsrdd without
creating the TImode value in the GPR register. I can imagine there
might be concern about favoring generating code using the vector
registers instead of using the GPR registers if the code does not
require the TImode value to be in a vector register.
I completely rewrote the patch. This patch creates a peephole2 to
catch this case, and it eliminates creating the TImode variable.
Instead it just does the MTVSRDD instruction directly. That way it
will not influence register allocation, and the code will only be
generated in the specific case where we need the TImode value in a
vector register.
I have built GCC with the patches in this patch set applied on both
little and big endian PowerPC systems and there were no regressions.
Can I apply this patch to GCC 16?
2025-11-15 Michael Meissner <[email protected]>
gcc/
PR target/108958
* config/rs6000/rs6000.md (UNSPEC_ZERO_EXTEND): New unspec.
(zero_extendsiti2 peephole2): Add a peephole2 to simplify zero
extending a DImode value in a GPR to a TImode target in a
vector register.
(zero_extendsiti2_vsx): New insn.
gcc/testsuite/
PR target/108958
* gcc.target/powerpc/pr108958.c: New test.
---
gcc/config/rs6000/rs6000.md | 26 ++++++++++++
gcc/testsuite/gcc.target/powerpc/pr108958.c | 47 +++++++++++++++++++++
2 files changed, 73 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/powerpc/pr108958.c
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 6b47d2ce8cf..95aa925e9e4 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -173,6 +173,7 @@ (define_c_enum "unspec"
UNSPEC_XXSPLTIW_CONST
UNSPEC_FMAX
UNSPEC_FMIN
+ UNSPEC_ZERO_EXTEND
])
;;
@@ -969,6 +970,31 @@ (define_insn_and_split "*zero_extendhi<mode>2_dot2"
(set_attr "dot" "yes")
(set_attr "length" "4,8")])
+;; If a DImode GPR is zero extended into a TImode GPR pair only to be moved to
+;; a VSX register, emit a single mtvsrdd with a zero upper doubleword instead.
+(define_peephole2
+ [(set (match_operand:DI 0 "int_reg_operand")
+ (match_operand:DI 1 "int_reg_operand"))
+ (set (match_operand:DI 2 "int_reg_operand")
+ (const_int 0))
+ (set (match_operand:TI 3 "vsx_register_operand")
+ (match_operand:TI 4 "int_reg_operand"))]
+ "TARGET_DIRECT_MOVE_64BIT
+ && (reg_or_subregno (operands[0])
+ == reg_or_subregno (operands[4]) + !!WORDS_BIG_ENDIAN)
+ && (reg_or_subregno (operands[2])
+ == reg_or_subregno (operands[4]) + !WORDS_BIG_ENDIAN)
+ && peep2_reg_dead_p (3, operands[4])"
+ [(set (match_dup 3)
+ (unspec:TI [(match_dup 1)] UNSPEC_ZERO_EXTEND))])
+
+(define_insn "*zero_extendsiti2_vsx"
+ [(set (match_operand:TI 0 "vsx_register_operand" "=wa")
+ (unspec:TI [(match_operand:DI 1 "int_reg_operand" "r")]
+ UNSPEC_ZERO_EXTEND))]
+ "TARGET_DIRECT_MOVE_64BIT"
+ "mtvsrdd %x0,0,%1"
+ [(set_attr "type" "mtvsr")])
;; On power10, optimize zero extending a QI/HI/SI/DImode value from memory that
;; is going to a vector register target by generating a LXVR{B,H,W,D}X
diff --git a/gcc/testsuite/gcc.target/powerpc/pr108958.c b/gcc/testsuite/gcc.target/powerpc/pr108958.c
new file mode 100644
index 00000000000..21b3f276691
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr108958.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target int128 } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-mdejagnu-cpu=power9 -O2" } */
+
+#ifndef TYPE
+#define TYPE unsigned long long
+#endif
+
+/* PR target/108958, when zero extending a DImode to a TImode, and the TImode variable is in a VSX register, generate:
+
+ mtvsrdd vreg,0,gpr
+
+ instead of:
+
+ mr tmp,gpr
+ li tmp+1,0
+ mtvsrdd vreg,tmp+1,tmp. */
+
+void
+gpr_to_vsx (TYPE x, __uint128_t *p)
+{
+ /* mtvsrdd 0,0,3
+ stxv 0,0(4) */
+
+ __uint128_t y = x;
+ __asm__ (" # %x0" : "+wa" (y));
+ *p = y;
+}
+
+void
+gpr_to_gpr (TYPE x, __uint128_t *p)
+{
+ /* mr 2,3
+ li 3,0
+ std 2,0(4)
+ std 3,8(4) (the "+r" asm keeps y in GPRs, so no mtvsrdd) */
+
+ __uint128_t y = x;
+ __asm__ (" # %0" : "+r" (y));
+ *p = y;
+}
+
+/* { dg-final { scan-assembler-times {\mli\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mmtvsrdd .*,0,.*\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstxv\M} 1 } } */
--
2.51.1
--
Michael Meissner, IBM
PO Box 98, Ayer, Massachusetts, USA, 01432
email: [email protected]