Hi!

WIDEN_MULT_PLUS_EXPR as documented has the factor operands with
the same precision and the addend and result another one at least twice
as wide.
Similarly, {,u}maddMN4 is documented as
'maddMN4'
     Multiply operands 1 and 2, sign-extend them to mode N, add operand
     3, and store the result in operand 0.  Operands 1 and 2 have mode M
     and operands 0 and 3 have mode N.  Both modes must be integer or
     fixed-point modes and N must be twice the size of M.

     In other words, 'maddMN4' is like 'mulMN3' except that it also adds
     operand 3.

     These instructions are not allowed to 'FAIL'.

'umaddMN4'
     Like 'maddMN4', but zero-extend the multiplication operands instead
     of sign-extending them.
The PR103109 addition of these expanders to rs6000 didn't handle this
correctly though, it treated the last argument as also having mode M
sign or zero extended into N.  Unfortunately this means incorrect code
generation whenever the last operand isn't really sign or zero extended
from DImode to TImode.

The following patch removes maddditi4 expander altogether from rs6000.md,
because we'd need
        maddhd 9,3,4,5
        sradi 10,5,63
        maddld 3,3,4,5
        sub 9,9,10
        add 4,9,6
which is longer than
        mulld 9,3,4
        mulhd 4,3,4
        addc 3,9,5
        adde 4,4,6
and nothing would be able to optimize the case of last operand already
sign-extended from DImode to TImode into just
        mr 9,3
        maddld 3,3,4,5
        maddhd 4,9,4,5
or so.  And fixes umaddditi4, so that it emits an add at the end to add
the high half of the last operand, fortunately in this case if the high
half of the last operand is known to be zero (i.e. last operand is zero
extended from DImode to TImode) then combine will drop the useless add.

If we wanted to get back the signed op1 * op2 + op3 all in the DImode
into TImode op0, we'd need to introduce a new tree code next to
WIDEN_MULT_PLUS_EXPR and maddMN4 expander, because I'm afraid it can't
be done at expansion time in maddMN4 expander to detect whether the
operand is sign extended especially because of SUBREGs and the awkwardness
of looking at earlier emitted instructions, and combine would need 5
instruction combination.

Bootstrapped/regtested on powerpc64-linux (power7, tested -m32/-m64),
powerpc64le-linux (power8 and another on power9 with
--with-cpu-64=power9 --with-tune-64=power9), preapproved by Segher in the
PR, committed to trunk.

2023-02-15  Jakub Jelinek  <ja...@redhat.com>

        PR target/108787
        PR target/103109
        * config/rs6000/rs6000.md (<u>maddditi4): Change into umaddditi4 only
        expander, change operand 3 to be TImode, emit maddlddi4 and
        umadddi4_highpart{,_le} with its low half and finally add the high
        half to the result.

        * gcc.dg/pr108787.c: New test.
        * gcc.target/powerpc/pr108787.c: New test.
        * gcc.target/powerpc/pr103109-1.c: Adjust expected instruction counts.

--- gcc/config/rs6000/rs6000.md.jj      2023-01-16 11:52:16.036734757 +0100
+++ gcc/config/rs6000/rs6000.md 2023-02-14 21:02:05.637399466 +0100
@@ -3226,25 +3226,40 @@ (define_insn "maddld<mode>4"
   "maddld %0,%1,%2,%3"
   [(set_attr "type" "mul")])
 
-(define_expand "<u>maddditi4"
+;; umaddditi4 generally needs maddhdu + maddld + add instructions,
+;; unless last operand is zero extended from DImode, then needs
+;; maddhdu + maddld, which is both faster than mulld + mulhdu + addc + adde
+;; resp. mulld + mulhdu + addc + addze.
+;; We don't define maddditi4, as that one needs
+;; maddhd + sradi + maddld + add + sub and for last operand sign extended
+;; from DImode nothing is able to optimize it into maddhd + maddld, while
+;; without maddditi4 mulld + mulhd + addc + adde or
+;; mulld + mulhd + sradi + addc + adde is needed.  See PR108787.
+(define_expand "umaddditi4"
   [(set (match_operand:TI 0 "gpc_reg_operand")
        (plus:TI
-         (mult:TI (any_extend:TI (match_operand:DI 1 "gpc_reg_operand"))
-                  (any_extend:TI (match_operand:DI 2 "gpc_reg_operand")))
-         (any_extend:TI (match_operand:DI 3 "gpc_reg_operand"))))]
+         (mult:TI (zero_extend:TI (match_operand:DI 1 "gpc_reg_operand"))
+                  (zero_extend:TI (match_operand:DI 2 "gpc_reg_operand")))
+         (match_operand:TI 3 "gpc_reg_operand")))]
   "TARGET_MADDLD && TARGET_POWERPC64"
 {
   rtx op0_lo = gen_rtx_SUBREG (DImode, operands[0], BYTES_BIG_ENDIAN ? 8 : 0);
   rtx op0_hi = gen_rtx_SUBREG (DImode, operands[0], BYTES_BIG_ENDIAN ? 0 : 8);
+  rtx op3_lo = gen_rtx_SUBREG (DImode, operands[3], BYTES_BIG_ENDIAN ? 8 : 0);
+  rtx op3_hi = gen_rtx_SUBREG (DImode, operands[3], BYTES_BIG_ENDIAN ? 0 : 8);
+  rtx hi_temp = gen_reg_rtx (DImode);
 
-  emit_insn (gen_maddlddi4 (op0_lo, operands[1], operands[2], operands[3]));
+  emit_insn (gen_maddlddi4 (op0_lo, operands[1], operands[2], op3_lo));
 
   if (BYTES_BIG_ENDIAN)
-    emit_insn (gen_<u>madddi4_highpart (op0_hi, operands[1], operands[2],
-                                       operands[3]));
+    emit_insn (gen_umadddi4_highpart (hi_temp, operands[1], operands[2],
+                                     op3_lo));
   else
-    emit_insn (gen_<u>madddi4_highpart_le (op0_hi, operands[1], operands[2],
-                                          operands[3]));
+    emit_insn (gen_umadddi4_highpart_le (hi_temp, operands[1], operands[2],
+                                        op3_lo));
+
+  emit_insn (gen_adddi3 (op0_hi, hi_temp, op3_hi));
+
   DONE;
 })
 
--- gcc/testsuite/gcc.dg/pr108787.c.jj  2023-02-14 21:18:21.285191941 +0100
+++ gcc/testsuite/gcc.dg/pr108787.c     2023-02-14 21:17:03.545324004 +0100
@@ -0,0 +1,27 @@
+/* PR target/108787 */
+/* { dg-do run { target int128 } } */
+/* { dg-options "-O2" } */
+
+__attribute__((noipa)) unsigned __int128
+foo (unsigned long long x, unsigned long long y, unsigned long long z, 
unsigned long long u, unsigned long long v, unsigned long long w)
+{
+  unsigned __int128 r, d;
+  r = ((unsigned __int128) x * u);
+  d = ((unsigned __int128) y * w);
+  r += d;
+  d = ((unsigned __int128) z * v);
+  r += d;
+  return r;
+}
+
+int
+main ()
+{
+  if (__CHAR_BIT__ != 8 || __SIZEOF_LONG_LONG__ != 8 || __SIZEOF_INT128__ != 
16)
+    return 0;
+  unsigned __int128 x = foo (0x3efe88da491ULL, 0xd105e9b4a44ULL, 
0x4efa677b3dbULL, 0x42c052bac7bULL, 0x99638a13199cULL, 0x56b640d064ULL);
+  if ((unsigned long long) (x >> 64) != 0x000000000309ff93ULL
+      || (unsigned long long) x != 0xbd5c98fdf2bdbcafULL)
+    __builtin_abort ();
+  return 0;
+}
--- gcc/testsuite/gcc.target/powerpc/pr108787.c.jj      2023-02-14 
21:19:13.292434605 +0100
+++ gcc/testsuite/gcc.target/powerpc/pr108787.c 2023-02-14 21:19:25.499256860 
+0100
@@ -0,0 +1,6 @@
+/* PR target/108787 */
+/* { dg-do run { target int128 } } */
+/* { dg-require-effective-target p9vector_hw } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#include "../../gcc.dg/pr108787.c"
--- gcc/testsuite/gcc.target/powerpc/pr103109-1.c.jj    2022-08-19 
16:00:05.198388230 +0200
+++ gcc/testsuite/gcc.target/powerpc/pr103109-1.c       2023-02-15 
09:40:42.183123930 +0100
@@ -3,8 +3,8 @@
 /* { dg-require-effective-target int128 } */
 /* { dg-require-effective-target powerpc_p9modulo_ok } */
 /* { dg-require-effective-target has_arch_ppc64 } */
-/* { dg-final { scan-assembler-times {\mmaddld\M} 2 } } */
-/* { dg-final { scan-assembler-times {\mmaddhd\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mmaddld\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mmaddhd\M} 0 } } */
 /* { dg-final { scan-assembler-times {\mmaddhdu\M} 1 } } */
 
 #include "pr103109.h"

        Jakub

Reply via email to