Hi, 

PR81833 identifies a problem with the little-endian vector multiply-sum
instructions.  The original implementation is quite poor (and I am allowed
to say that, since it was mine).  This patch fixes the code properly.

The revised code still uses UNSPECs for these ops, which is not strictly
necessary, although descriptive rtl for them would be pretty complex.  I've
put in a FIXME to make note of that for a future cleanup.

Bootstrapped and tested on powerpc64le-linux-gnu with no regressions.  I am
currently testing on powerpc64-linux-gnu for 32- and 64-bit.  Provided that
testing succeeds, is this ok for trunk, and for eventual backport to all
supported releases?

Thanks,
Bill


[gcc]

2017-08-28  Bill Schmidt  <wschm...@linux.vnet.ibm.com>

        PR target/81833
        * config/rs6000/altivec.md (altivec_vsum2sws): Convert from a
        define_insn to a define_expand.
        (altivec_vsum2sws_direct): New define_insn.
        (altivec_vsumsws): Convert from a define_insn to a define_expand.

[gcc/testsuite]

2017-08-28  Bill Schmidt  <wschm...@linux.vnet.ibm.com>

        PR target/81833
        * gcc.target/powerpc/pr81833.c: New file.


Index: gcc/config/rs6000/altivec.md
===================================================================
--- gcc/config/rs6000/altivec.md        (revision 251369)
+++ gcc/config/rs6000/altivec.md        (working copy)
@@ -1804,51 +1804,61 @@
   "vsum4s<VI_char>s %0,%1,%2"
   [(set_attr "type" "veccomplex")])
 
-;; FIXME: For the following two patterns, the scratch should only be
-;; allocated for !VECTOR_ELT_ORDER_BIG, and the instructions should
-;; be emitted separately.
-(define_insn "altivec_vsum2sws"
-  [(set (match_operand:V4SI 0 "register_operand" "=v")
-        (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")
-                      (match_operand:V4SI 2 "register_operand" "v")]
-                    UNSPEC_VSUM2SWS))
-   (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))
-   (clobber (match_scratch:V4SI 3 "=v"))]
+(define_expand "altivec_vsum2sws"
+  [(use (match_operand:V4SI 0 "register_operand"))
+   (use (match_operand:V4SI 1 "register_operand"))
+   (use (match_operand:V4SI 2 "register_operand"))]
   "TARGET_ALTIVEC"
 {
   if (VECTOR_ELT_ORDER_BIG)
-    return "vsum2sws %0,%1,%2";
+    emit_insn (gen_altivec_vsum2sws_direct (operands[0], operands[1],
+                                            operands[2]));
   else
-    return "vsldoi %3,%2,%2,12\n\tvsum2sws %3,%1,%3\n\tvsldoi %0,%3,%3,4";
-}
-  [(set_attr "type" "veccomplex")
-   (set (attr "length")
-     (if_then_else
-       (match_test "VECTOR_ELT_ORDER_BIG")
-       (const_string "4")
-       (const_string "12")))])
+    {
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      emit_insn (gen_altivec_vsldoi_v4si (tmp1, operands[2],
+                                          operands[2], GEN_INT (12)));
+      emit_insn (gen_altivec_vsum2sws_direct (tmp2, operands[1], tmp1));
+      emit_insn (gen_altivec_vsldoi_v4si (operands[0], tmp2, tmp2,
+                                          GEN_INT (4)));
+    }
+  DONE;
+})
 
-(define_insn "altivec_vsumsws"
+; FIXME: This can probably be expressed without an UNSPEC.
+(define_insn "altivec_vsum2sws_direct"
   [(set (match_operand:V4SI 0 "register_operand" "=v")
         (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")
-                      (match_operand:V4SI 2 "register_operand" "v")]
-                    UNSPEC_VSUMSWS))
-   (set (reg:SI VSCR_REGNO) (unspec:SI [(const_int 0)] UNSPEC_SET_VSCR))
-   (clobber (match_scratch:V4SI 3 "=v"))]
+                     (match_operand:V4SI 2 "register_operand" "v")]
+                    UNSPEC_VSUM2SWS))]
   "TARGET_ALTIVEC"
+  "vsum2sws %0,%1,%2"
+  [(set_attr "type" "veccomplex")
+   (set_attr "length" "4")])
+
+(define_expand "altivec_vsumsws"
+  [(use (match_operand:V4SI 0 "register_operand"))
+   (use (match_operand:V4SI 1 "register_operand"))
+   (use (match_operand:V4SI 2 "register_operand"))]
+  "TARGET_ALTIVEC"
 {
   if (VECTOR_ELT_ORDER_BIG)
-    return "vsumsws %0,%1,%2";
+    emit_insn (gen_altivec_vsumsws_direct (operands[0], operands[1],
+                                           operands[2]));
   else
-    return "vspltw %3,%2,0\n\tvsumsws %3,%1,%3\n\tvsldoi %0,%3,%3,12";
-}
-  [(set_attr "type" "veccomplex")
-   (set (attr "length")
-     (if_then_else
-       (match_test "(VECTOR_ELT_ORDER_BIG)")
-       (const_string "4")
-       (const_string "12")))])
+    {
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      emit_insn (gen_altivec_vspltw_direct (tmp1, operands[2], const0_rtx));
+      emit_insn (gen_altivec_vsumsws_direct (tmp2, operands[1], tmp1));
+      emit_insn (gen_altivec_vsldoi_v4si (operands[0], tmp2, tmp2,
+                                          GEN_INT (12)));
+    }
+  DONE;
+})
 
+; FIXME: This can probably be expressed without an UNSPEC.
 (define_insn "altivec_vsumsws_direct"
   [(set (match_operand:V4SI 0 "register_operand" "=v")
         (unspec:V4SI [(match_operand:V4SI 1 "register_operand" "v")
Index: gcc/testsuite/gcc.target/powerpc/pr81833.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/pr81833.c  (nonexistent)
+++ gcc/testsuite/gcc.target/powerpc/pr81833.c  (working copy)
@@ -0,0 +1,54 @@
+/* PR81833: This used to fail due to improper implementation of vec_msum.  */
+
+/* { dg-do run {target { lp64 } } } */
+/* { dg-require-effective-target powerpc_altivec_ok } */
+
+#include <altivec.h>
+
+#define vec_u8  vector unsigned char
+#define vec_s8  vector signed char
+#define vec_u16 vector unsigned short
+#define vec_s16 vector signed short
+#define vec_u32 vector unsigned int
+#define vec_s32 vector signed int
+#define vec_f   vector float
+
+#define LOAD_ZERO const vec_u8 zerov = vec_splat_u8 (0)
+
+#define zero_u8v  (vec_u8)  zerov
+#define zero_s8v  (vec_s8)  zerov
+#define zero_u16v (vec_u16) zerov
+#define zero_s16v (vec_s16) zerov
+#define zero_u32v (vec_u32) zerov
+#define zero_s32v (vec_s32) zerov
+
+signed int __attribute__((noinline))
+scalarproduct_int16_vsx (const signed short *v1, const signed short *v2,
+                        int order)
+{
+  int i;
+  LOAD_ZERO;
+  register vec_s16 vec1;
+  register vec_s32 res = vec_splat_s32 (0), t;
+  signed int ires;
+
+  for (i = 0; i < order; i += 8) {
+    vec1 = vec_vsx_ld (0, v1);
+    t    = vec_msum (vec1, vec_ld (0, v2), zero_s32v);
+    res  = vec_sums (t, res);
+    v1  += 8;
+    v2  += 8;
+  }
+  res = vec_splat (res, 3);
+  vec_ste (res, 0, &ires);
+
+  return ires;
+}
+
+int main(void)
+{
+  const signed short test_vec[] = { 1, 1, 1, 1, 1, 1, 1, 1 };
+  if (scalarproduct_int16_vsx (test_vec, test_vec, 8) != 8)
+    __builtin_abort ();
+  return 0;
+}

Reply via email to