https://gcc.gnu.org/g:2f719014bfec1d21eceb409db6eff78eb92942f3

commit r16-4481-g2f719014bfec1d21eceb409db6eff78eb92942f3
Author: Tamar Christina <[email protected]>
Date:   Sat Oct 18 08:21:56 2025 +0100

    AArch64: Implement widen_[us]sum using dotproduct for SVE [PR122069]
    
    This patch implements support for using dotproduct to do sum reductions by
    changing += a into += (a * 1).  i.e. we seed the multiplication with 1.
    
    Given the example
    
    int foo_int(unsigned char *x, unsigned char * restrict y) {
      int sum = 0;
      for (int i = 0; i < 8000; i++)
         sum += char_abs(x[i] - y[i]);
      return sum;
    }
    
    we used to generate
    
    .L2:
            ld1b    z1.b, p7/z, [x0, x2]
            ld1b    z29.b, p7/z, [x1, x2]
            sub     z29.b, z1.b, z29.b
            uunpklo z0.h, z29.b
            uunpkhi z29.h, z29.b
            uunpklo z30.s, z0.h
            add     z31.s, p6/m, z31.s, z30.s
            uunpkhi z0.s, z0.h
            add     z31.s, p5/m, z31.s, z0.s
            uunpklo z28.s, z29.h
            add     z31.s, p4/m, z31.s, z28.s
            uunpkhi z29.s, z29.h
            add     z31.s, p3/m, z31.s, z29.s
            add     x2, x2, x7
            whilelo p7.b, w2, w3
            whilelo p3.s, w2, w6
            whilelo p4.s, w2, w5
            whilelo p5.s, w2, w4
            whilelo p6.s, w2, w3
            b.any   .L2
            ptrue   p7.b, all
            uaddv   d31, p7, z31.s
    
    but now generates with +dotprod
    
    .L3:
            ld1b    z30.b, p7/z, [x5, x2]
            ld1b    z29.b, p7/z, [x1, x2]
            sub     z30.b, z30.b, z29.b
            udot    z31.s, z30.b, z28.b
            mov     x3, x2
            add     x2, x2, x6
            cmp     w2, w0
            bls     .L3
            incb    x3
            uaddv   d31, p7, z31.s
    
    gcc/ChangeLog:
    
            PR middle-end/122069
            * config/aarch64/aarch64-sve.md (widen_<sur>sum<mode><vsi2qi>3): New.
    
    gcc/testsuite/ChangeLog:
    
            PR middle-end/122069
            * gcc.target/aarch64/sve/pr122069_1.c: New test.
            * gcc.target/aarch64/sve/pr122069_2.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-sve.md                 | 16 +++++
 gcc/testsuite/gcc.target/aarch64/sve/pr122069_1.c | 45 +++++++++++++
 gcc/testsuite/gcc.target/aarch64/sve/pr122069_2.c | 81 +++++++++++++++++++++++
 3 files changed, 142 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 8c47d441c3fd..550ff0a3cde6 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -7722,6 +7722,22 @@
   [(set_attr "sve_type" "sve_int_dot")]
 )
 
+;; Define double widen_[su]sum as dotproduct
+;; Use dot product to perform double widening sum reductions by
+;; changing += a into += (a * 1).  i.e. we seed the multiplication with 1.
+(define_expand "widen_<sur>sum<mode><vsi2qi>3"
+  [(set (match_operand:SVE_FULL_SDI 0 "register_operand")
+       (plus:SVE_FULL_SDI
+         (unspec:SVE_FULL_SDI
+           [(match_operand:<VSI2QI> 1 "register_operand")
+            (match_dup 3)]
+           DOTPROD)
+         (match_operand:SVE_FULL_SDI 2 "register_operand")))]
+  "TARGET_SVE"
+{
+  operands[3] = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
+})
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] Sum of absolute differences
 ;; -------------------------------------------------------------------------
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122069_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122069_1.c
new file mode 100644
index 000000000000..5d1f61f4a6a8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122069_1.c
@@ -0,0 +1,45 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
+/* { dg-final { check-function-bodies "**" "" } } */
+
+inline char char_abs(char i) {
+  return (i < 0 ? -i : i);
+}
+
+/*
+** foo_int:
+**     ...
+**     sub     z[0-9]+.b, z[0-9]+.b, z[0-9]+.b
+**     udot    z[0-9]+.s, z[0-9]+.b, z[0-9]+.b
+**     ...
+*/
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+  int sum = 0;
+  for (int i = 0; i < 8000; i++)
+     sum += char_abs(x[i] - y[i]);
+  return sum;
+}
+
+/* 
+** foo2_int:
+**     ...
+**     add     z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
+**     punpklo p[0-9]+.h, p[0-9]+.b
+**     uunpklo z[0-9]+.s, z[0-9]+.h
+**     add     z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
+**     punpkhi p[0-9]+.h, p[0-9]+.b
+**     uunpkhi z[0-9]+.s, z[0-9]+.h
+**     add     z[0-9]+.s, p[0-9]+/m, z[0-9]+.s, z[0-9]+.s
+**     ...
+*/
+int foo2_int(unsigned short *x, unsigned short * restrict y) {
+  int sum = 0;
+  for (int i = 0; i < 8000; i++)
+    {
+      x[i] = x[i] + y[i];
+      sum += x[i];
+    }
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122069_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122069_2.c
new file mode 100644
index 000000000000..b9e0010114f3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122069_2.c
@@ -0,0 +1,81 @@
+/* { dg-do run } */
+/* { dg-require-effective-target aarch64_sve_hw }  */
+/* { dg-options "-O3 -march=armv8-a+sve -mautovec-preference=sve-only -fdump-tree-vect-details" }*/
+
+inline char char_abs(char i) {
+  return (i < 0 ? -i : i);
+}
+
+__attribute__((noipa))
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+  int sum = 0;
+  for (int i = 0; i < 100; i++)
+     sum += char_abs(x[i] - y[i]);
+  return sum;
+}
+
+__attribute__((noipa))
+int foo2_int(unsigned short *x, unsigned short * restrict y,
+            unsigned short * restrict z) {
+  int sum = 0;
+  for (int i = 0; i < 100; i++)
+    {
+      z[i] = x[i] + y[i];
+      sum += z[i];
+    }
+  return sum;
+}
+
+__attribute__((noipa))
+int foo_int2(unsigned char *x, unsigned char * restrict y) {
+  int sum = 0;
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+     sum += char_abs(x[i] - y[i]);
+  return sum;
+}
+
+__attribute__((noipa))
+int foo2_int2(unsigned short *x, unsigned short * restrict y,
+             unsigned short * restrict z) {
+  int sum = 0;
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+    {
+      z[i] = x[i] + y[i];
+      sum += z[i];
+    }
+  return sum;
+}
+
+int main ()
+{
+  unsigned short a[100];
+  unsigned short b[100];
+  unsigned short r1[100];
+  unsigned short r2[100];
+  unsigned char c[100];
+  unsigned char d[100];
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+    {
+      a[i] = c[i] = i;
+      b[i] = d[i] = 100 - i;
+    }
+
+  if (foo_int (c, d) != foo_int2 (c, d))
+    __builtin_abort();
+
+
+  if (foo2_int (a, b, r1) != foo2_int2 (a, b, r2))
+    __builtin_abort();
+
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+    if (r1[i] != r2[i])
+      __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
\ No newline at end of file

Reply via email to