https://gcc.gnu.org/g:c8dc5d5070c09792bf8d224cac90989885818aaf

commit r16-4477-gc8dc5d5070c09792bf8d224cac90989885818aaf
Author: Tamar Christina <[email protected]>
Date:   Sat Oct 18 08:20:07 2025 +0100

    AArch64: add double widen_sum optab using dotprod for Adv.SIMD [PR122069]
    
    This patch implements support for using dot product to do sum reductions by
    changing += a into += (a * 1), i.e. we seed the multiplication with 1.
    
    Given the example
    
    int foo_int(unsigned char *x, unsigned char * restrict y) {
      int sum = 0;
      for (int i = 0; i < 8000; i++)
         sum += char_abs(x[i] - y[i]);
      return sum;
    }
    
    we used to generate
    
    .L2:
            ldr     q0, [x0, x2]
            ldr     q28, [x1, x2]
            sub     v28.16b, v0.16b, v28.16b
            zip1    v29.16b, v28.16b, v31.16b
            zip2    v28.16b, v28.16b, v31.16b
            uaddw   v30.4s, v30.4s, v29.4h
            uaddw2  v30.4s, v30.4s, v29.8h
            uaddw   v30.4s, v30.4s, v28.4h
            uaddw2  v30.4s, v30.4s, v28.8h
            add     x2, x2, 16
            cmp     x2, x3
            bne     .L2
            addv    s31, v30.4s
    
    but now generates with +dotprod
    
    .L2:
            ldr     q29, [x0, x2]
            ldr     q28, [x1, x2]
            sub     v28.16b, v29.16b, v28.16b
            udot    v31.4s, v28.16b, v30.16b
            add     x2, x2, 16
            cmp     x2, x3
            bne     .L2
            addv    s31, v31.4s
    
    gcc/ChangeLog:
    
            PR middle-end/122069
            * config/aarch64/aarch64-simd.md (widen_ssum<mode><vsi2qi>3): New.
            (widen_usum<mode><vsi2qi>3): New.
    
    gcc/testsuite/ChangeLog:
    
            PR middle-end/122069
            * gcc.target/aarch64/pr122069_3.c: New test.
            * gcc.target/aarch64/pr122069_4.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-simd.md            | 32 +++++++++++
 gcc/testsuite/gcc.target/aarch64/pr122069_3.c | 41 ++++++++++++++
 gcc/testsuite/gcc.target/aarch64/pr122069_4.c | 81 +++++++++++++++++++++++++++
 3 files changed, 154 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 6488119a1402..eaa8d57cc413 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -4703,6 +4703,38 @@
   DONE;
 })
 
+;; Use dot product to perform signed double widening sum reductions by
+;; changing += a into += (a * 1).  i.e. we seed the multiplication with 1.
+(define_expand "widen_ssum<mode><vsi2qi>3"
+  [(set (match_operand:VS 0 "register_operand")
+       (plus:VS (sign_extend:VS
+                  (match_operand:<VSI2QI> 1 "register_operand"))
+                (match_operand:VS 2 "register_operand")))]
+  "TARGET_DOTPROD"
+  {
+    rtx ones = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
+    emit_insn (gen_sdot_prod<mode><vsi2qi> (operands[0], operands[1], ones,
+                                           operands[2]));
+    DONE;
+  }
+)
+
+;; Use dot product to perform unsigned double widening sum reductions by
+;; changing += a into += (a * 1).  i.e. we seed the multiplication with 1.
+(define_expand "widen_usum<mode><vsi2qi>3"
+  [(set (match_operand:VS 0 "register_operand")
+       (plus:VS (zero_extend:VS
+                       (match_operand:<VSI2QI> 1 "register_operand"))
+                     (match_operand:VS 2 "register_operand")))]
+  "TARGET_DOTPROD"
+  {
+    rtx ones = force_reg (<VSI2QI>mode, CONST1_RTX (<VSI2QI>mode));
+    emit_insn (gen_udot_prod<mode><vsi2qi> (operands[0], operands[1], ones,
+                                           operands[2]));
+    DONE;
+  }
+)
+
 (define_insn "aarch64_<ANY_EXTEND:su>subw<mode>"
   [(set (match_operand:<VWIDE> 0 "register_operand" "=w")
        (minus:<VWIDE> (match_operand:<VWIDE> 1 "register_operand" "w")
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122069_3.c b/gcc/testsuite/gcc.target/aarch64/pr122069_3.c
new file mode 100644
index 000000000000..0e832c43032a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122069_3.c
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+dotprod -mautovec-preference=asimd-only --param vect-epilogues-nomask=0 -fno-schedule-insns -fno-reorder-blocks -fno-schedule-insns2 -fdump-tree-vect-details" }*/
+/* { dg-final { check-function-bodies "**" "" } } */
+
+inline char char_abs(char i) {  /* Absolute-value helper on plain char, whose signedness is implementation-defined.  */
+  return (i < 0 ? -i : i);  /* -i is computed as int and narrowed back to char on return.  */
+}  /* NOTE(review): plain char is presumably unsigned for this target (confirm against the ABI); the i < 0 arm would then be dead, consistent with the expected foo_int body going straight from sub to udot.  */
+
+/*
+** foo_int:
+**     ...
+**     sub     v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**     udot    v[0-9]+.4s, v[0-9]+.16b, v[0-9]+.16b
+**     ...
+*/
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+  int sum = 0;  /* int accumulator over byte inputs: the double-widening sum the expected udot covers.  */
+  for (int i = 0; i < 8000; i++)
+     sum += char_abs(x[i] - y[i]);  /* The difference is an int, converted to char for the call.  */
+  return sum;
+}
+
+/*
+** foo2_int:
+**     ...
+**     add     v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+**     uaddw   v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4h
+**     uaddw2  v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.8h
+**     ...
+*/
+int foo2_int(unsigned short *x, unsigned short * restrict y) {
+  int sum = 0;
+  for (int i = 0; i < 8000; i++)
+    {
+      x[i] = x[i] + y[i];  /* The int sum is stored back truncated to unsigned short.  */
+      sum += x[i];  /* unsigned short -> int is a single widening step; the expected body above uses uaddw/uaddw2.  */
+    }
+  return sum;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122069_4.c b/gcc/testsuite/gcc.target/aarch64/pr122069_4.c
new file mode 100644
index 000000000000..22d5f631de21
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122069_4.c
@@ -0,0 +1,81 @@
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_dotprod_neon_hw }*/
+/* { dg-options "-O3 -march=armv8.2-a+dotprod -mautovec-preference=asimd-only -fdump-tree-vect-details" }*/
+
+inline char char_abs(char i) {  /* NOTE(review): plain inline (no static/extern) provides no external definition in C99 and later; this run test presumably links only because -O3 inlines every call.  */
+  return (i < 0 ? -i : i);  /* -i is computed as int and narrowed back to char on return.  */
+}
+
+__attribute__((noipa))  /* noipa: no interprocedural optimization between this function and its callers.  */
+int foo_int(unsigned char *x, unsigned char * restrict y) {
+  int sum = 0;  /* int accumulator over byte inputs.  */
+  for (int i = 0; i < 100; i++)  /* No novector pragma here, so the vectorizer may transform this loop.  */
+     sum += char_abs(x[i] - y[i]);
+  return sum;
+}
+
+__attribute__((noipa))  /* noipa: no interprocedural optimization between this function and its callers.  */
+int foo2_int(unsigned short *x, unsigned short * restrict y,
+            unsigned short * restrict z) {
+  int sum = 0;
+  for (int i = 0; i < 100; i++)  /* No novector pragma here, so the vectorizer may transform this loop.  */
+    {
+      z[i] = x[i] + y[i];  /* The int sum is stored truncated to unsigned short.  */
+      sum += z[i];  /* Accumulates the stored (truncated) value, widened to int.  */
+    }
+  return sum;
+}
+
+__attribute__((noipa))  /* noipa: no interprocedural optimization between this function and its callers.  */
+int foo_int2(unsigned char *x, unsigned char * restrict y) {
+  int sum = 0;  /* Reference copy of foo_int: same loop body, but marked novector below.  */
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+     sum += char_abs(x[i] - y[i]);
+  return sum;
+}
+
+__attribute__((noipa))  /* noipa: no interprocedural optimization between this function and its callers.  */
+int foo2_int2(unsigned short *x, unsigned short * restrict y,
+             unsigned short * restrict z) {
+  int sum = 0;  /* Reference copy of foo2_int: same loop body, but marked novector below.  */
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+    {
+      z[i] = x[i] + y[i];
+      sum += z[i];
+    }
+  return sum;
+}
+
+int main ()
+{
+  unsigned short a[100];
+  unsigned short b[100];
+  unsigned short r1[100];  /* Per-element output of foo2_int.  */
+  unsigned short r2[100];  /* Per-element output of foo2_int2.  */
+  unsigned char c[100];
+  unsigned char d[100];
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+    {
+      a[i] = c[i] = i;  /* Ascending ramp 0..99 shared by the short and byte inputs.  */
+      b[i] = d[i] = 100 - i;  /* Descending ramp 100..1.  */
+    }
+
+  if (foo_int (c, d) != foo_int2 (c, d))  /* Byte reduction must equal its novector copy.  */
+    __builtin_abort();
+
+
+  if (foo2_int (a, b, r1) != foo2_int2 (a, b, r2))  /* Likewise for the short reduction.  */
+    __builtin_abort();
+
+#pragma GCC novector
+  for (int i = 0; i < 100; i++)
+    if (r1[i] != r2[i])  /* The stored per-element sums must match as well.  */
+      __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" } } */
\ No newline at end of file

Reply via email to