https://gcc.gnu.org/g:84e5bd1e7dbdd106956ed0f5d8ddee7bf7b3be7c

commit r17-915-g84e5bd1e7dbdd106956ed0f5d8ddee7bf7b3be7c
Author: Roger Sayle <[email protected]>
Date:   Thu May 28 20:46:04 2026 +0100

    x86 SSE: Improve vector increment/decrement on x86.
    
    This patch improves the code generated by the i386 backend for incrementing
    (adding one to) and decrementing (subtracting one from) a vector.  With SSE,
    materializing the vector -1 is more efficient than materializing the
    vector +1 (in the listings below, -1 is a single vpcmpeqd, whereas +1 needs
    an additional vpabsb), hence x + 1 (increment) is better expressed as
    x - (-1), and x - 1 (decrement) is better expressed as x + (-1).
    Conveniently the relevant additions and subtractions are specified as a
    single pattern, using a plusminus iterator, in the machine description.
    
    For the four example functions:
    
    typedef char v16sqi __attribute__ ((vector_size(16)));
    typedef unsigned char v16uqi __attribute__ ((vector_size(16)));
    
    v16sqi sadd1(v16sqi x) { return x+1; }
    v16uqi uadd1(v16uqi x) { return x+1; }
    v16sqi saddm1(v16sqi x) { return x-1; }
    v16uqi uaddm1(v16uqi x) { return x-1; }
    
    GCC with -O2 -mavx2 previously generated:
    
    sadd1:  vpcmpeqd        %xmm1, %xmm1, %xmm1
            vpabsb  %xmm1, %xmm1
            vpaddb  %xmm1, %xmm0, %xmm0
            ret
    
    uadd1:  vpcmpeqd        %xmm1, %xmm1, %xmm1
            vpabsb  %xmm1, %xmm1
            vpaddb  %xmm1, %xmm0, %xmm0
            ret
    
    saddm1: vpcmpeqd        %xmm1, %xmm1, %xmm1
            vpabsb  %xmm1, %xmm1
            vpsubb  %xmm1, %xmm0, %xmm0
            ret
    
    uaddm1: vpcmpeqd        %xmm1, %xmm1, %xmm1
            vpaddb  %xmm1, %xmm0, %xmm0
            ret
    
    With this patch, we now consistently generate:
    
    sadd1:  vpcmpeqd        %xmm1, %xmm1, %xmm1
            vpsubb  %xmm1, %xmm0, %xmm0
            ret
    
    uadd1:  vpcmpeqd        %xmm1, %xmm1, %xmm1
            vpsubb  %xmm1, %xmm0, %xmm0
            ret
    
    saddm1: vpcmpeqd        %xmm1, %xmm1, %xmm1
            vpaddb  %xmm1, %xmm0, %xmm0
            ret
    
    uaddm1: vpcmpeqd        %xmm1, %xmm1, %xmm1
            vpaddb  %xmm1, %xmm0, %xmm0
            ret
    
    2026-05-28  Roger Sayle  <[email protected]>
                Hongtao Liu  <[email protected]>
                Uros Bizjak  <[email protected]>
    
    gcc/ChangeLog
            * config/i386/i386.md (inv_insn): New define_code_attr.
            * config/i386/sse.md (<plusminus><mode>3): Accept a CONST_VECTOR
            as the second operand.  If the second operand is CONST1_RTX,
            canonicalize to use CONSTM1_RTX instead.
            (*add<mode>3_one): New define_insn_and_split to convert padd +1
            to psub -1.
            (*sub<mode>3_one): Likewise, a new define_insn_and_split to
            convert psub +1 to padd -1.
    
    gcc/testsuite/ChangeLog
            * gcc.target/i386/avx512f-simd-1.c: Tweak test case.
            * gcc.target/i386/sse2-paddb-2.c: New test case.
            * gcc.target/i386/sse2-paddd-2.c: Likewise.
            * gcc.target/i386/sse2-paddw-2.c: Likewise.
            * gcc.target/i386/sse2-psubb-2.c: Likewise.
            * gcc.target/i386/sse2-psubd-2.c: Likewise.
            * gcc.target/i386/sse2-psubw-2.c: Likewise.

Diff:
---
 gcc/config/i386/i386.md                        |  3 ++
 gcc/config/i386/sse.md                         | 45 ++++++++++++++++++++++++--
 gcc/testsuite/gcc.target/i386/avx512f-simd-1.c |  6 ++--
 gcc/testsuite/gcc.target/i386/sse2-paddb-2.c   | 20 ++++++++++++
 gcc/testsuite/gcc.target/i386/sse2-paddd-2.c   | 20 ++++++++++++
 gcc/testsuite/gcc.target/i386/sse2-paddw-2.c   | 20 ++++++++++++
 gcc/testsuite/gcc.target/i386/sse2-psubb-2.c   | 20 ++++++++++++
 gcc/testsuite/gcc.target/i386/sse2-psubd-2.c   | 20 ++++++++++++
 gcc/testsuite/gcc.target/i386/sse2-psubw-2.c   | 20 ++++++++++++
 9 files changed, 169 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 20d57c69bd3b..9b64843cec8e 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -1021,6 +1021,9 @@
   [(plus "add") (ss_plus "adds") (us_plus "addus")
    (minus "sub") (ss_minus "subs") (us_minus "subus")])
 
+;; Inverse instruction base name
+(define_code_attr inv_insn [(plus "sub") (minus "add")])
+
 (define_code_iterator multdiv [mult div])
 
 (define_code_attr multdiv_mnemonic
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index de092f4b9ae1..39d8d196fbe9 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -16590,9 +16590,23 @@
   [(set (match_operand:VI_AVX2 0 "register_operand")
        (plusminus:VI_AVX2
          (match_operand:VI_AVX2 1 "vector_operand")
-         (match_operand:VI_AVX2 2 "vector_operand")))]
+         (match_operand:VI_AVX2 2 "vector_or_const_vector_operand")))]
   "TARGET_SSE2"
-  "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);")
+{
+  /* Expand vector add/sub 1 as vector sub/add -1.  */
+  if (rtx_equal_p (operands[2], CONST1_RTX (<MODE>mode)))
+    {
+      operands[2] = force_reg (<MODE>mode, CONSTM1_RTX (<MODE>mode));
+      emit_insn (gen_<inv_insn><mode>3 (operands[0], operands[1],
+                                       operands[2]));
+      DONE;
+    }
+
+  if (CONST_VECTOR_P (operands[2]))
+    operands[2] = force_reg (<MODE>mode, operands[2]);
+
+  ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);
+})
 
 (define_expand "cond_<insn><mode>"
   [(set (match_operand:VI1248_AVX512VLBW 0 "register_operand")
@@ -16677,6 +16691,33 @@
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+/* Split vector add 1 into vector sub -1.  */
+(define_insn_and_split "*add<mode>3_one"
+  [(set (match_operand:VI_AVX2 0 "register_operand")
+       (plus:VI_AVX2
+         (match_operand:VI_AVX2 1 "nonimmediate_operand")
+         (match_operand:VI_AVX2 2 "const1_operand")))]
+  "TARGET_SSE2 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (minus:VI_AVX2 (match_dup 1) (match_dup 3)))]
+{
+  operands[1] = force_reg (<MODE>mode, operands[1]);
+  operands[3] = force_reg (<MODE>mode, CONSTM1_RTX (<MODE>mode));
+})
+
+/* Split vector sub 1 into vector add -1.  */
+(define_insn_and_split "*sub<mode>3_one"
+  [(set (match_operand:VI_AVX2 0 "register_operand")
+       (minus:VI_AVX2
+         (match_operand:VI_AVX2 1 "nonimmediate_operand")
+         (match_operand:VI_AVX2 2 "const1_operand")))]
+  "TARGET_SSE2 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (plus:VI_AVX2 (match_dup 3) (match_dup 1)))]
+  "operands[3] = force_reg (<MODE>mode, CONSTM1_RTX (<MODE>mode));")
+
 (define_expand "<insn><mode>3<mask_name>"
   [(set (match_operand:VI12_AVX2_AVX512BW 0 "register_operand")
        (sat_plusminus:VI12_AVX2_AVX512BW
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-simd-1.c b/gcc/testsuite/gcc.target/i386/avx512f-simd-1.c
index 235fb917e17f..77c5f202e2f5 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-simd-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-simd-1.c
@@ -13,7 +13,7 @@ f1 (void)
   int i;
   #pragma omp simd simdlen (4)
   for (i = 0; i < N; ++i)
-    a[i] = a[i] + 1;
+    a[i] = a[i] + 11;
 }
 
 void
@@ -22,7 +22,7 @@ f2 (void)
   int i;
   #pragma omp simd simdlen (8)
   for (i = 0; i < N; ++i)
-    a[i] = a[i] + 2;
+    a[i] = a[i] + 12;
 }
 
 void
@@ -31,5 +31,5 @@ f3 (void)
   int i;
   #pragma omp simd simdlen (16)
   for (i = 0; i < N; ++i)
-    a[i] = a[i] + 3;
+    a[i] = a[i] + 13;
 }
diff --git a/gcc/testsuite/gcc.target/i386/sse2-paddb-2.c b/gcc/testsuite/gcc.target/i386/sse2-paddb-2.c
new file mode 100644
index 000000000000..f4acff29a206
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-paddb-2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef char v16sqi __attribute__ ((vector_size(16)));
+typedef unsigned char v16uqi __attribute__ ((vector_size(16)));
+
+v16sqi si,so;
+v16uqi ui,uo;
+
+void foo()
+{
+  so = si - 1;
+}
+
+void bar()
+{
+  uo = ui - 1;
+}
+
+/* { dg-final { scan-assembler-times "\[ \t\]paddb\[ \t\]" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-paddd-2.c b/gcc/testsuite/gcc.target/i386/sse2-paddd-2.c
new file mode 100644
index 000000000000..d48022cbfdae
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-paddd-2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef int v4ssi __attribute__ ((vector_size(16)));
+typedef unsigned int v4usi __attribute__ ((vector_size(16)));
+
+v4ssi si,so;
+v4usi ui,uo;
+
+void foo()
+{
+  so = si - 1;
+}
+
+void bar()
+{
+  uo = ui - 1;
+}
+
+/* { dg-final { scan-assembler-times "\[ \t\]paddd\[ \t\]" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-paddw-2.c b/gcc/testsuite/gcc.target/i386/sse2-paddw-2.c
new file mode 100644
index 000000000000..be81170cbf7f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-paddw-2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef short v8shi __attribute__ ((vector_size(16)));
+typedef unsigned short v8uhi __attribute__ ((vector_size(16)));
+
+v8shi si,so;
+v8uhi ui,uo;
+
+void foo()
+{
+  so = si - 1;
+}
+
+void bar()
+{
+  uo = ui - 1;
+}
+
+/* { dg-final { scan-assembler-times "\[ \t\]paddw\[ \t\]" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-psubb-2.c b/gcc/testsuite/gcc.target/i386/sse2-psubb-2.c
new file mode 100644
index 000000000000..e6f421eb276e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-psubb-2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef char v16sqi __attribute__ ((vector_size(16)));
+typedef unsigned char v16uqi __attribute__ ((vector_size(16)));
+
+v16sqi si,so;
+v16uqi ui,uo;
+
+void foo()
+{
+  so = si + 1;
+}
+
+void bar()
+{
+  uo = ui + 1;
+}
+
+/* { dg-final { scan-assembler-times "\[ \t\]psubb\[ \t\]" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-psubd-2.c b/gcc/testsuite/gcc.target/i386/sse2-psubd-2.c
new file mode 100644
index 000000000000..aaf7e5a5aae6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-psubd-2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef int v4ssi __attribute__ ((vector_size(16)));
+typedef unsigned int v4usi __attribute__ ((vector_size(16)));
+
+v4ssi si,so;
+v4usi ui,uo;
+
+void foo()
+{
+  so = si + 1;
+}
+
+void bar()
+{
+  uo = ui + 1;
+}
+
+/* { dg-final { scan-assembler-times "\[ \t\]psubd\[ \t\]" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-psubw-2.c b/gcc/testsuite/gcc.target/i386/sse2-psubw-2.c
new file mode 100644
index 000000000000..8c11012af9a0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-psubw-2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef short v8shi __attribute__ ((vector_size(16)));
+typedef unsigned short v8uhi __attribute__ ((vector_size(16)));
+
+v8shi si,so;
+v8uhi ui,uo;
+
+void foo()
+{
+  so = si + 1;
+}
+
+void bar()
+{
+  uo = ui + 1;
+}
+
+/* { dg-final { scan-assembler-times "\[ \t\]psubw\[ \t\]" 2 } } */

Reply via email to