Hi all, 

This patch removes support for the widening subtract operation in the aarch64 
backend as it is causing a performance regression.

In the following example:

#include <stdint.h>
/* Store into d the 16 widened differences pix1[x] - pix2[x] of a 4x4 block
   of bytes.  Each of the 4 rows reads 4 consecutive bytes, after which both
   source pointers advance by 16 bytes to the next row.  */
extern void wdiff( int16_t d[16], uint8_t *restrict pix1,
                   uint8_t *restrict pix2)
{
  for( int y = 0; y < 4; y++ )
  {
    for( int x = 0; x < 4; x++ )
      d[x + y*4] = pix1[x] - pix2[x];
    pix1 += 16;
    pix2 += 16;
  }
}

The widening minus pattern is recognized and substituted, but it cannot be used 
because of the input vector type chosen by SLP vectorization. This results in an 
attempt to do an 8 byte -> 8 short widening subtract operation, which is not 
supported. 

The issue is documented in PR 98772.


[AArch64] Remove backend support for widen-sub

This patch removes support for the widening subtract operation in the aarch64 
backend as it is causing a performance regression.

gcc/ChangeLog:

        * config/aarch64/aarch64-simd.md    
        (vec_widen_<su>subl_lo_<mode>): Removed.
        (vec_widen_<su>subl_hi_<mode>): Removed.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/vect-widen-sub.c: Removed.
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 41071b668fd0982f55f9e48510403b9f50fe0f60..c685c512e06917f9cf6bdcffcc41dd091dabfb4e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3478,30 +3478,6 @@
   DONE;
 })
 
-(define_expand "vec_widen_<su>subl_lo_<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand")
-   (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand"))
-   (ANY_EXTEND:<VWIDE> (match_operand:VQW 2 "register_operand"))]
-  "TARGET_SIMD"
-{
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
-  emit_insn (gen_aarch64_<su>subl<mode>_lo_internal (operands[0], operands[1],
-						     operands[2], p));
-  DONE;
-})
-
-(define_expand "vec_widen_<su>subl_hi_<mode>"
-  [(match_operand:<VWIDE> 0 "register_operand")
-   (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand"))
-   (ANY_EXTEND:<VWIDE> (match_operand:VQW 2 "register_operand"))]
-  "TARGET_SIMD"
-{
-  rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
-  emit_insn (gen_aarch64_<su>subl<mode>_hi_internal (operands[0], operands[1],
-						     operands[2], p));
-  DONE;
-})
-
 (define_expand "aarch64_saddl2<mode>"
   [(match_operand:<VWIDE> 0 "register_operand")
    (match_operand:VQW 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-widen-sub.c b/gcc/testsuite/gcc.target/aarch64/vect-widen-sub.c
deleted file mode 100644
index a2bed63affbd091977df95a126da1f5b8c1d41d2..0000000000000000000000000000000000000000
--- a/gcc/testsuite/gcc.target/aarch64/vect-widen-sub.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/* { dg-do run } */
-/* { dg-options "-O3 -save-temps" } */
-#include <stdint.h>
-#include <string.h>
-
-#pragma GCC target "+nosve"
-
-#define ARR_SIZE 1024
-
-/* Should produce an usubl */
-void usub_opt (uint32_t *foo, uint16_t *a, uint16_t *b)
-{
-    for( int i = 0; i < ARR_SIZE - 3;i=i+4)
-    {
-        foo[i]   = a[i]   - b[i];
-        foo[i+1] = a[i+1] - b[i+1];
-        foo[i+2] = a[i+2] - b[i+2];
-        foo[i+3] = a[i+3] - b[i+3];
-    }
-}
-
-__attribute__((optimize (0)))
-void usub_nonopt (uint32_t *foo, uint16_t *a, uint16_t *b)
-{
-    for( int i = 0; i < ARR_SIZE - 3;i=i+4)
-    {
-        foo[i]   = a[i]   - b[i];
-        foo[i+1] = a[i+1] - b[i+1];
-        foo[i+2] = a[i+2] - b[i+2];
-        foo[i+3] = a[i+3] - b[i+3];
-    }
-}
-
-/* Should produce an ssubl */
-void ssub_opt (int32_t *foo, int16_t *a, int16_t *b)
-{
-    for( int i = 0; i < ARR_SIZE - 3;i=i+4)
-    {
-        foo[i]   = a[i]   - b[i];
-        foo[i+1] = a[i+1] - b[i+1];
-        foo[i+2] = a[i+2] - b[i+2];
-        foo[i+3] = a[i+3] - b[i+3];
-    }
-}
-
-__attribute__((optimize (0)))
-void ssub_nonopt (int32_t *foo, int16_t *a, int16_t *b)
-{
-    for( int i = 0; i < ARR_SIZE - 3;i=i+4)
-    {
-        foo[i]   = a[i]   - b[i];
-        foo[i+1] = a[i+1] - b[i+1];
-        foo[i+2] = a[i+2] - b[i+2];
-        foo[i+3] = a[i+3] - b[i+3];
-    }
-}
-
-
-void __attribute__((optimize (0)))
-init(uint16_t *a, uint16_t *b)
-{
-    for( int i = 0; i < ARR_SIZE;i++)
-    {
-      a[i] = i;
-      b[i] = 2*i;
-    }
-}
-
-int __attribute__((optimize (0)))
-main()
-{
-    uint32_t foo_arr[ARR_SIZE];
-    uint32_t bar_arr[ARR_SIZE];
-    uint16_t a[ARR_SIZE];
-    uint16_t b[ARR_SIZE];
-
-    init(a, b);
-    usub_opt(foo_arr, a, b);
-    usub_nonopt(bar_arr, a, b);
-    if (memcmp(foo_arr, bar_arr, ARR_SIZE) != 0)
-      return 1;
-    ssub_opt((int32_t*) foo_arr, (int16_t*) a, (int16_t*) b);
-    ssub_nonopt((int32_t*) bar_arr, (int16_t*) a, (int16_t*) b);
-    if (memcmp(foo_arr, bar_arr, ARR_SIZE) != 0)
-      return 1;
-    return 0;
-}
-
-/* { dg-final { scan-assembler-times {\tusubl\t} 1} } */
-/* { dg-final { scan-assembler-times {\tusubl2\t} 1} } */
-/* { dg-final { scan-assembler-times {\tssubl\t} 1} } */
-/* { dg-final { scan-assembler-times {\tssubl2\t} 1} } */

Reply via email to