Hi all,
This patch removes support for the widening subtract operation in the aarch64
backend as it is causing a performance regression.
In the following example:
#include <stdint.h>
/* Example for the issue documented in PR 98772: for each of 4 rows, store the
   differences of the first 4 bytes of pix1 and pix2 into d as 16-bit values,
   then advance both inputs by 16 bytes to reach the next row.  */
extern void wdiff( int16_t d[16], uint8_t *restrict pix1,
                   uint8_t *restrict pix2)
{
    for( int y = 0; y < 4; y++ )
    {
        for( int x = 0; x < 4; x++ )
            d[x + y*4] = pix1[x] - pix2[x];
        pix1 += 16;
        pix2 += 16;
    }
}
The widening minus pattern is recognized and substituted, but the resulting
operation cannot be used due to the input vector type chosen in SLP
vectorization. This results in an attempt to do an 8 byte -> 8 short widening
subtract operation, which is not supported.
The issue is documented in PR 98772.
[AArch64] Remove backend support for widen-sub
This patch removes support for the widening subtract operation in the aarch64
backend as it is causing a performance regression.
gcc/ChangeLog:
* config/aarch64/aarch64-simd.md
(vec_widen_<su>subl_lo_<mode>): Removed.
(vec_widen_<su>subl_hi_<mode>): Removed.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/vect-widen-sub.c: Removed.
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 41071b668fd0982f55f9e48510403b9f50fe0f60..c685c512e06917f9cf6bdcffcc41dd091dabfb4e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3478,30 +3478,6 @@
DONE;
})
-(define_expand "vec_widen_<su>subl_lo_<mode>"
- [(match_operand:<VWIDE> 0 "register_operand")
- (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand"))
- (ANY_EXTEND:<VWIDE> (match_operand:VQW 2 "register_operand"))]
- "TARGET_SIMD"
-{
- rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
- emit_insn (gen_aarch64_<su>subl<mode>_lo_internal (operands[0], operands[1],
- operands[2], p));
- DONE;
-})
-
-(define_expand "vec_widen_<su>subl_hi_<mode>"
- [(match_operand:<VWIDE> 0 "register_operand")
- (ANY_EXTEND:<VWIDE> (match_operand:VQW 1 "register_operand"))
- (ANY_EXTEND:<VWIDE> (match_operand:VQW 2 "register_operand"))]
- "TARGET_SIMD"
-{
- rtx p = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
- emit_insn (gen_aarch64_<su>subl<mode>_hi_internal (operands[0], operands[1],
- operands[2], p));
- DONE;
-})
-
(define_expand "aarch64_saddl2<mode>"
[(match_operand:<VWIDE> 0 "register_operand")
(match_operand:VQW 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-widen-sub.c b/gcc/testsuite/gcc.target/aarch64/vect-widen-sub.c
deleted file mode 100644
index a2bed63affbd091977df95a126da1f5b8c1d41d2..0000000000000000000000000000000000000000
--- a/gcc/testsuite/gcc.target/aarch64/vect-widen-sub.c
+++ /dev/null
@@ -1,92 +0,0 @@
-/* { dg-do run } */
-/* { dg-options "-O3 -save-temps" } */
-#include <stdint.h>
-#include <string.h>
-
-#pragma GCC target "+nosve"
-
-#define ARR_SIZE 1024
-
-/* Should produce an usubl */
-void usub_opt (uint32_t *foo, uint16_t *a, uint16_t *b)
-{
- for( int i = 0; i < ARR_SIZE - 3;i=i+4)
- {
- foo[i] = a[i] - b[i];
- foo[i+1] = a[i+1] - b[i+1];
- foo[i+2] = a[i+2] - b[i+2];
- foo[i+3] = a[i+3] - b[i+3];
- }
-}
-
-__attribute__((optimize (0)))
-void usub_nonopt (uint32_t *foo, uint16_t *a, uint16_t *b)
-{
- for( int i = 0; i < ARR_SIZE - 3;i=i+4)
- {
- foo[i] = a[i] - b[i];
- foo[i+1] = a[i+1] - b[i+1];
- foo[i+2] = a[i+2] - b[i+2];
- foo[i+3] = a[i+3] - b[i+3];
- }
-}
-
-/* Should produce an ssubl */
-void ssub_opt (int32_t *foo, int16_t *a, int16_t *b)
-{
- for( int i = 0; i < ARR_SIZE - 3;i=i+4)
- {
- foo[i] = a[i] - b[i];
- foo[i+1] = a[i+1] - b[i+1];
- foo[i+2] = a[i+2] - b[i+2];
- foo[i+3] = a[i+3] - b[i+3];
- }
-}
-
-__attribute__((optimize (0)))
-void ssub_nonopt (int32_t *foo, int16_t *a, int16_t *b)
-{
- for( int i = 0; i < ARR_SIZE - 3;i=i+4)
- {
- foo[i] = a[i] - b[i];
- foo[i+1] = a[i+1] - b[i+1];
- foo[i+2] = a[i+2] - b[i+2];
- foo[i+3] = a[i+3] - b[i+3];
- }
-}
-
-
-void __attribute__((optimize (0)))
-init(uint16_t *a, uint16_t *b)
-{
- for( int i = 0; i < ARR_SIZE;i++)
- {
- a[i] = i;
- b[i] = 2*i;
- }
-}
-
-int __attribute__((optimize (0)))
-main()
-{
- uint32_t foo_arr[ARR_SIZE];
- uint32_t bar_arr[ARR_SIZE];
- uint16_t a[ARR_SIZE];
- uint16_t b[ARR_SIZE];
-
- init(a, b);
- usub_opt(foo_arr, a, b);
- usub_nonopt(bar_arr, a, b);
- if (memcmp(foo_arr, bar_arr, ARR_SIZE) != 0)
- return 1;
- ssub_opt((int32_t*) foo_arr, (int16_t*) a, (int16_t*) b);
- ssub_nonopt((int32_t*) bar_arr, (int16_t*) a, (int16_t*) b);
- if (memcmp(foo_arr, bar_arr, ARR_SIZE) != 0)
- return 1;
- return 0;
-}
-
-/* { dg-final { scan-assembler-times {\tusubl\t} 1} } */
-/* { dg-final { scan-assembler-times {\tusubl2\t} 1} } */
-/* { dg-final { scan-assembler-times {\tssubl\t} 1} } */
-/* { dg-final { scan-assembler-times {\tssubl2\t} 1} } */