[gcc r15-1191] Fix ICE in rtl check due to CONST_WIDE_INT in CONST_VECTOR_DUPLICATE_P
https://gcc.gnu.org/g:1d496d2cd1d5d8751a1637abca89339d6f9ddd3b commit r15-1191-g1d496d2cd1d5d8751a1637abca89339d6f9ddd3b Author: liuhongt Date: Tue Jun 11 10:23:27 2024 +0800 Fix ICE in rtl check due to CONST_WIDE_INT in CONST_VECTOR_DUPLICATE_P The patch add extra check to make sure the component of CONST_VECTOR is CONST_INT_P. gcc/ChangeLog: PR target/115384 * simplify-rtx.cc (simplify_context::simplify_binary_operation_1): Only do the simplification of (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) when the component of const_vector is CONST_INT_P. gcc/testsuite/ChangeLog: * gcc.target/i386/pr115384.c: New test. Diff: --- gcc/simplify-rtx.cc | 6 -- gcc/testsuite/gcc.target/i386/pr115384.c | 12 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc index 9bc3ef9ad9fd..3ee95f74d3db 100644 --- a/gcc/simplify-rtx.cc +++ b/gcc/simplify-rtx.cc @@ -4072,9 +4072,11 @@ simplify_context::simplify_binary_operation_1 (rtx_code code, if (VECTOR_MODE_P (mode) && GET_CODE (op0) == ASHIFTRT && (CONST_INT_P (XEXP (op0, 1)) || (GET_CODE (XEXP (op0, 1)) == CONST_VECTOR - && CONST_VECTOR_DUPLICATE_P (XEXP (op0, 1 + && CONST_VECTOR_DUPLICATE_P (XEXP (op0, 1)) + && CONST_INT_P (XVECEXP (XEXP (op0, 1), 0, 0 && GET_CODE (op1) == CONST_VECTOR - && CONST_VECTOR_DUPLICATE_P (op1)) + && CONST_VECTOR_DUPLICATE_P (op1) + && CONST_INT_P (XVECEXP (op1, 0, 0))) { unsigned HOST_WIDE_INT shift_count = (CONST_INT_P (XEXP (op0, 1)) diff --git a/gcc/testsuite/gcc.target/i386/pr115384.c b/gcc/testsuite/gcc.target/i386/pr115384.c new file mode 100644 index ..31dd6f4eb18a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr115384.c @@ -0,0 +1,12 @@ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O" } */ + +typedef __attribute__((__vector_size__(sizeof(__int128 __int128 W; + +W w; + +void +foo() +{ + w = w >> 4 & 18446744073709551600llu; +}
[gcc r12-10497] Disable FMADD in chains for Zen4 and generic
https://gcc.gnu.org/g:5d52558a531130675329d72ca5c4713abf5bf885 commit r12-10497-g5d52558a531130675329d72ca5c4713abf5bf885 Author: Jan Hubicka Date: Fri Dec 29 23:51:03 2023 +0100 Disable FMADD in chains for Zen4 and generic this patch disables use of FMA in matrix multiplication loop for generic (for x86-64-v3) and zen4. I tested this on zen4 and Xenon Gold Gold 6212U. For Intel this is neutral both on the matrix multiplication microbenchmark (attached) and spec2k17 where the difference was within noise for Core. On core the micro-benchmark runs as follows: With FMA: 578,500,241 cycles:u #3.645 GHz ( +- 0.12% ) 753,318,477 instructions:u #1.30 insn per cycle ( +- 0.00% ) 125,417,701 branches:u # 790.227 M/sec ( +- 0.00% ) 0.159146 +- 0.000363 seconds time elapsed ( +- 0.23% ) No FMA: 577,573,960 cycles:u #3.514 GHz ( +- 0.15% ) 878,318,479 instructions:u #1.52 insn per cycle ( +- 0.00% ) 125,417,702 branches:u # 763.035 M/sec ( +- 0.00% ) 0.164734 +- 0.000321 seconds time elapsed ( +- 0.19% ) So the cycle count is unchanged and discrete multiply+add takes same time as FMA. While on zen: With FMA: 484875179 cycles:u #3.599 GHz ( +- 0.05% ) (82.11%) 752031517 instructions:u #1.55 insn per cycle 125106525 branches:u # 928.712 M/sec ( +- 0.03% ) (85.09%) 128356 branch-misses:u #0.10% of all branches ( +- 0.06% ) (83.58%) No FMA: 375875209 cycles:u #3.592 GHz ( +- 0.08% ) (80.74%) 875725341 instructions:u #2.33 insn per cycle 124903825 branches:u #1.194 G/sec ( +- 0.04% ) (84.59%) 0.105203 +- 0.000188 seconds time elapsed ( +- 0.18% ) The diffrerence is that Cores understand the fact that fmadd does not need all three parameters to start computation, while Zen cores doesn't. Since this seems noticeable win on zen and not loss on Core it seems like good default for generic. float a[SIZE][SIZE]; float b[SIZE][SIZE]; float c[SIZE][SIZE]; void init(void) { int i, j, k; for(i=0; i
[gcc r13-8825] Disable FMADD in chains for Zen4 and generic
https://gcc.gnu.org/g:e4f85ea6271a10e13c6874709a05e04ab0508fbf commit r13-8825-ge4f85ea6271a10e13c6874709a05e04ab0508fbf Author: Jan Hubicka Date: Fri Dec 29 23:51:03 2023 +0100 Disable FMADD in chains for Zen4 and generic this patch disables use of FMA in matrix multiplication loop for generic (for x86-64-v3) and zen4. I tested this on zen4 and Xenon Gold Gold 6212U. For Intel this is neutral both on the matrix multiplication microbenchmark (attached) and spec2k17 where the difference was within noise for Core. On core the micro-benchmark runs as follows: With FMA: 578,500,241 cycles:u #3.645 GHz ( +- 0.12% ) 753,318,477 instructions:u #1.30 insn per cycle ( +- 0.00% ) 125,417,701 branches:u # 790.227 M/sec ( +- 0.00% ) 0.159146 +- 0.000363 seconds time elapsed ( +- 0.23% ) No FMA: 577,573,960 cycles:u #3.514 GHz ( +- 0.15% ) 878,318,479 instructions:u #1.52 insn per cycle ( +- 0.00% ) 125,417,702 branches:u # 763.035 M/sec ( +- 0.00% ) 0.164734 +- 0.000321 seconds time elapsed ( +- 0.19% ) So the cycle count is unchanged and discrete multiply+add takes same time as FMA. While on zen: With FMA: 484875179 cycles:u #3.599 GHz ( +- 0.05% ) (82.11%) 752031517 instructions:u #1.55 insn per cycle 125106525 branches:u # 928.712 M/sec ( +- 0.03% ) (85.09%) 128356 branch-misses:u #0.10% of all branches ( +- 0.06% ) (83.58%) No FMA: 375875209 cycles:u #3.592 GHz ( +- 0.08% ) (80.74%) 875725341 instructions:u #2.33 insn per cycle 124903825 branches:u #1.194 G/sec ( +- 0.04% ) (84.59%) 0.105203 +- 0.000188 seconds time elapsed ( +- 0.18% ) The diffrerence is that Cores understand the fact that fmadd does not need all three parameters to start computation, while Zen cores doesn't. Since this seems noticeable win on zen and not loss on Core it seems like good default for generic. float a[SIZE][SIZE]; float b[SIZE][SIZE]; float c[SIZE][SIZE]; void init(void) { int i, j, k; for(i=0; i
[gcc r15-1088] Add additional option --param max-completely-peeled-insns=200 for power64*-*-*
https://gcc.gnu.org/g:b24f2954dbc13d85e9fb62e05a88e9df21e4d4f4 commit r15-1088-gb24f2954dbc13d85e9fb62e05a88e9df21e4d4f4 Author: liuhongt Date: Fri Jun 7 09:29:24 2024 +0800 Add additional option --param max-completely-peeled-insns=200 for power64*-*-* gcc/testsuite/ChangeLog: * gcc.dg/vect/pr112325.c:Add additional option --param max-completely-peeled-insns=200 for power64*-*-*. Diff: --- gcc/testsuite/gcc.dg/vect/pr112325.c | 1 + 1 file changed, 1 insertion(+) diff --git a/gcc/testsuite/gcc.dg/vect/pr112325.c b/gcc/testsuite/gcc.dg/vect/pr112325.c index dea6cca3b86..143903beab2 100644 --- a/gcc/testsuite/gcc.dg/vect/pr112325.c +++ b/gcc/testsuite/gcc.dg/vect/pr112325.c @@ -3,6 +3,7 @@ /* { dg-require-effective-target vect_int } */ /* { dg-require-effective-target vect_shift } */ /* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */ +/* { dg-additional-options "--param max-completely-peeled-insns=200" { target powerpc64*-*-* } } */ typedef unsigned short ggml_fp16_t; static float table_f32_f16[1 << 16];
[gcc r15-1050] Refine testcase for power10.
https://gcc.gnu.org/g:fcfce55c85f842ed843cbc4aabe744c6a004dead commit r15-1050-gfcfce55c85f842ed843cbc4aabe744c6a004dead Author: liuhongt Date: Thu Jun 6 11:27:53 2024 +0800 Refine testcase for power10. For power10, there're extra 3 REG_EQUIV notes with (fix:SI. to avoid the failure. Check (fix:SI is from the pattern not NOTE. gcc/testsuite/ChangeLog: PR target/115365 * gcc.dg/pr100927.c: Don't scan fix:SI from the note. Diff: --- gcc/testsuite/gcc.dg/pr100927.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.dg/pr100927.c b/gcc/testsuite/gcc.dg/pr100927.c index ea0e627befa..8a7d69c3831 100644 --- a/gcc/testsuite/gcc.dg/pr100927.c +++ b/gcc/testsuite/gcc.dg/pr100927.c @@ -1,7 +1,7 @@ /* { dg-do compile } */ /* { dg-options "-O2 -ftrapping-math -fdump-tree-optimized -fdump-rtl-final" } */ /* { dg-final { scan-tree-dump-times {(?n)= \(int\)} 3 "optimized" } } */ -/* { dg-final { scan-rtl-dump-times {(?n)\(fix:SI} 3 "final" } } */ +/* { dg-final { scan-rtl-dump-times {(?n)^[ \t]*\(fix:SI} 3 "final" } } */ int foo_ofr ()
[gcc r15-1048] Adjust rtx_cost for MEM to enable more simplication
https://gcc.gnu.org/g:961dd0d635217c703a38c48903981e0d60962546 commit r15-1048-g961dd0d635217c703a38c48903981e0d60962546 Author: liuhongt Date: Fri Apr 19 10:39:53 2024 +0800 Adjust rtx_cost for MEM to enable more simplication For CONST_VECTOR_DUPLICATE_P in constant_pool, it is just broadcast or variants in ix86_vector_duplicate_simode_const. Adjust the cost to COSTS_N_INSNS (2) + speed which should be a little bit larger than broadcast. gcc/ChangeLog: PR target/114428 * config/i386/i386.cc (ix86_rtx_costs): Adjust cost for CONST_VECTOR_DUPLICATE_P in constant_pool. * config/i386/i386-expand.cc (ix86_broadcast_from_constant): Remove static. * config/i386/i386-protos.h (ix86_broadcast_from_constant): Declare. gcc/testsuite/ChangeLog: * gcc.target/i386/pr114428.c: New test. Diff: --- gcc/config/i386/i386-expand.cc | 2 +- gcc/config/i386/i386-protos.h| 1 + gcc/config/i386/i386.cc | 13 + gcc/testsuite/gcc.target/i386/pr114428.c | 18 ++ 4 files changed, 33 insertions(+), 1 deletion(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 215a998fc26..56d29c15f9a 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -588,7 +588,7 @@ ix86_expand_move (machine_mode mode, rtx operands[]) /* OP is a memref of CONST_VECTOR, return scalar constant mem if CONST_VECTOR is a vec_duplicate, else return NULL. */ -static rtx +rtx ix86_broadcast_from_constant (machine_mode mode, rtx op) { int nunits = GET_MODE_NUNITS (mode); diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index dbc861fb1ea..90712769200 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -107,6 +107,7 @@ extern void ix86_expand_clear (rtx); extern void ix86_expand_move (machine_mode, rtx[]); extern void ix86_expand_vector_move (machine_mode, rtx[]); extern void ix86_expand_vector_move_misalign (machine_mode, rtx[]); +extern rtx ix86_broadcast_from_constant (machine_mode, rtx); extern rtx ix86_fixup_binary_operands (enum rtx_code, machine_mode, rtx[], bool = false); extern void ix86_fixup_binary_operands_no_copy (enum rtx_code, machine_mode, diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 271da127a89..a9d62c84c52 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -22191,6 +22191,19 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, return true; case MEM: + /* CONST_VECTOR_DUPLICATE_P in constant_pool is just broadcast. +or variants in ix86_vector_duplicate_simode_const. */ + + if (GET_MODE_SIZE (mode) >= 16 + && VECTOR_MODE_P (mode) + && SYMBOL_REF_P (XEXP (x, 0)) + && CONSTANT_POOL_ADDRESS_P (XEXP (x, 0)) + && ix86_broadcast_from_constant (mode, x)) + { + *total = COSTS_N_INSNS (2) + speed; + return true; + } + /* An insn that accesses memory is slightly more expensive than one that does not. */ if (speed) diff --git a/gcc/testsuite/gcc.target/i386/pr114428.c b/gcc/testsuite/gcc.target/i386/pr114428.c new file mode 100644 index 000..bbbc5a080f6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr114428.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64-v3 -mno-avx512f -O2" } */ +/* { dg-final { scan-assembler-not "vpsra[dw]" } } */ + +void +foo2 (char* __restrict a, short* b) +{ + for (int i = 0; i != 32; i++) +a[i] = b[i] >> (short)8; +} + +void +foo3 (char* __restrict a, short* b) +{ + for (int i = 0; i != 16; i++) +a[i] = b[i] >> (short)8; +} +
[gcc r15-1047] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode.
https://gcc.gnu.org/g:7876cde25cbd2f026a0ae488e5263e72f8e9bfa0 commit r15-1047-g7876cde25cbd2f026a0ae488e5263e72f8e9bfa0 Author: liuhongt Date: Fri Apr 19 10:29:34 2024 +0800 Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode. When mask is (1 << (prec - imm) - 1) which is used to clear upper bits of A, then it can be simplified to LSHIFTRT. i.e Simplify (and:v8hi (ashifrt:v8hi A 8) (const_vector 0xff x8)) to (lshifrt:v8hi A 8) gcc/ChangeLog: PR target/114428 * simplify-rtx.cc (simplify_context::simplify_binary_operation_1): Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for specific mask. gcc/testsuite/ChangeLog: * gcc.target/i386/pr114428-1.c: New test. Diff: --- gcc/simplify-rtx.cc| 25 +++ gcc/testsuite/gcc.target/i386/pr114428-1.c | 39 ++ 2 files changed, 64 insertions(+) diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc index f6b4d73b593..9bc3ef9ad9f 100644 --- a/gcc/simplify-rtx.cc +++ b/gcc/simplify-rtx.cc @@ -4065,6 +4065,31 @@ simplify_context::simplify_binary_operation_1 (rtx_code code, return tem; } + /* (and:v4si + (ashiftrt:v4si A 16) + (const_vector: 0x x4)) +is just (lshiftrt:v4si A 16). */ + if (VECTOR_MODE_P (mode) && GET_CODE (op0) == ASHIFTRT + && (CONST_INT_P (XEXP (op0, 1)) + || (GET_CODE (XEXP (op0, 1)) == CONST_VECTOR + && CONST_VECTOR_DUPLICATE_P (XEXP (op0, 1 + && GET_CODE (op1) == CONST_VECTOR + && CONST_VECTOR_DUPLICATE_P (op1)) + { + unsigned HOST_WIDE_INT shift_count + = (CONST_INT_P (XEXP (op0, 1)) + ? UINTVAL (XEXP (op0, 1)) + : UINTVAL (XVECEXP (XEXP (op0, 1), 0, 0))); + unsigned HOST_WIDE_INT inner_prec + = GET_MODE_PRECISION (GET_MODE_INNER (mode)); + + /* Avoid UD shift count. */ + if (shift_count < inner_prec + && (UINTVAL (XVECEXP (op1, 0, 0)) + == (HOST_WIDE_INT_1U << (inner_prec - shift_count)) - 1)) + return simplify_gen_binary (LSHIFTRT, mode, XEXP (op0, 0), XEXP (op0, 1)); + } + tem = simplify_byte_swapping_operation (code, mode, op0, op1); if (tem) return tem; diff --git a/gcc/testsuite/gcc.target/i386/pr114428-1.c b/gcc/testsuite/gcc.target/i386/pr114428-1.c new file mode 100644 index 000..927476f2269 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr114428-1.c @@ -0,0 +1,39 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2" } */ +/* { dg-final { scan-assembler-times "psrlw" 1 } } */ +/* { dg-final { scan-assembler-times "psrld" 1 } } */ +/* { dg-final { scan-assembler-times "psrlq" 1 { target { ! ia32 } } } } */ + + +#define SHIFTC 12 + +typedef int v4si __attribute__((vector_size(16))); +typedef short v8hi __attribute__((vector_size(16))); +typedef long long v2di __attribute__((vector_size(16))); + +v8hi +foo1 (v8hi a) +{ + return +(a >> (16 - SHIFTC)) & (__extension__(v8hi){(1<> (32 - SHIFTC)) & (__extension__(v4si){(1<> (long long)(64 - SHIFTC)) & (__extension__(v2di){(1ULL<
[gcc r15-1022] Don't simplify NAN/INF or out-of-range constant for FIX/UNSIGNED_FIX.
https://gcc.gnu.org/g:b05288d1f1e4b632eddf8830b4369d4659f6c2ff commit r15-1022-gb05288d1f1e4b632eddf8830b4369d4659f6c2ff Author: liuhongt Date: Tue May 21 16:57:17 2024 +0800 Don't simplify NAN/INF or out-of-range constant for FIX/UNSIGNED_FIX. According to IEEE standard, for conversions from floating point to integer. When a NaN or infinite operand cannot be represented in the destination format and this cannot otherwise be indicated, the invalid operation exception shall be signaled. When a numeric operand would convert to an integer outside the range of the destination format, the invalid operation exception shall be signaled if this situation cannot otherwise be indicated. The patch prevent simplication of the conversion from floating point to integer for NAN/INF/out-of-range constant when flag_trapping_math. gcc/ChangeLog: PR rtl-optimization/100927 PR rtl-optimization/115161 PR rtl-optimization/115115 * simplify-rtx.cc (simplify_const_unary_operation): Prevent simplication of FIX/UNSIGNED_FIX for NAN/INF/out-of-range constant when flag_trapping_math. * fold-const.cc (fold_convert_const_int_from_real): Don't fold for overflow value when_trapping_math. gcc/testsuite/ChangeLog: * gcc.dg/pr100927.c: New test. * c-c++-common/Wconversion-1.c: Add -fno-trapping-math. * c-c++-common/dfp/convert-int-saturate.c: Ditto. * g++.dg/ubsan/pr63956.C: Ditto. * g++.dg/warn/Wconversion-real-integer.C: Ditto. * gcc.c-torture/execute/20031003-1.c: Ditto. * gcc.dg/Wconversion-complex-c99.c: Ditto. * gcc.dg/Wconversion-real-integer.c: Ditto. * gcc.dg/c90-const-expr-11.c: Ditto. * gcc.dg/overflow-warn-8.c: Ditto. Diff: --- gcc/fold-const.cc | 13 - gcc/simplify-rtx.cc| 23 +--- gcc/testsuite/c-c++-common/Wconversion-1.c | 2 +- .../c-c++-common/dfp/convert-int-saturate.c| 1 + gcc/testsuite/g++.dg/ubsan/pr63956.C | 7 - .../g++.dg/warn/Wconversion-real-integer.C | 2 +- gcc/testsuite/gcc.c-torture/execute/20031003-1.c | 2 ++ gcc/testsuite/gcc.dg/Wconversion-complex-c99.c | 2 +- gcc/testsuite/gcc.dg/Wconversion-real-integer.c| 2 +- gcc/testsuite/gcc.dg/c90-const-expr-11.c | 2 +- gcc/testsuite/gcc.dg/overflow-warn-8.c | 1 + gcc/testsuite/gcc.dg/pr100927.c| 31 ++ 12 files changed, 77 insertions(+), 11 deletions(-) diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc index 92b048c307e..710d697c021 100644 --- a/gcc/fold-const.cc +++ b/gcc/fold-const.cc @@ -2246,7 +2246,18 @@ fold_convert_const_int_from_real (enum tree_code code, tree type, const_tree arg if (! overflow) val = real_to_integer (, , TYPE_PRECISION (type)); - t = force_fit_type (type, val, -1, overflow | TREE_OVERFLOW (arg1)); + /* According to IEEE standard, for conversions from floating point to + integer. When a NaN or infinite operand cannot be represented in the + destination format and this cannot otherwise be indicated, the invalid + operation exception shall be signaled. When a numeric operand would + convert to an integer outside the range of the destination format, the + invalid operation exception shall be signaled if this situation cannot + otherwise be indicated. */ + if (!flag_trapping_math || !overflow) +t = force_fit_type (type, val, -1, overflow | TREE_OVERFLOW (arg1)); + else +t = NULL_TREE; + return t; } diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc index 5caf1dfd957..f6b4d73b593 100644 --- a/gcc/simplify-rtx.cc +++ b/gcc/simplify-rtx.cc @@ -2256,14 +2256,25 @@ simplify_const_unary_operation (enum rtx_code code, machine_mode mode, switch (code) { case FIX: + /* According to IEEE standard, for conversions from floating point to +integer. When a NaN or infinite operand cannot be represented in +the destination format and this cannot otherwise be indicated, the +invalid operation exception shall be signaled. When a numeric +operand would convert to an integer outside the range of the +destination format, the invalid operation exception shall be +signaled if this situation cannot otherwise be indicated. */ if (REAL_VALUE_ISNAN (*x)) - return const0_rtx; + return flag_trapping_math ? NULL_RTX : const0_rtx; + + if (REAL_VALUE_ISINF (*x) && flag_trapping_math) + return NULL_RTX; /* Test against the signed upper bound. */ wmax = wi::max_value (width, SIGNED); real_from_integer (, VOIDmode, wmax, SIGNED);
[gcc r15-1003] Adjust testcase for -march=cascadelake
https://gcc.gnu.org/g:4d207044195b97ecb27c72a7dc987eb8b86644a0 commit r15-1003-g4d207044195b97ecb27c72a7dc987eb8b86644a0 Author: liuhongt Date: Tue Jun 4 10:13:09 2024 +0800 Adjust testcase for -march=cascadelake gcc/testsuite/ChangeLog: PR target/115299 * gcc.target/i386/pr86722.c: Also scan for blendvpd. Diff: --- gcc/testsuite/gcc.target/i386/pr86722.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.target/i386/pr86722.c b/gcc/testsuite/gcc.target/i386/pr86722.c index e266a1e56c2..95ddbd8ddb9 100644 --- a/gcc/testsuite/gcc.target/i386/pr86722.c +++ b/gcc/testsuite/gcc.target/i386/pr86722.c @@ -6,5 +6,5 @@ void f(double*d,double*e){ *d=(*d<.5)?.7:0; } -/* { dg-final { scan-assembler-times {(?n)(?:andnpd|andpd)} 1 } } */ +/* { dg-final { scan-assembler-times {(?n)(?:andnpd|andpd|blendvpd)} 1 } } */ /* { dg-final { scan-assembler-not "orpd" } } */
[gcc r15-984] Add some preference for floating point rtl ifcvt when sse4.1 is not available
https://gcc.gnu.org/g:ac306de7d5100d3682eae2270995a9abbe19db38 commit r15-984-gac306de7d5100d3682eae2270995a9abbe19db38 Author: liuhongt Date: Fri May 31 14:38:07 2024 +0800 Add some preference for floating point rtl ifcvt when sse4.1 is not available W/o TARGET_SSE4_1, it takes 3 instructions (pand, pandn and por) for movdfcc/movsfcc, and could possibly fail cost comparison. Increase branch cost could hurt performance for other modes, so specially add some preference for floating point ifcvt. gcc/ChangeLog: PR target/115299 * config/i386/i386.cc (ix86_noce_conversion_profitable_p): Add some preference for floating point ifcvt when SSE4.1 is not available. gcc/testsuite/ChangeLog: * gcc.target/i386/pr115299.c: New test. * gcc.target/i386/pr86722.c: Adjust testcase. Diff: --- gcc/config/i386/i386.cc | 17 + gcc/testsuite/gcc.target/i386/pr115299.c | 10 ++ gcc/testsuite/gcc.target/i386/pr86722.c | 2 +- 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 1a0206ab573..271da127a89 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -24879,6 +24879,23 @@ ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info) return false; } } + + /* W/o TARGET_SSE4_1, it takes 3 instructions (pand, pandn and por) + for movdfcc/movsfcc, and could possibly fail cost comparison. + Increase branch cost will hurt performance for other modes, so + specially add some preference for floating point ifcvt. */ + if (!TARGET_SSE4_1 && if_info->x + && GET_MODE_CLASS (GET_MODE (if_info->x)) == MODE_FLOAT + && if_info->speed_p) +{ + unsigned cost = seq_cost (seq, true); + + if (cost <= if_info->original_cost) + return true; + + return cost <= (if_info->max_seq_cost + COSTS_N_INSNS (2)); +} + return default_noce_conversion_profitable_p (seq, if_info); } diff --git a/gcc/testsuite/gcc.target/i386/pr115299.c b/gcc/testsuite/gcc.target/i386/pr115299.c new file mode 100644 index 000..53c5899136a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr115299.c @@ -0,0 +1,10 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2 -mno-sse4.1 -msse2" } */ + +void f(double*d,double*e){ + for(;d
[gcc r15-932] Rename double_u with __double_u to avoid pulluting the namespace.
https://gcc.gnu.org/g:3a873c0a7bc8183de95a6103b507101a25eed413 commit r15-932-g3a873c0a7bc8183de95a6103b507101a25eed413 Author: liuhongt Date: Thu May 30 14:15:48 2024 +0800 Rename double_u with __double_u to avoid pulluting the namespace. gcc/ChangeLog: * config/i386/emmintrin.h (__double_u): Rename from double_u. (_mm_load_sd): Replace double_u with __double_u. (_mm_store_sd): Ditto. (_mm_loadh_pd): Ditto. (_mm_loadl_pd): Ditto. * config/i386/xmmintrin.h (__float_u): Rename from float_u. (_mm_load_ss): Ditto. (_mm_store_ss): Ditto. Diff: --- gcc/config/i386/emmintrin.h | 10 +- gcc/config/i386/xmmintrin.h | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h index fa301103daf..356ca218fcb 100644 --- a/gcc/config/i386/emmintrin.h +++ b/gcc/config/i386/emmintrin.h @@ -56,7 +56,7 @@ typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__)); /* Unaligned version of the same types. */ typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); -typedef double double_u __attribute__ ((__may_alias__, __aligned__ (1))); +typedef double __double_u __attribute__ ((__may_alias__, __aligned__ (1))); /* Create a selector for use with the SHUFPD instruction. */ #define _MM_SHUFFLE2(fp1,fp0) \ @@ -146,7 +146,7 @@ _mm_load1_pd (double const *__P) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_load_sd (double const *__P) { - return __extension__ (__m128d) { *(double_u *)__P, 0.0 }; + return __extension__ (__m128d) { *(__double_u *)__P, 0.0 }; } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -181,7 +181,7 @@ _mm_storeu_pd (double *__P, __m128d __A) extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_store_sd (double *__P, __m128d __A) { - *(double_u *)__P = ((__v2df)__A)[0] ; + *(__double_u *)__P = ((__v2df)__A)[0] ; } extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -974,13 +974,13 @@ _mm_unpacklo_pd (__m128d __A, __m128d __B) extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_loadh_pd (__m128d __A, double const *__B) { - return __extension__ (__m128d) { ((__v2df)__A)[0], *(double_u*)__B }; + return __extension__ (__m128d) { ((__v2df)__A)[0], *(__double_u*)__B }; } extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_loadl_pd (__m128d __A, double const *__B) { - return __extension__ (__m128d) { *(double_u*)__B, ((__v2df)__A)[1] }; + return __extension__ (__m128d) { *(__double_u*)__B, ((__v2df)__A)[1] }; } extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h index 87515ecb218..c90fc71331a 100644 --- a/gcc/config/i386/xmmintrin.h +++ b/gcc/config/i386/xmmintrin.h @@ -72,7 +72,7 @@ typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); /* Unaligned version of the same type. */ typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); -typedef float float_u __attribute__ ((__may_alias__, __aligned__ (1))); +typedef float __float_u __attribute__ ((__may_alias__, __aligned__ (1))); /* Internal data types for implementing the intrinsics. */ typedef float __v4sf __attribute__ ((__vector_size__ (16))); @@ -910,7 +910,7 @@ _mm_set_ps1 (float __F) extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_load_ss (float const *__P) { - return __extension__ (__m128) (__v4sf){ *(float_u *)__P, 0.0f, 0.0f, 0.0f }; + return __extension__ (__m128) (__v4sf){ *(__float_u *)__P, 0.0f, 0.0f, 0.0f }; } /* Create a vector with all four elements equal to *P. */ @@ -966,7 +966,7 @@ _mm_setr_ps (float __Z, float __Y, float __X, float __W) extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_store_ss (float *__P, __m128 __A) { - *(float_u *)__P = ((__v4sf)__A)[0]; + *(__float_u *)__P = ((__v4sf)__A)[0]; } extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
[gcc r15-920] Support vcond_mask_qiqi and friends.
https://gcc.gnu.org/g:b6c6d5abf0d31c936f50f8f9073c5e335b9e24b7 commit r15-920-gb6c6d5abf0d31c936f50f8f9073c5e335b9e24b7 Author: liuhongt Date: Wed Feb 28 11:17:10 2024 +0800 Support vcond_mask_qiqi and friends. gcc/ChangeLog: * config/i386/sse.md (vcond_mask_): New expander. gcc/testsuite/ChangeLog: * gcc.target/i386/pr114125.c: New test. Diff: --- gcc/config/i386/sse.md | 20 gcc/testsuite/gcc.target/i386/pr114125.c | 10 ++ 2 files changed, 30 insertions(+) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 0f4fbcb2c5d..7cd912eeeb1 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -4807,6 +4807,26 @@ DONE; }) +(define_expand "vcond_mask_" + [(match_operand:SWI1248_AVX512BW 0 "register_operand") + (match_operand:SWI1248_AVX512BW 1 "register_operand") + (match_operand:SWI1248_AVX512BW 2 "register_operand") + (match_operand:SWI1248_AVX512BW 3 "register_operand")] + "TARGET_AVX512F" +{ + /* (operand[1] & operand[3]) | (operand[2] & ~operand[3]) */ + rtx op1 = gen_reg_rtx (mode); + rtx op2 = gen_reg_rtx (mode); + rtx op3 = gen_reg_rtx (mode); + + emit_insn (gen_and3 (op1, operands[1], operands[3])); + emit_insn (gen_one_cmpl2 (op3, operands[3])); + emit_insn (gen_and3 (op2, operands[2], op3)); + emit_insn (gen_ior3 (operands[0], op1, op2)); + + DONE; +}) + ; ;; ;; Parallel floating point logical operations diff --git a/gcc/testsuite/gcc.target/i386/pr114125.c b/gcc/testsuite/gcc.target/i386/pr114125.c new file mode 100644 index 000..e63fbffe965 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr114125.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=x86-64-v4 -fdump-tree-forwprop3-raw " } */ + +typedef long vec __attribute__((vector_size(16))); +vec f(vec x){ + vec y = x < 10; + return y & (y == 0); +} + +/* { dg-final { scan-tree-dump-not "_expr" "forwprop3" } } */
[gcc r15-919] Don't reduce estimated unrolled size for innermost loop.
https://gcc.gnu.org/g:ef27b91b62c3aa8841c02665dffa8914c742fd37 commit r15-919-gef27b91b62c3aa8841c02665dffa8914c742fd37 Author: liuhongt Date: Tue Feb 27 15:34:57 2024 +0800 Don't reduce estimated unrolled size for innermost loop. For the innermost loop, after completely loop unroll, it will most likely not be able to reduce the body size to 2/3. The current 2/3 reduction will make some of the larger loops completely unrolled during cunrolli, which will then result in them not being able to be vectorized. It also increases the register pressure. The patch move the 2/3 reduction from estimated_unrolled_size to tree_unroll_loops_completely. gcc/ChangeLog: PR tree-optimization/112325 * tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Move the 2 / 3 loop body size reduction to .. (try_unroll_loop_completely): .. here, add it for the check of body size shrink, and the check of comparison against param_max_completely_peeled_insns when (!cunrolli ||loop->inner). (canonicalize_loop_induction_variables): Add new parameter cunrolli and pass down. (tree_unroll_loops_completely_1): Ditto. (canonicalize_induction_variables): Pass cunrolli as false to canonicalize_loop_induction_variables. (tree_unroll_loops_completely): Set cunrolli to true at beginning and set it to false after CHANGED is true. gcc/testsuite/ChangeLog: * gcc.dg/vect/pr112325.c: New test. Diff: --- gcc/testsuite/gcc.dg/vect/pr112325.c | 59 gcc/tree-ssa-loop-ivcanon.cc | 49 -- 2 files changed, 86 insertions(+), 22 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/pr112325.c b/gcc/testsuite/gcc.dg/vect/pr112325.c new file mode 100644 index 000..71cf4099253 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/pr112325.c @@ -0,0 +1,59 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -funroll-loops -fdump-tree-vect-details" } */ +/* { dg-require-effective-target vect_int } */ +/* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */ + +typedef unsigned short ggml_fp16_t; +static float table_f32_f16[1 << 16]; + +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { +unsigned short s; +__builtin_memcpy(, , sizeof(unsigned short)); +return table_f32_f16[s]; +} + +typedef struct { +ggml_fp16_t d; +ggml_fp16_t m; +unsigned char qh[4]; +unsigned char qs[32 / 2]; +} block_q5_1; + +typedef struct { +float d; +float s; +char qs[32]; +} block_q8_1; + +void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +const int qk = 32; +const int nb = n / qk; + +const block_q5_1 * restrict x = vx; +const block_q8_1 * restrict y = vy; + +float sumf = 0.0; + +for (int i = 0; i < nb; i++) { +unsigned qh; +__builtin_memcpy(, x[i].qh, sizeof(qh)); + +int sumi = 0; + +for (int j = 0; j < qk/2; ++j) { +const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10; +const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10; + +const int x0 = (x[i].qs[j] & 0xF) | xh_0; +const int x1 = (x[i].qs[j] >> 4) | xh_1; + +sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]); +} + +sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s; +} + +*s = sumf; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc index bf017137260..5ef24a91917 100644 --- a/gcc/tree-ssa-loop-ivcanon.cc +++ b/gcc/tree-ssa-loop-ivcanon.cc @@ -437,11 +437,7 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge edge_to_cancel, It is (NUNROLL + 1) * size of loop body with taking into account the fact that in last copy everything after exit conditional is dead and that some instructions will be eliminated after - peeling. - - Loop body is likely going to simplify further, this is difficult - to guess, we just decrease the result by 1/3. */ - + peeling. */ static unsigned HOST_WIDE_INT estimated_unrolled_size (struct loop_size *size, unsigned HOST_WIDE_INT nunroll) @@ -453,10 +449,6 @@ estimated_unrolled_size (struct loop_size *size, unr_insns = 0; unr_insns += size->last_iteration - size->last_iteration_eliminated_by_peeling; - unr_insns = unr_insns * 2 / 3; - if (unr_insns <= 0) -unr_insns = 1; - return unr_insns; } @@ -734,7 +726,8 @@ try_unroll_loop_completely (class loop *loop, edge exit, tree niter, bool may_be_zero, enum unroll_level ul,
[gcc r15-882] Reduce cost of MEM (A + imm).
https://gcc.gnu.org/g:1d6199e5f8c1c08083eeb0279f71333234fe14ad commit r15-882-g1d6199e5f8c1c08083eeb0279f71333234fe14ad Author: liuhongt Date: Mon Feb 19 13:57:24 2024 +0800 Reduce cost of MEM (A + imm). For MEM, rtx_cost iterates each subrtx, and adds up the costs, so for MEM (reg) and MEM (reg + 4), the former costs 5, the latter costs 9, it is not accurate for x86. Ideally address_cost should be used, but it reduce cost too much. So current solution is make constant disp as cheap as possible. gcc/ChangeLog: PR target/67325 * config/i386/i386.cc (ix86_rtx_costs): Reduce cost of MEM (A + imm) to "cost of MEM (A)" + 1. gcc/testsuite/ChangeLog: * gcc.target/i386/pr67325.c: New test. Diff: --- gcc/config/i386/i386.cc | 18 +- gcc/testsuite/gcc.target/i386/pr67325.c | 7 +++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 3e2a3a194f1..85d87b9f778 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -22194,7 +22194,23 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, /* An insn that accesses memory is slightly more expensive than one that does not. */ if (speed) -*total += 1; + { + *total += 1; + rtx addr = XEXP (x, 0); + /* For MEM, rtx_cost iterates each subrtx, and adds up the costs, +so for MEM (reg) and MEM (reg + 4), the former costs 5, +the latter costs 9, it is not accurate for x86. Ideally +address_cost should be used, but it reduce cost too much. +So current solution is make constant disp as cheap as possible. */ + if (GET_CODE (addr) == PLUS + && x86_64_immediate_operand (XEXP (addr, 1), Pmode)) + { + *total += 1; + *total += rtx_cost (XEXP (addr, 0), Pmode, PLUS, 0, speed); + return true; + } + } + return false; case ZERO_EXTRACT: diff --git a/gcc/testsuite/gcc.target/i386/pr67325.c b/gcc/testsuite/gcc.target/i386/pr67325.c new file mode 100644 index 000..c3c1e4c5b4d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr67325.c @@ -0,0 +1,7 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2" } */ +/* { dg-final { scan-assembler-not "(?:sar|shr)" } } */ + +int f(long*l){ + return *l>>32; +}
[gcc r15-857] Fix predicate mismatch between vfcmaddcph's define_insn and define_expand.
https://gcc.gnu.org/g:c65002347e595cda8b15e59e734d209283faf2b6 commit r15-857-gc65002347e595cda8b15e59e734d209283faf2b6 Author: liuhongt Date: Tue May 28 10:32:12 2024 +0800 Fix predicate mismatch between vfcmaddcph's define_insn and define_expand. When I applied Roger's patch [1], there's ICE due to it. The patch fix the latent bug. [1] https://gcc.gnu.org/pipermail/gcc-patches/2024-May/651365.html gcc/ChangeLog: * config/i386/sse.md (___mask): Align operands' predicate with corresponding expander. (__): Ditto. Diff: --- gcc/config/i386/sse.md | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index b59c988fc31..0f4fbcb2c5d 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -6867,9 +6867,9 @@ [(set (match_operand:VHF_AVX512VL 0 "register_operand" "=") (vec_merge:VHF_AVX512VL (unspec:VHF_AVX512VL - [(match_operand:VHF_AVX512VL 1 "nonimmediate_operand" "v") -(match_operand:VHF_AVX512VL 2 "nonimmediate_operand" "") -(match_operand:VHF_AVX512VL 3 "register_operand" "0")] + [(match_operand:VHF_AVX512VL 1 "" "v") +(match_operand:VHF_AVX512VL 2 "" "") +(match_operand:VHF_AVX512VL 3 "" "0")] UNSPEC_COMPLEX_F_C_MA) (match_dup 1) (unspec: @@ -6892,8 +6892,8 @@ (define_insn "__" [(set (match_operand:VHF_AVX512VL 0 "register_operand" "=") (unspec:VHF_AVX512VL - [(match_operand:VHF_AVX512VL 1 "nonimmediate_operand" "v") -(match_operand:VHF_AVX512VL 2 "nonimmediate_operand" "")] + [(match_operand:VHF_AVX512VL 1 "" "v") +(match_operand:VHF_AVX512VL 2 "" "")] UNSPEC_COMPLEX_F_C_MUL))] "TARGET_AVX512FP16 && " {
[gcc r15-814] Fix typo in the testcase.
https://gcc.gnu.org/g:51f4b47c4f4f61fe31a7bd1fa80e08c2438d76a8 commit r15-814-g51f4b47c4f4f61fe31a7bd1fa80e08c2438d76a8 Author: liuhongt Date: Fri May 24 09:49:08 2024 +0800 Fix typo in the testcase. gcc/testsuite/ChangeLog: PR target/114148 * gcc.target/i386/pr106010-7b.c: Refine testcase. Diff: --- gcc/testsuite/gcc.target/i386/pr106010-7b.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7b.c b/gcc/testsuite/gcc.target/i386/pr106010-7b.c index 26482cc10f5..917e56e45f7 100644 --- a/gcc/testsuite/gcc.target/i386/pr106010-7b.c +++ b/gcc/testsuite/gcc.target/i386/pr106010-7b.c @@ -34,11 +34,11 @@ avx_test (void) p_init[i] = i % 2 + 3; memcpy (pd_src, p_init, 2 * N * sizeof (double)); - memcpy (ps_dst, p_init, 2 * N * sizeof (float)); - memcpy (epi64_dst, p_init, 2 * N * sizeof (long long)); - memcpy (epi32_dst, p_init, 2 * N * sizeof (int)); - memcpy (epi16_dst, p_init, 2 * N * sizeof (short)); - memcpy (epi8_dst, p_init, 2 * N * sizeof (char)); + memcpy (ps_src, p_init, 2 * N * sizeof (float)); + memcpy (epi64_src, p_init, 2 * N * sizeof (long long)); + memcpy (epi32_src, p_init, 2 * N * sizeof (int)); + memcpy (epi16_src, p_init, 2 * N * sizeof (short)); + memcpy (epi8_src, p_init, 2 * N * sizeof (char)); foo_pd (pd_dst, pd_src[0]); foo_ps (ps_dst, ps_src[0]);
[gcc r15-717] Use pblendw instead of pand to clear upper 16 bits.
https://gcc.gnu.org/g:0ebaffccb294d90184ad78367de66b6307de3ac0 commit r15-717-g0ebaffccb294d90184ad78367de66b6307de3ac0 Author: liuhongt Date: Fri Mar 22 14:40:00 2024 +0800 Use pblendw instead of pand to clear upper 16 bits. For vec_pack_truncv8si/v4si w/o AVX512, (const_vector:v4si (const_int 0x) x4) is used as mask to clear upper 16 bits, but vpblendw with zero_vector can also be used, and zero vector is cheaper than (const_vector:v4si (const_int 0x) x4). gcc/ChangeLog: PR target/114427 * config/i386/i386-expand.cc (expand_vec_perm_even_odd_pack): Use pblendw instead of pand to clear upper bits. gcc/testsuite/ChangeLog: * gcc.target/i386/pr114427.c: New test. Diff: --- gcc/config/i386/i386-expand.cc | 34 gcc/testsuite/gcc.target/i386/pr114427.c | 18 + 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 100fb2afb3a..7142c0a9d77 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -22587,6 +22587,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) { rtx op, dop0, dop1, t; unsigned i, odd, c, s, nelt = d->nelt; + int pblendw_i = 0; bool end_perm = false; machine_mode half_mode; rtx (*gen_and) (rtx, rtx, rtx); @@ -22608,6 +22609,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) gen_and = gen_andv2si3; gen_pack = gen_mmx_packusdw; gen_shift = gen_lshrv2si3; + pblendw_i = 0x5; break; case E_V8HImode: /* Required for "pack". */ @@ -22619,6 +22621,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) gen_and = gen_andv4si3; gen_pack = gen_sse4_1_packusdw; gen_shift = gen_lshrv4si3; + pblendw_i = 0x55; break; case E_V8QImode: /* No check as all instructions are SSE2. */ @@ -22647,6 +22650,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) gen_and = gen_andv8si3; gen_pack = gen_avx2_packusdw; gen_shift = gen_lshrv8si3; + pblendw_i = 0x; end_perm = true; break; case E_V32QImode: @@ -22682,10 +22686,32 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) dop1 = gen_reg_rtx (half_mode); if (odd == 0) { - t = gen_const_vec_duplicate (half_mode, GEN_INT (c)); - t = force_reg (half_mode, t); - emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0))); - emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1))); + /* Use pblendw since const_vector 0 should be cheaper than +const_vector 0x. */ + if (d->vmode == V4HImode + || d->vmode == E_V8HImode + || d->vmode == E_V16HImode) + { + rtx dop0_t = gen_reg_rtx (d->vmode); + rtx dop1_t = gen_reg_rtx (d->vmode); + t = gen_reg_rtx (d->vmode); + emit_move_insn (t, CONST0_RTX (d->vmode)); + + emit_move_insn (dop0_t, gen_rtx_VEC_MERGE (d->vmode, d->op0, t, +GEN_INT (pblendw_i))); + emit_move_insn (dop1_t, gen_rtx_VEC_MERGE (d->vmode, d->op1, t, +GEN_INT (pblendw_i))); + + emit_move_insn (dop0, gen_lowpart (half_mode, dop0_t)); + emit_move_insn (dop1, gen_lowpart (half_mode, dop1_t)); + } + else + { + t = gen_const_vec_duplicate (half_mode, GEN_INT (c)); + t = force_reg (half_mode, t); + emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0))); + emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1))); + } } else { diff --git a/gcc/testsuite/gcc.target/i386/pr114427.c b/gcc/testsuite/gcc.target/i386/pr114427.c new file mode 100644 index 000..58b66db7fff --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr114427.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64-v3 -O2 -mno-avx512f" } */ +/* { dg-final { scan-assembler-not "vpand" } } */ +/* { dg-final { scan-assembler-not "65535" } } */ + +void +foo (int* a, short* __restrict b, int* c) +{ +for (int i = 0; i != 16; i++) + b[i] = c[i] + a[i]; +} + +void +foo1 (int* a, short* __restrict b, int* c) +{ +for (int i = 0; i != 8; i++) + b[i] = c[i] + a[i]; +}
[gcc r15-530] Set d.one_operand_p to true when TARGET_SSSE3 in ix86_expand_vecop_qihi_partial.
https://gcc.gnu.org/g:090714e6cf8029f4ff8883dce687200024adbaeb commit r15-530-g090714e6cf8029f4ff8883dce687200024adbaeb Author: liuhongt Date: Wed May 15 10:56:24 2024 +0800 Set d.one_operand_p to true when TARGET_SSSE3 in ix86_expand_vecop_qihi_partial. pshufb is available under TARGET_SSSE3, so ix86_expand_vec_perm_const_1 must return true when TARGET_SSSE3. With the patch under -march=x86-64-v2 v8qi foo (v8qi a) { return a >> 5; } < pmovsxbw%xmm0, %xmm0 < psraw $5, %xmm0 < pshufb .LC0(%rip), %xmm0 vs. > movdqa %xmm0, %xmm1 > pcmpeqd %xmm0, %xmm0 > pmovsxbw%xmm1, %xmm1 > psrlw $8, %xmm0 > psraw $5, %xmm1 > pand%xmm1, %xmm0 > packuswb%xmm0, %xmm0 Although there's a memory load from constant pool, but it should be better when it's inside a loop. The load from constant pool can be hoist out. it's 1 instruction vs 4 instructions. < pshufb .LC0(%rip), %xmm0 vs. > pcmpeqd %xmm0, %xmm0 > psrlw $8, %xmm0 > pand%xmm1, %xmm0 > packuswb%xmm0, %xmm0 gcc/ChangeLog: PR target/114514 * config/i386/i386-expand.cc (ix86_expand_vecop_qihi_partial): Set d.one_operand_p to true when TARGET_SSSE3. gcc/testsuite/ChangeLog: * gcc.target/i386/pr114514-shufb.c: New test. Diff: --- gcc/config/i386/i386-expand.cc | 2 +- gcc/testsuite/gcc.target/i386/pr114514-shufb.c | 35 ++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 4c47cfe468ef..4e16aedc5c13 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -24458,7 +24458,7 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2) d.op0 = d.op1 = qres; d.vmode = V16QImode; d.nelt = 16; - d.one_operand_p = false; + d.one_operand_p = TARGET_SSSE3; d.testing_p = false; for (i = 0; i < d.nelt; ++i) diff --git a/gcc/testsuite/gcc.target/i386/pr114514-shufb.c b/gcc/testsuite/gcc.target/i386/pr114514-shufb.c new file mode 100644 index ..71fdc9d8daf1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr114514-shufb.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-options "-msse4.1 -O2 -mno-avx512f" } */ +/* { dg-final { scan-assembler-not "packuswb" } } */ +/* { dg-final { scan-assembler-times "pshufb" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "pshufb" 6 { target ia32 } } } */ + +typedef unsigned char v8uqi __attribute__((vector_size(8))); +typedef char v8qi __attribute__((vector_size(8))); +typedef unsigned char v4uqi __attribute__((vector_size(4))); +typedef char v4qi __attribute__((vector_size(4))); + +v8qi +foo (v8qi a) +{ + return a >> 5; +} + +v8uqi +foo1 (v8uqi a) +{ + return a >> 5; +} + +v4qi +foo2 (v4qi a) +{ + return a >> 5; +} + +v4uqi +foo3 (v4uqi a) +{ + return a >> 5; +} +
[gcc r15-529] Optimize ashift >> 7 to vpcmpgtb for vector int8.
https://gcc.gnu.org/g:0cc0956b3bb8bcbc9196075b9073a227d799e042 commit r15-529-g0cc0956b3bb8bcbc9196075b9073a227d799e042 Author: liuhongt Date: Tue May 14 18:39:54 2024 +0800 Optimize ashift >> 7 to vpcmpgtb for vector int8. Since there is no corresponding instruction, the shift operation for vector int8 is implemented using the instructions for vector int16, but for some special shift counts, it can be transformed into vpcmpgtb. gcc/ChangeLog: PR target/114514 * config/i386/i386-expand.cc (ix86_expand_vec_shift_qihi_constant): Optimize ashift >> 7 to vpcmpgtb. (ix86_expand_vecop_qihi_partial): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/pr114514-shift.c: New test. Diff: --- gcc/config/i386/i386-expand.cc | 32 + gcc/testsuite/gcc.target/i386/pr114514-shift.c | 49 ++ 2 files changed, 81 insertions(+) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index e846a946de07..4c47cfe468ef 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -24246,6 +24246,28 @@ ix86_expand_vec_shift_qihi_constant (enum rtx_code code, return false; gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT); + + + if (shift_amount == 7 + && code == ASHIFTRT) +{ + if (qimode == V16QImode + || qimode == V32QImode) + { + rtx zero = gen_reg_rtx (qimode); + emit_move_insn (zero, CONST0_RTX (qimode)); + emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1)); + } + else + { + gcc_assert (qimode == V64QImode); + rtx kmask = gen_reg_rtx (DImode); + emit_insn (gen_avx512bw_cvtb2maskv64qi (kmask, op1)); + emit_insn (gen_avx512bw_cvtmask2bv64qi (dest, kmask)); + } + return true; +} + /* Record sign bit. */ xor_constant = 1 << (8 - shift_amount - 1); @@ -24356,6 +24378,16 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2) return; } + if (CONST_INT_P (op2) + && code == ASHIFTRT + && INTVAL (op2) == 7) +{ + rtx zero = gen_reg_rtx (qimode); + emit_move_insn (zero, CONST0_RTX (qimode)); + emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1)); + return; +} + switch (code) { case MULT: diff --git a/gcc/testsuite/gcc.target/i386/pr114514-shift.c b/gcc/testsuite/gcc.target/i386/pr114514-shift.c new file mode 100644 index ..cf8b32b3b1d2 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr114514-shift.c @@ -0,0 +1,49 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512vl -mavx512bw -O2" } */ +/* { dg-final { scan-assembler-times "vpxor" 4 } } */ +/* { dg-final { scan-assembler-times "vpcmpgtb" 4 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpcmpgtb" 5 { target ia32 } } } */ +/* { dg-final { scan-assembler-times "vpmovb2m" 1 } } */ +/* { dg-final { scan-assembler-times "vpmovm2b" 1 } } */ + + +typedef char v16qi __attribute__((vector_size(16))); +typedef char v32qi __attribute__((vector_size(32))); +typedef char v64qi __attribute__((vector_size(64))); +typedef char v8qi __attribute__((vector_size(8))); +typedef char v4qi __attribute__((vector_size(4))); + +v4qi +__attribute__((noipa)) +foo1 (v4qi a) +{ + return a >> 7; +} + +v8qi +__attribute__((noipa)) +foo2 (v8qi a) +{ + return a >> 7; +} + +v16qi +__attribute__((noipa)) +foo3 (v16qi a) +{ + return a >> 7; +} + +v32qi +__attribute__((noipa)) +foo4 (v32qi a) +{ + return a >> 7; +} + +v64qi +__attribute__((noipa)) +foo5 (v64qi a) +{ + return a >> 7; +}
[gcc r15-499] x86: Add 3-instruction subroutine vector shift for V16QI in ix86_expand_vec_perm_const_1 [PR107563]
https://gcc.gnu.org/g:a71f90c5a7ae2942083921033cb23dcd63e70525 commit r15-499-ga71f90c5a7ae2942083921033cb23dcd63e70525 Author: Levy Hsu Date: Thu May 9 16:50:56 2024 +0800 x86: Add 3-instruction subroutine vector shift for V16QI in ix86_expand_vec_perm_const_1 [PR107563] Hi All We've introduced a new subroutine in ix86_expand_vec_perm_const_1 to optimize vector shifting for the V16QI type on x86. This patch uses a three-instruction sequence psrlw, psllw, and por to handle specific vector shuffle operations more efficiently. The change aims to improve assembly code generation for configurations supporting SSE2. Bootstrapped and tested on x86_64-linux-gnu, OK for trunk? Best Levy gcc/ChangeLog: PR target/107563 * config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New subroutine. (ix86_expand_vec_perm_const_1): Call expand_vec_perm_psrlw_psllw_por. gcc/testsuite/ChangeLog: PR target/107563 * g++.target/i386/pr107563-a.C: New test. * g++.target/i386/pr107563-b.C: New test. Diff: --- gcc/config/i386/i386-expand.cc | 64 ++ gcc/testsuite/g++.target/i386/pr107563-a.C | 13 ++ gcc/testsuite/g++.target/i386/pr107563-b.C | 12 ++ 3 files changed, 89 insertions(+) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 1ab22fe79736..e846a946de07 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -22362,6 +22362,67 @@ expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn) return true; } +/* A subroutine of ix86_expand_vec_perm_const_1. + Implement a permutation with psrlw, psllw and por. + It handles case: + __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14); + __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6); */ + +static bool +expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d) +{ + unsigned i; + rtx (*gen_shr) (rtx, rtx, rtx); + rtx (*gen_shl) (rtx, rtx, rtx); + rtx (*gen_or) (rtx, rtx, rtx); + machine_mode mode = VOIDmode; + + if (!TARGET_SSE2 || !d->one_operand_p) +return false; + + switch (d->vmode) +{ +case E_V8QImode: + if (!TARGET_MMX_WITH_SSE) + return false; + mode = V4HImode; + gen_shr = gen_ashrv4hi3; + gen_shl = gen_ashlv4hi3; + gen_or = gen_iorv4hi3; + break; +case E_V16QImode: + mode = V8HImode; + gen_shr = gen_vlshrv8hi3; + gen_shl = gen_vashlv8hi3; + gen_or = gen_iorv8hi3; + break; +default: return false; +} + + if (!rtx_equal_p (d->op0, d->op1)) +return false; + + for (i = 0; i < d->nelt; i += 2) +if (d->perm[i] != i + 1 || d->perm[i + 1] != i) + return false; + + if (d->testing_p) +return true; + + rtx tmp1 = gen_reg_rtx (mode); + rtx tmp2 = gen_reg_rtx (mode); + rtx op0 = force_reg (d->vmode, d->op0); + + emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode)); + emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode)); + emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8))); + emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8))); + emit_insn (gen_or (tmp1, tmp1, tmp2)); + emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode)); + + return true; +} + /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF permutation using two vperm2f128, followed by a vshufpd insn blending the two vectors together. */ @@ -23782,6 +23843,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_2perm_pblendv (d, false)) return true; + if (expand_vec_perm_psrlw_psllw_por (d)) +return true; + /* Try sequences of four instructions. */ if (expand_vec_perm_even_odd_trunc (d)) diff --git a/gcc/testsuite/g++.target/i386/pr107563-a.C b/gcc/testsuite/g++.target/i386/pr107563-a.C new file mode 100755 index ..605c1bdf814b --- /dev/null +++ b/gcc/testsuite/g++.target/i386/pr107563-a.C @@ -0,0 +1,13 @@ +/* PR target/107563.C */ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-std=c++2b -O3 -msse2" } */ +/* { dg-final { scan-assembler-times "psllw" 1 } } */ +/* { dg-final { scan-assembler-times "psraw" 1 } } */ +/* { dg-final { scan-assembler-times "por" 1 } } */ + +using temp_vec_type2 [[__gnu__::__vector_size__(8)]] = char; + +void foo2(temp_vec_type2& v) noexcept +{ + v = __builtin_shufflevector(v, v, 1, 0, 3, 2, 5, 4, 7, 6); +} diff --git a/gcc/testsuite/g++.target/i386/pr107563-b.C b/gcc/testsuite/g++.target/i386/pr107563-b.C new file mode 100755 index ..0ce3e8263bb5 --- /dev/null +++ b/gcc/testsuite/g++.target/i386/pr107563-b.C @@ -0,0 +1,12 @@ +/* PR target/107563.C */ +/* { dg-options "-std=c++2b -O3 -msse2" } */ +/* { dg-final { scan-assembler-times "psllw" 1 } } */ +/* { dg-final { scan-assembler-times "psrlw" 1 } } */ +/*
[gcc r15-234] Optimize 64-bit vector permutation with punpcklqdq + 128-bit vector pshuf.
https://gcc.gnu.org/g:a9f642783853b60bb0a59562b8ab3ed10ec01641 commit r15-234-ga9f642783853b60bb0a59562b8ab3ed10ec01641 Author: liuhongt Date: Wed Dec 20 11:54:43 2023 +0800 Optimize 64-bit vector permutation with punpcklqdq + 128-bit vector pshuf. gcc/ChangeLog: PR target/113090 * config/i386/i386-expand.cc (expand_vec_perm_punpckldq_pshuf): New function. (ix86_expand_vec_perm_const_1): Try expand_vec_perm_punpckldq_pshuf for sequence of 2 instructions. gcc/testsuite/ChangeLog: * gcc.target/i386/pr113090.c: New test. Diff: --- gcc/config/i386/i386-expand.cc | 71 gcc/testsuite/gcc.target/i386/pr113090.c | 25 +++ 2 files changed, 96 insertions(+) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index a6132911e6a..2f27bfb484c 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -21173,6 +21173,74 @@ expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d) return true; } +/* Try to permute 2 64-bit vectors by punpckldq + 128-bit vector shuffle. */ +static bool +expand_vec_perm_punpckldq_pshuf (struct expand_vec_perm_d *d) +{ + if (GET_MODE_BITSIZE (d->vmode) != 64 + || !TARGET_MMX_WITH_SSE + || d->one_operand_p) +return false; + + machine_mode widen_vmode; + switch (d->vmode) +{ +/* pshufd. */ +case E_V2SImode: + widen_vmode = V4SImode; + break; + +/* pshufd. */ +case E_V2SFmode: + widen_vmode = V4SFmode; + break; + +case E_V4HImode: + widen_vmode = V8HImode; + /* pshufb. */ + if (!TARGET_SSSE3) + return false; + break; + +case E_V8QImode: + /* pshufb. */ + widen_vmode = V16QImode; + if (!TARGET_SSSE3) + return false; + break; + +default: + return false; +} + + if (d->testing_p) +return true; + + struct expand_vec_perm_d dperm; + dperm.target = gen_reg_rtx (widen_vmode); + rtx op0 = gen_reg_rtx (widen_vmode); + emit_move_insn (op0, gen_rtx_VEC_CONCAT (widen_vmode, d->op0, d->op1)); + dperm.op0 = op0; + dperm.op1 = op0; + dperm.vmode = widen_vmode; + unsigned nelt = GET_MODE_NUNITS (widen_vmode); + dperm.nelt = nelt; + dperm.one_operand_p = true; + dperm.testing_p = false; + + for (unsigned i = 0; i != nelt / 2; i++) +{ + dperm.perm[i] = d->perm[i]; + dperm.perm[i + nelt / 2] = d->perm[i]; +} + + gcc_assert (expand_vec_perm_1 ()); + emit_move_insn (d->target, lowpart_subreg (d->vmode, +dperm.target, +dperm.vmode)); + return true; +} + /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify the permutation using the SSSE3 palignr instruction. This succeeds when all of the elements in PERM fit within one vector and we merely @@ -23685,6 +23753,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_shufps_shufps (d)) return true; + if (expand_vec_perm_punpckldq_pshuf (d)) +return true; + /* Try sequences of three instructions. */ if (expand_vec_perm_even_odd_pack (d)) diff --git a/gcc/testsuite/gcc.target/i386/pr113090.c b/gcc/testsuite/gcc.target/i386/pr113090.c new file mode 100644 index 000..0f0b7cc0084 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr113090.c @@ -0,0 +1,25 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2 -msse4.1" } */ +/* { dg-final { scan-assembler-times "pshufd" 3 } } */ + +typedef int v2si __attribute__((vector_size(8))); +typedef short v4hi __attribute__((vector_size(8))); +typedef char v8qi __attribute__((vector_size(8))); + +v2si +foo (v2si a, v2si b) +{ +return __builtin_shufflevector (a, b, 1, 2); +} + +v4hi +foo1 (v4hi a, v4hi b) +{ + return __builtin_shufflevector (a, b, 2, 3, 4, 5); +} + +v8qi +foo2 (v8qi a, v8qi b) +{ + return __builtin_shufflevector (a, b, 4, 5, 6, 7, 8, 9, 10, 11); +}
[gcc r15-236] Extend usdot_prodv*qi with vpmaddwd when AVXVNNI/AVX512VNNI is not available.
https://gcc.gnu.org/g:8b974f54393ab2d2d16a0051a68c155455a92aad commit r15-236-g8b974f54393ab2d2d16a0051a68c155455a92aad Author: liuhongt Date: Mon Jan 8 15:13:41 2024 +0800 Extend usdot_prodv*qi with vpmaddwd when AVXVNNI/AVX512VNNI is not available. gcc/ChangeLog: * config/i386/sse.md (usdot_prodv*qi): Extend to VI1_AVX512 with vpmaddwd when avxvnni/avx512vnni is not available. Diff: --- gcc/config/i386/sse.md | 55 +- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 1bf50726e83..f57f36ae380 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -29955,21 +29955,48 @@ (define_expand "usdot_prod" [(match_operand: 0 "register_operand") - (match_operand:VI1_AVX512VNNI 1 "register_operand") - (match_operand:VI1_AVX512VNNI 2 "register_operand") + (match_operand:VI1_AVX512 1 "register_operand") + (match_operand:VI1_AVX512 2 "register_operand") (match_operand: 3 "register_operand")] - "(( == 64 && TARGET_EVEX512) -|| ((TARGET_AVX512VNNI && TARGET_AVX512VL) - || TARGET_AVXVNNI))" -{ - operands[1] = lowpart_subreg (mode, - force_reg (mode, operands[1]), - mode); - operands[2] = lowpart_subreg (mode, - force_reg (mode, operands[2]), - mode); - emit_insn (gen_vpdpbusd_ (operands[0], operands[3], - operands[1], operands[2])); + "TARGET_SSE2" +{ + if ( == 64 + ? TARGET_AVX512VNNI + : ((TARGET_AVX512VNNI && TARGET_AVX512VL) || TARGET_AVXVNNI)) +{ + operands[1] = lowpart_subreg (mode, + force_reg (mode, operands[1]), + mode); + operands[2] = lowpart_subreg (mode, + force_reg (mode, operands[2]), + mode); + emit_insn (gen_vpdpbusd_ (operands[0], operands[3], + operands[1], operands[2])); +} + else +{ + /* Emulate with vpdpwssd. */ + rtx op1_lo = gen_reg_rtx (mode); + rtx op1_hi = gen_reg_rtx (mode); + rtx op2_lo = gen_reg_rtx (mode); + rtx op2_hi = gen_reg_rtx (mode); + + emit_insn (gen_vec_unpacku_lo_ (op1_lo, operands[1])); + emit_insn (gen_vec_unpacks_lo_ (op2_lo, operands[2])); + emit_insn (gen_vec_unpacku_hi_ (op1_hi, operands[1])); + emit_insn (gen_vec_unpacks_hi_ (op2_hi, operands[2])); + + rtx res1 = gen_reg_rtx (mode); + rtx res2 = gen_reg_rtx (mode); + rtx sum = gen_reg_rtx (mode); + + emit_move_insn (sum, CONST0_RTX (mode)); + emit_insn (gen_sdot_prod (res1, op1_lo, + op2_lo, sum)); + emit_insn (gen_sdot_prod (res2, op1_hi, + op2_hi, operands[3])); + emit_insn (gen_add3 (operands[0], res1, res2)); +} DONE; })
[gcc r15-235] Support dot_prod optabs for 64-bit vector.
https://gcc.gnu.org/g:fa911365490a7ca308878517a4af6189ffba7ed6 commit r15-235-gfa911365490a7ca308878517a4af6189ffba7ed6 Author: liuhongt Date: Wed Dec 20 11:43:25 2023 +0800 Support dot_prod optabs for 64-bit vector. gcc/ChangeLog: PR target/113079 * config/i386/mmx.md (usdot_prodv8qi): New expander. (sdot_prodv8qi): Ditto. (udot_prodv8qi): Ditto. (usdot_prodv4hi): Ditto. (udot_prodv4hi): Ditto. (sdot_prodv4hi): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/pr113079.c: New test. * gcc.target/i386/pr113079-2.c: New test. * gcc.target/i386/sse4-pr113079-2.c: New test. Diff: --- gcc/config/i386/mmx.md | 195 gcc/testsuite/gcc.target/i386/pr113079-2.c | 161 +++ gcc/testsuite/gcc.target/i386/pr113079.c| 57 +++ gcc/testsuite/gcc.target/i386/sse4-pr113079-2.c | 158 +++ 4 files changed, 571 insertions(+) diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 9a8d6030d8b..5f342497885 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -6342,6 +6342,201 @@ DONE; }) +(define_expand "usdot_prodv8qi" + [(match_operand:V2SI 0 "register_operand") + (match_operand:V8QI 1 "register_operand") + (match_operand:V8QI 2 "register_operand") + (match_operand:V2SI 3 "register_operand")] + "TARGET_MMX_WITH_SSE && TARGET_SSE4_1" +{ + operands[1] = force_reg (V8QImode, operands[1]); + operands[2] = force_reg (V8QImode, operands[2]); + operands[3] = force_reg (V2SImode, operands[3]); + + if ((TARGET_AVX512VNNI && TARGET_AVX512VL) + || TARGET_AVXVNNI) +{ + rtx op1 = lowpart_subreg (V16QImode, operands[1], V8QImode); + rtx op2 = lowpart_subreg (V16QImode, operands[2], V8QImode); + rtx op3 = lowpart_subreg (V4SImode, operands[3], V2SImode); + rtx op0 = gen_reg_rtx (V4SImode); + + emit_insn (gen_usdot_prodv16qi (op0, op1, op2, op3)); + emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, V4SImode)); + } + else + { + rtx op1 = gen_reg_rtx (V8HImode); + rtx op2 = gen_reg_rtx (V8HImode); + rtx op3 = gen_reg_rtx (V4SImode); + rtx op0 = gen_reg_rtx (V4SImode); + rtx op0_1 = gen_reg_rtx (V4SImode); + + emit_move_insn (op3, CONST0_RTX (V4SImode)); + emit_insn (gen_zero_extendv8qiv8hi2 (op1, operands[1])); + emit_insn (gen_extendv8qiv8hi2 (op2, operands[2])); + emit_insn (gen_sdot_prodv8hi (op0, op1, op2, op3)); + + /* vec_perm (op0, 2, 3, 0, 1); */ + emit_insn (gen_sse2_pshufd (op0_1, op0, GEN_INT (78))); + emit_insn (gen_addv4si3 (op0, op0, op0_1)); + emit_insn (gen_addv2si3 (operands[0], operands[3], + lowpart_subreg (V2SImode, op0, V4SImode))); + } +DONE; +}) + +(define_expand "sdot_prodv8qi" + [(match_operand:V2SI 0 "register_operand") + (match_operand:V8QI 1 "register_operand") + (match_operand:V8QI 2 "register_operand") + (match_operand:V2SI 3 "register_operand")] + "TARGET_MMX_WITH_SSE && TARGET_SSE4_1" +{ + operands[1] = force_reg (V8QImode, operands[1]); + operands[2] = force_reg (V8QImode, operands[2]); + operands[3] = force_reg (V2SImode, operands[3]); + + if (TARGET_AVXVNNIINT8) +{ + rtx op1 = lowpart_subreg (V16QImode, operands[1], V8QImode); + rtx op2 = lowpart_subreg (V16QImode, operands[2], V8QImode); + rtx op3 = lowpart_subreg (V4SImode, operands[3], V2SImode); + rtx op0 = gen_reg_rtx (V4SImode); + + emit_insn (gen_sdot_prodv16qi (op0, op1, op2, op3)); + emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, V4SImode)); +} + else +{ + rtx op1 = gen_reg_rtx (V8HImode); + rtx op2 = gen_reg_rtx (V8HImode); + rtx op3 = gen_reg_rtx (V4SImode); + rtx op0 = gen_reg_rtx (V4SImode); + rtx op0_1 = gen_reg_rtx (V4SImode); + + emit_move_insn (op3, CONST0_RTX (V4SImode)); + emit_insn (gen_extendv8qiv8hi2 (op1, operands[1])); + emit_insn (gen_extendv8qiv8hi2 (op2, operands[2])); + emit_insn (gen_sdot_prodv8hi (op0, op1, op2, op3)); + + /* vec_perm (op0, 2, 3, 0, 1); */ + emit_insn (gen_sse2_pshufd (op0_1, op0, GEN_INT (78))); + emit_insn (gen_addv4si3 (op0, op0, op0_1)); + emit_insn (gen_addv2si3 (operands[0], operands[3], + lowpart_subreg (V2SImode, op0, V4SImode))); +} + DONE; + +}) + +(define_expand "udot_prodv8qi" + [(match_operand:V2SI 0 "register_operand") + (match_operand:V8QI 1 "register_operand") + (match_operand:V8QI 2 "register_operand") + (match_operand:V2SI 3 "register_operand")] + "TARGET_MMX_WITH_SSE && TARGET_SSE4_1" +{ + operands[1] = force_reg (V8QImode, operands[1]); + operands[2] = force_reg (V8QImode, operands[2]); + operands[3] = force_reg (V2SImode, operands[3]); + + if
[gcc r15-167] Update libbid according to the latest Intel Decimal Floating-Point Math Library.
https://gcc.gnu.org/g:affd77d3fe7bfb525b3fb23316d164e847ed02d1 commit r15-167-gaffd77d3fe7bfb525b3fb23316d164e847ed02d1 Author: liuhongt Date: Wed Mar 27 08:20:13 2024 +0800 Update libbid according to the latest Intel Decimal Floating-Point Math Library. The Intel Decimal Floating-Point Math Library is available as open-source on Netlib[1]. [1] https://www.netlib.org/misc/intel/. libgcc/config/libbid/ChangeLog: * bid128_fma.c (add_and_round): Fix bug: the result of (+5E+368)*(+10E-34)+(-10E+369) was returning -99E+336 instead of expected result -10E+337. (bid128_ext_fma): Ditto. (bid64qqq_fma): Ditto. * bid128_noncomp.c: Change return type of bid128_class from int to class_t. * bid128_round_integral.c: Add default case to avoid compiler warning. * bid128_string.c (bid128_to_string): Replace 0x30 with '0' for zero digit. (bid128_from_string): Ditto. * bid32_to_bid128.c (bid128_to_bid32): Fix Bug. In addition to the INEXACT flag, the UNDERFLOW flag needs to be set (and was not) when converting an input such as +6931674235302037148946035460357709E+1857 to +100E-101 * bid32_to_bid64.c (bid64_to_bid32): fix Bug, In addition to the INEXACT flag, the UNDERFLOW flag needs to be set (and was not) when converting an input such as +9991E-111 to +100E-101. Furthermore, significant bits of NaNs are set correctly now. For example, 0x7c3b9aca was returning 0x7c02 instead of 0x 7c000100. * bid64_noncomp.c: Change return type of bid64_class from int to class_t. * bid64_round_integral.c (bid64_round_integral_exact): Add default case to avoid compiler warning. * bid64_string.c (bid64_from_string): Fix bug for rounding up. The input string "1" was returning +1001E+1 instead of +1000E+1. * bid64_to_bid128.c (bid128_to_bid64): Fix bug, in addition to the INEXACT flag, the UNDERFLOW flag needs to be set (and was not) when converting an input such as +99E-417 to +1000E-398. * bid_binarydecimal.c (bid32_to_binary64): Fix bug for conversion between binary and bid types. For example, 0x7c0F4240 was returning 0x7FFFA120 instead of expected double precision 0x7FF8. (binary64_to_bid32): Ditto. (binary80_to_bid32): Ditto. (binary128_to_bid32): Ditto. (binary80_to_bid64): Ditto. (binary128_to_bid64): Ditto. * bid_conf.h (BID_HIGH_128W): New macro. (BID_LOW_128W): Ditto. * bid_functions.h (__ENABLE_BINARY80__): Ditto. (ALIGN): Ditto. * bid_inline_add.h (get_add128): Add default case to avoid compiler warning. * bid_internal.h (get_BID64): Ditto. (fast_get_BID64_check_OF): Ditto. (ALIGN): New macro. Co-authored-by: Anderson, Cristina S Co-authored-by: Akkas, Ahmet Co-authored-by: Cornea, Marius Diff: --- libgcc/config/libbid/bid128_fma.c| 188 ++- libgcc/config/libbid/bid128_noncomp.c| 2 +- libgcc/config/libbid/bid128_round_integral.c | 2 + libgcc/config/libbid/bid128_string.c | 7 +- libgcc/config/libbid/bid32_to_bid128.c | 3 - libgcc/config/libbid/bid32_to_bid64.c| 11 +- libgcc/config/libbid/bid64_noncomp.c | 2 +- libgcc/config/libbid/bid64_round_integral.c | 2 + libgcc/config/libbid/bid64_string.c | 21 ++- libgcc/config/libbid/bid64_to_bid128.c | 3 - libgcc/config/libbid/bid_binarydecimal.c | 167 libgcc/config/libbid/bid_conf.h | 8 ++ libgcc/config/libbid/bid_functions.h | 23 +++- libgcc/config/libbid/bid_inline_add.h| 2 + libgcc/config/libbid/bid_internal.h | 17 +-- 15 files changed, 220 insertions(+), 238 deletions(-) diff --git a/libgcc/config/libbid/bid128_fma.c b/libgcc/config/libbid/bid128_fma.c index 67233193a42..cbcf225546f 100644 --- a/libgcc/config/libbid/bid128_fma.c +++ b/libgcc/config/libbid/bid128_fma.c @@ -417,13 +417,12 @@ add_and_round (int q3, R128.w[1] = R256.w[1]; R128.w[0] = R256.w[0]; } +if (e4 + x0 < expmin) { // for all rounding modes + is_tiny = 1; +} // the rounded result has p34 = 34 digits e4 = e4 + x0 + incr_exp; -if (rnd_mode == ROUNDING_TO_NEAREST) { - if
[gcc r15-22] Adjust alternative *k to ?k for avx512 mask in zero_extend patterns
https://gcc.gnu.org/g:c19a674d03847b900919b97d0957c8ae5164f8f1 commit r15-22-gc19a674d03847b900919b97d0957c8ae5164f8f1 Author: liuhongt Date: Tue Apr 16 08:37:22 2024 +0800 Adjust alternative *k to ?k for avx512 mask in zero_extend patterns So when both source operand and dest operand require avx512 MASK_REGS, RA can allocate MASK_REGS register instead of GPR to avoid reload it from GPR to MASK_REGS. gcc/ChangeLog: * config/i386/i386.md: (zero_extendsidi2): Adjust alternative *k to ?k. (zero_extenddi2): Ditto. (*zero_extendsi2): Ditto. (*zero_extendqihi2): Ditto. Diff: --- gcc/config/i386/i386.md | 16 - gcc/testsuite/gcc.target/i386/zero_extendkmask.c | 43 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 80e64c603eb..764bfe20ff2 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -4569,10 +4569,10 @@ (define_insn "*zero_extendsidi2" [(set (match_operand:DI 0 "nonimmediate_operand" - "=r,?r,?o,r ,o,?*y,?!*y,$r,$v,$x,*x,*v,*r,*k") + "=r,?r,?o,r ,o,?*y,?!*y,$r,$v,$x,*x,*v,?r,?k") (zero_extend:DI (match_operand:SI 1 "x86_64_zext_operand" - "0 ,rm,r ,rmWz,0,r ,m ,v ,r ,m ,*x,*v,*k,*km")))] + "0 ,rm,r ,rmWz,0,r ,m ,v ,r ,m ,*x,*v,?k,?km")))] "" { switch (get_attr_type (insn)) @@ -4705,9 +4705,9 @@ [(QI "avx512dq") (HI "avx512f") (SI "avx512bw") (DI "avx512bw")]) (define_insn "zero_extenddi2" - [(set (match_operand:DI 0 "register_operand" "=r,*r,*k") + [(set (match_operand:DI 0 "register_operand" "=r,?r,?k") (zero_extend:DI -(match_operand:SWI12 1 "nonimmediate_operand" "m,*k,*km")))] +(match_operand:SWI12 1 "nonimmediate_operand" "m,?k,?km")))] "TARGET_64BIT" "@ movz{l|x}\t{%1, %k0|%k0, %1} @@ -4760,9 +4760,9 @@ (set_attr "mode" "SI")]) (define_insn "*zero_extendsi2" - [(set (match_operand:SI 0 "register_operand" "=r,*r,*k") + [(set (match_operand:SI 0 "register_operand" "=r,?r,?k") (zero_extend:SI - (match_operand:SWI12 1 "nonimmediate_operand" "m,*k,*km")))] + (match_operand:SWI12 1 "nonimmediate_operand" "m,?k,?km")))] "!(TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun))" "@ movz{l|x}\t{%1, %0|%0, %1} @@ -4815,8 +4815,8 @@ ; zero extend to SImode to avoid partial register stalls (define_insn "*zero_extendqihi2" - [(set (match_operand:HI 0 "register_operand" "=r,*r,*k") - (zero_extend:HI (match_operand:QI 1 "nonimmediate_operand" "qm,*k,*km")))] + [(set (match_operand:HI 0 "register_operand" "=r,?r,?k") + (zero_extend:HI (match_operand:QI 1 "nonimmediate_operand" "qm,?k,?km")))] "!(TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun))" "@ movz{bl|x}\t{%1, %k0|%k0, %1} diff --git a/gcc/testsuite/gcc.target/i386/zero_extendkmask.c b/gcc/testsuite/gcc.target/i386/zero_extendkmask.c new file mode 100644 index 000..6b18980bbd1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/zero_extendkmask.c @@ -0,0 +1,43 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-march=x86-64-v4 -O2" } */ +/* { dg-final { scan-assembler-not {(?n)shr[bwl]} } } */ +/* { dg-final { scan-assembler-not {(?n)movz[bw]} } } */ + +#include + +__m512 +foo (__m512d a, __m512d b, __m512 c, __m512 d) +{ + return _mm512_mask_mov_ps (c, (__mmask16) (_mm512_cmpeq_pd_mask (a, b) >> 1), d); +} + + +__m512i +foo1 (__m512d a, __m512d b, __m512i c, __m512i d) +{ + return _mm512_mask_mov_epi16 (c, (__mmask32) (_mm512_cmpeq_pd_mask (a, b) >> 1), d); +} + +__m512i +foo2 (__m512d a, __m512d b, __m512i c, __m512i d) +{ + return _mm512_mask_mov_epi8 (c, (__mmask64) (_mm512_cmpeq_pd_mask (a, b) >> 1), d); +} + +__m512i +foo3 (__m512 a, __m512 b, __m512i c, __m512i d) +{ + return _mm512_mask_mov_epi16 (c, (__mmask32) (_mm512_cmpeq_ps_mask (a, b) >> 1), d); +} + +__m512i +foo4 (__m512 a, __m512 b, __m512i c, __m512i d) +{ + return _mm512_mask_mov_epi8 (c, (__mmask64) (_mm512_cmpeq_ps_mask (a, b) >> 1), d); +} + +__m512i +foo5 (__m512i a, __m512i b, __m512i c, __m512i d) +{ + return _mm512_mask_mov_epi8 (c, (__mmask64) (_mm512_cmp_epi16_mask (a, b, 5) >> 1), d); +}
[gcc r13-8488] Move pr114396.c from gcc.target/i386 to gcc.c-torture/execute.
https://gcc.gnu.org/g:e6a3d1f5bcfd954b614155d96c97bde8ac230e2e commit r13-8488-ge6a3d1f5bcfd954b614155d96c97bde8ac230e2e Author: liuhongt Date: Fri Mar 22 10:09:43 2024 +0800 Move pr114396.c from gcc.target/i386 to gcc.c-torture/execute. Also fixed a typo in the testcase. gcc/testsuite/ChangeLog: PR tree-optimization/114396 * gcc.target/i386/pr114396.c: Move to... * gcc.c-torture/execute/pr114396.c: ...here. (cherry picked from commit 9a6c7aa1b011b77fcd9b19f7b8d7ff0fc823cdb2) Diff: --- gcc/testsuite/{gcc.target/i386 => gcc.c-torture/execute}/pr114396.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gcc/testsuite/gcc.target/i386/pr114396.c b/gcc/testsuite/gcc.c-torture/execute/pr114396.c similarity index 92% rename from gcc/testsuite/gcc.target/i386/pr114396.c rename to gcc/testsuite/gcc.c-torture/execute/pr114396.c index 4c4015f871f..baf90eafabf 100644 --- a/gcc/testsuite/gcc.target/i386/pr114396.c +++ b/gcc/testsuite/gcc.c-torture/execute/pr114396.c @@ -1,5 +1,5 @@ -/* { dg-do run } */ -/* { dg-options "-O1 -fwrapv -fno-vect-cost-model" } */ +/* PR tree-optimization/114396 */ +/* { dg-additional-options "-fwrapv -fno-vect-cost-model" } */ short a = 0xF; short b[16]; @@ -88,7 +88,7 @@ int main() { exp = foo1 (a); res = foo1_o3 (a); - if (uexp != ures) + if (exp != res) __builtin_abort (); uexp = foou (a);
[gcc r14-9603] Move pr114396.c from gcc.target/i386 to gcc.c-torture/execute.
https://gcc.gnu.org/g:9a6c7aa1b011b77fcd9b19f7b8d7ff0fc823cdb2 commit r14-9603-g9a6c7aa1b011b77fcd9b19f7b8d7ff0fc823cdb2 Author: liuhongt Date: Fri Mar 22 10:09:43 2024 +0800 Move pr114396.c from gcc.target/i386 to gcc.c-torture/execute. Also fixed a typo in the testcase. gcc/testsuite/ChangeLog: PR tree-optimization/114396 * gcc.target/i386/pr114396.c: Move to... * gcc.c-torture/execute/pr114396.c: ...here. Diff: --- gcc/testsuite/{gcc.target/i386 => gcc.c-torture/execute}/pr114396.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gcc/testsuite/gcc.target/i386/pr114396.c b/gcc/testsuite/gcc.c-torture/execute/pr114396.c similarity index 92% rename from gcc/testsuite/gcc.target/i386/pr114396.c rename to gcc/testsuite/gcc.c-torture/execute/pr114396.c index 4c4015f871f..baf90eafabf 100644 --- a/gcc/testsuite/gcc.target/i386/pr114396.c +++ b/gcc/testsuite/gcc.c-torture/execute/pr114396.c @@ -1,5 +1,5 @@ -/* { dg-do run } */ -/* { dg-options "-O1 -fwrapv -fno-vect-cost-model" } */ +/* PR tree-optimization/114396 */ +/* { dg-additional-options "-fwrapv -fno-vect-cost-model" } */ short a = 0xF; short b[16]; @@ -88,7 +88,7 @@ int main() { exp = foo1 (a); res = foo1_o3 (a); - if (uexp != ures) + if (exp != res) __builtin_abort (); uexp = foou (a);
[gcc r13-8475] Fix runtime error for nonlinear iv vectorization(step_mult).
https://gcc.gnu.org/g:199b021a38f30b681e0dbecd2d0296beabd50b13 commit r13-8475-g199b021a38f30b681e0dbecd2d0296beabd50b13 Author: liuhongt Date: Thu Mar 21 13:15:23 2024 +0800 Fix runtime error for nonlinear iv vectorization(step_mult). wi::from_mpz doesn't take a sign argument, we want it to be wrapped instead of saturation, so pass utype and true to it, and it fixes the bug. gcc/ChangeLog: PR tree-optimization/114396 * tree-vect-loop.cc (vect_peel_nonlinear_iv_init): Pass utype and true to wi::from_mpz. gcc/testsuite/ChangeLog: * gcc.target/i386/pr114396.c: New test. (cherry picked from commit ac2f8c2a367151fc0410f904339c475a953cffc8) Diff: --- gcc/testsuite/gcc.target/i386/pr114396.c | 105 +++ gcc/tree-vect-loop.cc| 2 +- 2 files changed, 106 insertions(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.target/i386/pr114396.c b/gcc/testsuite/gcc.target/i386/pr114396.c new file mode 100644 index 000..4c4015f871f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr114396.c @@ -0,0 +1,105 @@ +/* { dg-do run } */ +/* { dg-options "-O1 -fwrapv -fno-vect-cost-model" } */ + +short a = 0xF; +short b[16]; +unsigned short ua = 0xF; +unsigned short ub[16]; + +short +__attribute__((noipa)) +foo (short a) +{ + for (int e = 0; e < 9; e += 1) +b[e] = a *= 5; + return a; +} + +short +__attribute__((noipa)) +foo1 (short a) +{ + for (int e = 0; e < 9; e += 1) +b[e] = a *= -5; + return a; +} + +unsigned short +__attribute__((noipa)) +foou (unsigned short a) +{ + for (int e = 0; e < 9; e += 1) +ub[e] = a *= -5; + return a; +} + +unsigned short +__attribute__((noipa)) +foou1 (unsigned short a) +{ + for (int e = 0; e < 9; e += 1) +ub[e] = a *= 5; + return a; +} + +short +__attribute__((noipa,optimize("O3"))) +foo_o3 (short a) +{ + for (int e = 0; e < 9; e += 1) +b[e] = a *= 5; + return a; +} + +short +__attribute__((noipa,optimize("O3"))) +foo1_o3 (short a) +{ + for (int e = 0; e < 9; e += 1) +b[e] = a *= -5; + return a; +} + +unsigned short +__attribute__((noipa,optimize("O3"))) +foou_o3 (unsigned short a) +{ + for (int e = 0; e < 9; e += 1) +ub[e] = a *= -5; + return a; +} + +unsigned short +__attribute__((noipa,optimize("O3"))) +foou1_o3 (unsigned short a) +{ + for (int e = 0; e < 9; e += 1) +ub[e] = a *= 5; + return a; +} + +int main() { + unsigned short uexp, ures; + short exp, res; + exp = foo (a); + res = foo_o3 (a); + if (exp != res) +__builtin_abort (); + + exp = foo1 (a); + res = foo1_o3 (a); + if (uexp != ures) +__builtin_abort (); + + uexp = foou (a); + ures = foou_o3 (a); + if (uexp != ures) +__builtin_abort (); + + uexp = foou1 (a); + ures = foou1_o3 (a); + if (uexp != ures) +__builtin_abort (); + + return 0; +} diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index d08d4996771..9615161ad37 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -8730,7 +8730,7 @@ vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr, wi::to_mpz (skipn, exp, UNSIGNED); mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type)); mpz_powm (res, base, exp, mod); - begin = wi::from_mpz (type, res, TYPE_SIGN (type)); + begin = wi::from_mpz (utype, res, true); tree mult_expr = wide_int_to_tree (utype, begin); init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
[gcc r14-9591] Fix runtime error for nonlinear iv vectorization(step_mult).
https://gcc.gnu.org/g:ac2f8c2a367151fc0410f904339c475a953cffc8 commit r14-9591-gac2f8c2a367151fc0410f904339c475a953cffc8 Author: liuhongt Date: Thu Mar 21 13:15:23 2024 +0800 Fix runtime error for nonlinear iv vectorization(step_mult). wi::from_mpz doesn't take a sign argument, we want it to be wrapped instead of saturation, so pass utype and true to it, and it fixes the bug. gcc/ChangeLog: PR tree-optimization/114396 * tree-vect-loop.cc (vect_peel_nonlinear_iv_init): Pass utype and true to wi::from_mpz. gcc/testsuite/ChangeLog: * gcc.target/i386/pr114396.c: New test. Diff: --- gcc/testsuite/gcc.target/i386/pr114396.c | 105 +++ gcc/tree-vect-loop.cc| 2 +- 2 files changed, 106 insertions(+), 1 deletion(-) diff --git a/gcc/testsuite/gcc.target/i386/pr114396.c b/gcc/testsuite/gcc.target/i386/pr114396.c new file mode 100644 index 000..4c4015f871f --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr114396.c @@ -0,0 +1,105 @@ +/* { dg-do run } */ +/* { dg-options "-O1 -fwrapv -fno-vect-cost-model" } */ + +short a = 0xF; +short b[16]; +unsigned short ua = 0xF; +unsigned short ub[16]; + +short +__attribute__((noipa)) +foo (short a) +{ + for (int e = 0; e < 9; e += 1) +b[e] = a *= 5; + return a; +} + +short +__attribute__((noipa)) +foo1 (short a) +{ + for (int e = 0; e < 9; e += 1) +b[e] = a *= -5; + return a; +} + +unsigned short +__attribute__((noipa)) +foou (unsigned short a) +{ + for (int e = 0; e < 9; e += 1) +ub[e] = a *= -5; + return a; +} + +unsigned short +__attribute__((noipa)) +foou1 (unsigned short a) +{ + for (int e = 0; e < 9; e += 1) +ub[e] = a *= 5; + return a; +} + +short +__attribute__((noipa,optimize("O3"))) +foo_o3 (short a) +{ + for (int e = 0; e < 9; e += 1) +b[e] = a *= 5; + return a; +} + +short +__attribute__((noipa,optimize("O3"))) +foo1_o3 (short a) +{ + for (int e = 0; e < 9; e += 1) +b[e] = a *= -5; + return a; +} + +unsigned short +__attribute__((noipa,optimize("O3"))) +foou_o3 (unsigned short a) +{ + for (int e = 0; e < 9; e += 1) +ub[e] = a *= -5; + return a; +} + +unsigned short +__attribute__((noipa,optimize("O3"))) +foou1_o3 (unsigned short a) +{ + for (int e = 0; e < 9; e += 1) +ub[e] = a *= 5; + return a; +} + +int main() { + unsigned short uexp, ures; + short exp, res; + exp = foo (a); + res = foo_o3 (a); + if (exp != res) +__builtin_abort (); + + exp = foo1 (a); + res = foo1_o3 (a); + if (uexp != ures) +__builtin_abort (); + + uexp = foou (a); + ures = foou_o3 (a); + if (uexp != ures) +__builtin_abort (); + + uexp = foou1 (a); + ures = foou1_o3 (a); + if (uexp != ures) +__builtin_abort (); + + return 0; +} diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index 4375ebdcb49..2921a9e6aa1 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -9454,7 +9454,7 @@ vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr, wi::to_mpz (skipn, exp, UNSIGNED); mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type)); mpz_powm (res, base, exp, mod); - begin = wi::from_mpz (type, res, TYPE_SIGN (type)); + begin = wi::from_mpz (utype, res, true); tree mult_expr = wide_int_to_tree (utype, begin); init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
[gcc r14-9588] Document -fexcess-precision=16.
https://gcc.gnu.org/g:415091f09096a0ebba1fdcd4af8c2fda24cfd411 commit r14-9588-g415091f09096a0ebba1fdcd4af8c2fda24cfd411 Author: liuhongt Date: Mon Mar 18 18:53:59 2024 +0800 Document -fexcess-precision=16. gcc/ChangeLog: PR middle-end/114347 * doc/invoke.texi: Document -fexcess-precision=16. Diff: --- gcc/doc/invoke.texi | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index b446b2905c7..e0950ca5dc2 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -14931,6 +14931,9 @@ assignments). This option is enabled by default for C or C++ if a strict conformance option such as @option{-std=c99} or @option{-std=c++17} is used. @option{-ffast-math} enables @option{-fexcess-precision=fast} by default regardless of whether a strict conformance option is used. +If @option{-fexcess-precision=16} is specified, constants and the +results of expressions with types @code{_Float16} and @code{__bf16} +are computed without excess precision. @opindex mfpmath @option{-fexcess-precision=standard} is not implemented for languages
[gcc r14-9512] Add missing hf/bf patterns.
https://gcc.gnu.org/g:942d470a5a4fb1baeff943127a81b441dffaa543 commit r14-9512-g942d470a5a4fb1baeff943127a81b441dffaa543 Author: liuhongt Date: Fri Mar 15 10:59:10 2024 +0800 Add missing hf/bf patterns. It will be used by copysignm3/xorsignm3/lroundmn2 expanders. gcc/ChangeLog: PR target/114334 * config/i386/i386.md (mode): Add new number V8BF,V16BF,V32BF. (MODEF248): New mode iterator. (ssevecmodesuffix): Hanlde BF and HF. * config/i386/sse.md (andnot3): Extend to HF/BF. (3): Ditto. gcc/testsuite/ChangeLog: * gcc.target/i386/pr114334.c: New test. Diff: --- gcc/config/i386/i386.md | 13 + gcc/config/i386/sse.md | 22 +++--- gcc/testsuite/gcc.target/i386/pr114334.c | 8 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index df97a2d6270..11fdc6af3fa 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -543,8 +543,9 @@ ;; Main data type used by the insn (define_attr "mode" - "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,BF,SF,DF,XF,TF,V32HF,V16HF,V8HF, - V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF,V4HF,V4BF,V2HF,V2BF" + "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,BF,SF,DF,XF,TF, + V32HF,V16HF,V8HF,V4HF,V2HF,V32BF,V16BF,V8BF,V4BF,V2BF, + V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF" (const_string "unknown")) ;; The CPU unit operations uses. @@ -1323,6 +1324,8 @@ ;; SSE and x87 SFmode and DFmode floating point modes (define_mode_iterator MODEF [SF DF]) +(define_mode_iterator MODEF248 [BF HF SF (DF "TARGET_SSE2")]) + ;; SSE floating point modes (define_mode_iterator MODEFH [(HF "TARGET_AVX512FP16") SF DF]) @@ -1347,7 +1350,8 @@ (V64QI "b") (V32HI "w") (V16SI "d") (V8DI "q")]) ;; SSE vector suffix for floating point modes -(define_mode_attr ssevecmodesuffix [(SF "ps") (DF "pd")]) +;; BF HF use same suffix as SF for logic operations. +(define_mode_attr ssevecmodesuffix [(BF "ps") (HF "ps") (SF "ps") (DF "pd")]) ;; SSE vector mode corresponding to a scalar mode (define_mode_attr ssevecmode @@ -1357,7 +1361,8 @@ ;; AVX512F vector mode corresponding to a scalar mode (define_mode_attr avx512fvecmode - [(QI "V64QI") (HI "V32HI") (SI "V16SI") (DI "V8DI") (SF "V16SF") (DF "V8DF")]) + [(QI "V64QI") (HI "V32HI") (SI "V16SI") (DI "V8DI") + (HF "V32HF") (BF "V32BF") (SF "V16SF") (DF "V8DF")]) ;; Instruction suffix for REX 64bit operators. (define_mode_attr rex64suffix [(SI "{l}") (DI "{q}")]) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 1bc614ab702..3286d3a4fac 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -5125,12 +5125,12 @@ ;; because the native instructions read the full 128-bits. (define_insn "*andnot3" - [(set (match_operand:MODEF 0 "register_operand" "=x,x,v,v") - (and:MODEF - (not:MODEF - (match_operand:MODEF 1 "register_operand" "0,x,v,v")) - (match_operand:MODEF 2 "register_operand" "x,x,v,v")))] - "SSE_FLOAT_MODE_P (mode)" + [(set (match_operand:MODEF248 0 "register_operand" "=x,x,v,v") + (and:MODEF248 + (not:MODEF248 + (match_operand:MODEF248 1 "register_operand" "0,x,v,v")) + (match_operand:MODEF248 2 "register_operand" "x,x,v,v")))] + "TARGET_SSE" { char buf[128]; const char *ops; @@ -5257,11 +5257,11 @@ (const_string "TI")))]) (define_insn "3" - [(set (match_operand:MODEF 0 "register_operand" "=x,x,v,v") - (any_logic:MODEF - (match_operand:MODEF 1 "register_operand" "%0,x,v,v") - (match_operand:MODEF 2 "register_operand" "x,x,v,v")))] - "SSE_FLOAT_MODE_P (mode)" + [(set (match_operand:MODEF248 0 "register_operand" "=x,x,v,v") + (any_logic:MODEF248 + (match_operand:MODEF248 1 "register_operand" "%0,x,v,v") + (match_operand:MODEF248 2 "register_operand" "x,x,v,v")))] + "TARGET_SSE" { char buf[128]; const char *ops; diff --git a/gcc/testsuite/gcc.target/i386/pr114334.c b/gcc/testsuite/gcc.target/i386/pr114334.c new file mode 100644 index 000..8e38e24cd16 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr114334.c @@ -0,0 +1,8 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -mavx512fp16" } */ + +long +foo(_Float16 f) +{ + return __builtin_lroundf16(f); +}
[gcc r12-10214] i386[stv]: Handle REG_EH_REGION note
https://gcc.gnu.org/g:a861f940efffae2782c559cd04df2d2740cd28bd commit r12-10214-ga861f940efffae2782c559cd04df2d2740cd28bd Author: liuhongt Date: Wed Mar 13 10:40:01 2024 +0800 i386[stv]: Handle REG_EH_REGION note When we split (insn 37 36 38 10 (set (reg:DI 104 [ _18 ]) (mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) [6 MEM[(struct SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32])) "test.C":22:42 84 {*movdi_internal} (expr_list:REG_EH_REGION (const_int -11 [0xfff5]) into (insn 104 36 37 10 (set (subreg:V2DI (reg:DI 124) 0) (vec_concat:V2DI (mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) [6 MEM[(struct SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32]) (const_int 0 [0]))) "test.C":22:42 -1 (nil))) (insn 37 104 105 10 (set (subreg:V2DI (reg:DI 104 [ _18 ]) 0) (subreg:V2DI (reg:DI 124) 0)) "test.C":22:42 2024 {movv2di_internal} (expr_list:REG_EH_REGION (const_int -11 [0xfff5]) (nil))) we must copy the REG_EH_REGION note to the first insn and split the block after the newly added insn. The REG_EH_REGION on the second insn will be removed later since it no longer traps. gcc/ChangeLog: * config/i386/i386-features.cc (general_scalar_chain::convert_op): Handle REG_EH_REGION note. (convert_scalars_to_vector): Ditto. * config/i386/i386-features.h (class scalar_chain): New memeber control_flow_insns. gcc/testsuite/ChangeLog: * g++.target/i386/pr111822.C: New test. (cherry picked from commit 618e34d56cc38e9c3ae95a413228068e53ed76bb) Diff: --- gcc/config/i386/i386-features.cc | 50 gcc/config/i386/i386-features.h | 1 + gcc/testsuite/g++.target/i386/pr111822.C | 45 3 files changed, 91 insertions(+), 5 deletions(-) diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index 6a2444eb6b6..37f22ba3733 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -871,20 +871,36 @@ general_scalar_chain::convert_op (rtx *op, rtx_insn *insn) } else if (MEM_P (*op)) { + rtx_insn* eh_insn, *movabs = NULL; rtx tmp = gen_reg_rtx (GET_MODE (*op)); - /* Handle movabs. */ + /* Emit MOVABS to load from a 64-bit absolute address to a GPR. */ if (!memory_operand (*op, GET_MODE (*op))) { rtx tmp2 = gen_reg_rtx (GET_MODE (*op)); + movabs = emit_insn_before (gen_rtx_SET (tmp2, *op), insn); - emit_insn_before (gen_rtx_SET (tmp2, *op), insn); *op = tmp2; } - emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0), -gen_gpr_to_xmm_move_src (vmode, *op)), - insn); + eh_insn + = emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0), +gen_gpr_to_xmm_move_src (vmode, *op)), + insn); + + if (cfun->can_throw_non_call_exceptions) + { + /* Handle REG_EH_REGION note. */ + rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX); + if (note) + { + if (movabs) + eh_insn = movabs; + control_flow_insns.safe_push (eh_insn); + add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0)); + } + } + *op = gen_rtx_SUBREG (vmode, tmp, 0); if (dump_file) @@ -1681,6 +1697,7 @@ convert_scalars_to_vector (bool timode_p) { basic_block bb; int converted_insns = 0; + auto_vec control_flow_insns; bitmap_obstack_initialize (NULL); const machine_mode cand_mode[3] = { SImode, DImode, TImode }; @@ -1759,6 +1776,11 @@ convert_scalars_to_vector (bool timode_p) fprintf (dump_file, "Chain #%d conversion is not profitable\n", chain->chain_id); + rtx_insn* iter_insn; + unsigned int ii; + FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn) + control_flow_insns.safe_push (iter_insn); + delete chain; } @@ -1826,6 +1848,24 @@ convert_scalars_to_vector (bool timode_p) DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0); } } + + if (!control_flow_insns.is_empty ()) + { + free_dominance_info (CDI_DOMINATORS); + + unsigned int i; + rtx_insn* insn; + FOR_EACH_VEC_ELT (control_flow_insns, i, insn) + if (control_flow_insn_p (insn)) + { + /* Split the block after insn. There will be a fallthru + edge, which is OK so we keep it. We have to create + the exception edges ourselves. */ + bb = BLOCK_FOR_INSN
[gcc r13-8438] i386[stv]: Handle REG_EH_REGION note
https://gcc.gnu.org/g:bdbcfbfcf591381f0faf95c881e3772b56d0a404 commit r13-8438-gbdbcfbfcf591381f0faf95c881e3772b56d0a404 Author: liuhongt Date: Wed Mar 13 10:40:01 2024 +0800 i386[stv]: Handle REG_EH_REGION note When we split (insn 37 36 38 10 (set (reg:DI 104 [ _18 ]) (mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) [6 MEM[(struct SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32])) "test.C":22:42 84 {*movdi_internal} (expr_list:REG_EH_REGION (const_int -11 [0xfff5]) into (insn 104 36 37 10 (set (subreg:V2DI (reg:DI 124) 0) (vec_concat:V2DI (mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) [6 MEM[(struct SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32]) (const_int 0 [0]))) "test.C":22:42 -1 (nil))) (insn 37 104 105 10 (set (subreg:V2DI (reg:DI 104 [ _18 ]) 0) (subreg:V2DI (reg:DI 124) 0)) "test.C":22:42 2024 {movv2di_internal} (expr_list:REG_EH_REGION (const_int -11 [0xfff5]) (nil))) we must copy the REG_EH_REGION note to the first insn and split the block after the newly added insn. The REG_EH_REGION on the second insn will be removed later since it no longer traps. gcc/ChangeLog: * config/i386/i386-features.cc (general_scalar_chain::convert_op): Handle REG_EH_REGION note. (convert_scalars_to_vector): Ditto. * config/i386/i386-features.h (class scalar_chain): New memeber control_flow_insns. gcc/testsuite/ChangeLog: * g++.target/i386/pr111822.C: New test. (cherry picked from commit 618e34d56cc38e9c3ae95a413228068e53ed76bb) Diff: --- gcc/config/i386/i386-features.cc | 50 gcc/config/i386/i386-features.h | 1 + gcc/testsuite/g++.target/i386/pr111822.C | 45 3 files changed, 91 insertions(+), 5 deletions(-) diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index 74ee14a584a..ed3055b43f8 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -913,20 +913,36 @@ general_scalar_chain::convert_op (rtx *op, rtx_insn *insn) } else if (MEM_P (*op)) { + rtx_insn* eh_insn, *movabs = NULL; rtx tmp = gen_reg_rtx (GET_MODE (*op)); - /* Handle movabs. */ + /* Emit MOVABS to load from a 64-bit absolute address to a GPR. */ if (!memory_operand (*op, GET_MODE (*op))) { rtx tmp2 = gen_reg_rtx (GET_MODE (*op)); + movabs = emit_insn_before (gen_rtx_SET (tmp2, *op), insn); - emit_insn_before (gen_rtx_SET (tmp2, *op), insn); *op = tmp2; } - emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0), -gen_gpr_to_xmm_move_src (vmode, *op)), - insn); + eh_insn + = emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0), +gen_gpr_to_xmm_move_src (vmode, *op)), + insn); + + if (cfun->can_throw_non_call_exceptions) + { + /* Handle REG_EH_REGION note. */ + rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX); + if (note) + { + if (movabs) + eh_insn = movabs; + control_flow_insns.safe_push (eh_insn); + add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0)); + } + } + *op = gen_rtx_SUBREG (vmode, tmp, 0); if (dump_file) @@ -2215,6 +2231,7 @@ convert_scalars_to_vector (bool timode_p) { basic_block bb; int converted_insns = 0; + auto_vec control_flow_insns; bitmap_obstack_initialize (NULL); const machine_mode cand_mode[3] = { SImode, DImode, TImode }; @@ -2296,6 +2313,11 @@ convert_scalars_to_vector (bool timode_p) chain->chain_id); } + rtx_insn* iter_insn; + unsigned int ii; + FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn) + control_flow_insns.safe_push (iter_insn); + delete chain; } } @@ -2364,6 +2386,24 @@ convert_scalars_to_vector (bool timode_p) DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0); } } + + if (!control_flow_insns.is_empty ()) + { + free_dominance_info (CDI_DOMINATORS); + + unsigned int i; + rtx_insn* insn; + FOR_EACH_VEC_ELT (control_flow_insns, i, insn) + if (control_flow_insn_p (insn)) + { + /* Split the block after insn. There will be a fallthru + edge, which is OK so we keep it. We have to create + the exception edges ourselves. */ + bb = BLOCK_FOR_INSN (insn); + split_block (bb,
[gcc r14-9459] i386[stv]: Handle REG_EH_REGION note
https://gcc.gnu.org/g:618e34d56cc38e9c3ae95a413228068e53ed76bb commit r14-9459-g618e34d56cc38e9c3ae95a413228068e53ed76bb Author: liuhongt Date: Wed Mar 13 10:40:01 2024 +0800 i386[stv]: Handle REG_EH_REGION note When we split (insn 37 36 38 10 (set (reg:DI 104 [ _18 ]) (mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) [6 MEM[(struct SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32])) "test.C":22:42 84 {*movdi_internal} (expr_list:REG_EH_REGION (const_int -11 [0xfff5]) into (insn 104 36 37 10 (set (subreg:V2DI (reg:DI 124) 0) (vec_concat:V2DI (mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) [6 MEM[(struct SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32]) (const_int 0 [0]))) "test.C":22:42 -1 (nil))) (insn 37 104 105 10 (set (subreg:V2DI (reg:DI 104 [ _18 ]) 0) (subreg:V2DI (reg:DI 124) 0)) "test.C":22:42 2024 {movv2di_internal} (expr_list:REG_EH_REGION (const_int -11 [0xfff5]) (nil))) we must copy the REG_EH_REGION note to the first insn and split the block after the newly added insn. The REG_EH_REGION on the second insn will be removed later since it no longer traps. gcc/ChangeLog: * config/i386/i386-features.cc (general_scalar_chain::convert_op): Handle REG_EH_REGION note. (convert_scalars_to_vector): Ditto. * config/i386/i386-features.h (class scalar_chain): New memeber control_flow_insns. gcc/testsuite/ChangeLog: * g++.target/i386/pr111822.C: New test. Diff: --- gcc/config/i386/i386-features.cc | 50 gcc/config/i386/i386-features.h | 1 + gcc/testsuite/g++.target/i386/pr111822.C | 45 3 files changed, 91 insertions(+), 5 deletions(-) diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index 1de2a07ed75..c7d7a965901 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -998,20 +998,36 @@ general_scalar_chain::convert_op (rtx *op, rtx_insn *insn) } else if (MEM_P (*op)) { + rtx_insn* eh_insn, *movabs = NULL; rtx tmp = gen_reg_rtx (GET_MODE (*op)); - /* Handle movabs. */ + /* Emit MOVABS to load from a 64-bit absolute address to a GPR. */ if (!memory_operand (*op, GET_MODE (*op))) { rtx tmp2 = gen_reg_rtx (GET_MODE (*op)); + movabs = emit_insn_before (gen_rtx_SET (tmp2, *op), insn); - emit_insn_before (gen_rtx_SET (tmp2, *op), insn); *op = tmp2; } - emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0), -gen_gpr_to_xmm_move_src (vmode, *op)), - insn); + eh_insn + = emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0), +gen_gpr_to_xmm_move_src (vmode, *op)), + insn); + + if (cfun->can_throw_non_call_exceptions) + { + /* Handle REG_EH_REGION note. */ + rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX); + if (note) + { + if (movabs) + eh_insn = movabs; + control_flow_insns.safe_push (eh_insn); + add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0)); + } + } + *op = gen_rtx_SUBREG (vmode, tmp, 0); if (dump_file) @@ -2494,6 +2510,7 @@ convert_scalars_to_vector (bool timode_p) { basic_block bb; int converted_insns = 0; + auto_vec control_flow_insns; bitmap_obstack_initialize (NULL); const machine_mode cand_mode[3] = { SImode, DImode, TImode }; @@ -2575,6 +2592,11 @@ convert_scalars_to_vector (bool timode_p) chain->chain_id); } + rtx_insn* iter_insn; + unsigned int ii; + FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn) + control_flow_insns.safe_push (iter_insn); + delete chain; } } @@ -2643,6 +2665,24 @@ convert_scalars_to_vector (bool timode_p) DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0); } } + + if (!control_flow_insns.is_empty ()) + { + free_dominance_info (CDI_DOMINATORS); + + unsigned int i; + rtx_insn* insn; + FOR_EACH_VEC_ELT (control_flow_insns, i, insn) + if (control_flow_insn_p (insn)) + { + /* Split the block after insn. There will be a fallthru + edge, which is OK so we keep it. We have to create + the exception edges ourselves. */ + bb = BLOCK_FOR_INSN (insn); + split_block (bb, insn); + rtl_make_eh_edge (NULL, bb, BB_END (bb)); + } +