[gcc r15-1701] i386: Some additional AVX512 ternlog refinements.
https://gcc.gnu.org/g:5938cf021e95b40b040974c9cbe7860399247f7f commit r15-1701-g5938cf021e95b40b040974c9cbe7860399247f7f Author: Roger Sayle Date: Fri Jun 28 07:12:53 2024 +0100 i386: Some additional AVX512 ternlog refinements. This patch is another round of refinements to fine tune the new ternlog infrastructure in i386's sse.md. This patch tweaks ix86_ternlog_idx to allow multiple MEM/CONST_VECTOR/VEC_DUPLICATE operands prior to splitting (before reload), when force_register is called on all but one of these operands. Conceptually during the dynamic programming, registers fill the args slots in the order 0, 1, 2, and mem-like operands fill the slots in the order 2, 0, 1 [preferring the memory operand to come last]. This patch allows us to remove some of the legacy ternlog patterns in sse.md without regressions [which is left to the next and final patch in this series]. An indication that these patterns are no longer required is shown by the necessary testsuite tweaks below, where the output assembler for the legacy instructions used hexadecimal, but with the new ternlog infrastructure now consistently use decimal. 2024-06-28 Roger Sayle gcc/ChangeLog * config/i386/i386-expand.cc (ix86_ternlog_idx) : Add a "goto do_mem_operand" as this need not match memory_operand. : Only args[2] may be volatile memory operand. Allow MEM/VEC_DUPLICATE/CONST_VECTOR as args[0] and args[1]. gcc/testsuite/ChangeLog * gcc.target/i386/avx512f-andn-di-zmm-2.c: Match decimal instead of hexadecimal immediate operand to ternlog. * gcc.target/i386/avx512f-andn-si-zmm-2.c: Likewise. * gcc.target/i386/avx512f-orn-si-zmm-1.c: Likewise. * gcc.target/i386/avx512f-orn-si-zmm-2.c: Likewise. * gcc.target/i386/pr100711-3.c: Likewise. * gcc.target/i386/pr100711-4.c: Likewise. * gcc.target/i386/pr100711-5.c: Likewise. 
Diff: --- gcc/config/i386/i386-expand.cc | 35 -- .../gcc.target/i386/avx512f-andn-di-zmm-2.c| 2 +- .../gcc.target/i386/avx512f-andn-si-zmm-2.c| 2 +- .../gcc.target/i386/avx512f-orn-si-zmm-1.c | 2 +- .../gcc.target/i386/avx512f-orn-si-zmm-2.c | 2 +- gcc/testsuite/gcc.target/i386/pr100711-3.c | 2 +- gcc/testsuite/gcc.target/i386/pr100711-4.c | 2 +- gcc/testsuite/gcc.target/i386/pr100711-5.c | 2 +- 8 files changed, 39 insertions(+), 10 deletions(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index eccad080f7c..dd2c3a8718e 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -25606,7 +25606,7 @@ ix86_ternlog_idx (rtx op, rtx *args) case VEC_DUPLICATE: if (!bcst_mem_operand (op, GET_MODE (op))) return -1; - /* FALLTHRU */ + goto do_mem_operand; case MEM: if (!memory_operand (op, GET_MODE (op))) @@ -25618,23 +25618,52 @@ ix86_ternlog_idx (rtx op, rtx *args) /* FALLTHRU */ case CONST_VECTOR: +do_mem_operand: if (!args[2]) { args[2] = op; return 0xaa; } /* Maximum of one volatile memory reference per expression. */ - if (side_effects_p (op) && side_effects_p (args[2])) + if (side_effects_p (op)) return -1; if (rtx_equal_p (op, args[2])) return 0xaa; - /* Check if one CONST_VECTOR is the ones-complement of the other. */ + /* Check if CONST_VECTOR is the ones-complement of args[2]. */ if (GET_CODE (op) == CONST_VECTOR && GET_CODE (args[2]) == CONST_VECTOR && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op), op, GET_MODE (op)), args[2])) return 0x55; + if (!args[0]) + { + args[0] = op; + return 0xf0; + } + if (rtx_equal_p (op, args[0])) + return 0xf0; + /* Check if CONST_VECTOR is the ones-complement of args[0]. 
*/ + if (GET_CODE (op) == CONST_VECTOR + && GET_CODE (args[0]) == CONST_VECTOR + && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op), + op, GET_MODE (op)), + args[0])) + return 0x0f; + if (!args[1]) + { + args[1] = op; + return 0xcc; + } + if (rtx_equal_p (op, args[1])) + return 0xcc; + /* Check if CONST_VECTOR is the ones-complement of args[1]. */ + if (GET_CODE (op) == CONST_VECTOR + && GET_CODE (args[1]) == CONST_VECTOR + && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op), + op, GET_MODE (op)), +
[gcc r15-1702] i386: Handle sign_extend like zero_extend in *concatditi3_[346]
https://gcc.gnu.org/g:07e915913b6b3d4e6e210f6dbc8e7e0e8ea594c4 commit r15-1702-g07e915913b6b3d4e6e210f6dbc8e7e0e8ea594c4 Author: Roger Sayle Date: Fri Jun 28 07:16:07 2024 +0100 i386: Handle sign_extend like zero_extend in *concatditi3_[346] This patch generalizes some of the patterns in i386.md that recognize double word concatenation, so they handle sign_extend the same way that they handle zero_extend in appropriate contexts. As a motivating example consider the following function: __int128 foo(long long x, unsigned long long y) { return ((__int128)x<<64) | y; } when compiled with -O2, x86_64 currently generates: foo:movq%rdi, %rdx xorl%eax, %eax xorl%edi, %edi orq %rsi, %rax orq %rdi, %rdx ret with this patch we now generate (the same as if x is unsigned): foo:movq%rsi, %rax movq%rdi, %rdx ret Treating both extensions the same way using any_extend is valid as the top (extended) bits are "unused" after the shift by 64 (or more). In theory, the RTL optimizers might consider canonicalizing the form of extension used in these cases, but zero_extend is faster on some machine, whereas sign extension is supported via addressing modes on others, so handling both in the machine description is probably best. 2024-06-28 Roger Sayle gcc/ChangeLog * config/i386/i386.md (*concat3_3): Change zero_extend to any_extend in first operand to left shift by mode precision. (*concat3_4): Likewise. (*concat3_6): Likewise. gcc/testsuite/ChangeLog * gcc.target/i386/concatditi-1.c: New test case. 
Diff: --- gcc/config/i386/i386.md | 6 +++--- gcc/testsuite/gcc.target/i386/concatditi-1.c | 10 ++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index fd48e764469..b6ccb1e798d 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -13446,7 +13446,7 @@ [(set (match_operand: 0 "nonimmediate_operand" "=ro,r,r,&r,x") (any_or_plus: (ashift: - (zero_extend: + (any_extend: (match_operand:DWIH 1 "nonimmediate_operand" "r,m,r,m,x")) (match_operand:QI 2 "const_int_operand")) (zero_extend: @@ -13473,7 +13473,7 @@ (zero_extend: (match_operand:DWIH 1 "nonimmediate_operand" "r,m,r,m")) (ashift: - (zero_extend: + (any_extend: (match_operand:DWIH 2 "nonimmediate_operand" "r,r,m,m")) (match_operand:QI 3 "const_int_operand"] "INTVAL (operands[3]) == * BITS_PER_UNIT" @@ -13520,7 +13520,7 @@ [(set (match_operand: 0 "nonimmediate_operand" "=r,o,o,r") (any_or_plus: (ashift: - (zero_extend: + (any_extend: (match_operand:DWIH 1 "nonimmediate_operand" "r,r,r,m")) (match_operand:QI 2 "const_int_operand")) (match_operand: 3 "const_scalar_int_operand" "n,n,Wd,n")))] diff --git a/gcc/testsuite/gcc.target/i386/concatditi-1.c b/gcc/testsuite/gcc.target/i386/concatditi-1.c new file mode 100644 index 000..25c2a95586b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/concatditi-1.c @@ -0,0 +1,10 @@ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2" } */ + +__int128 foo(long long x, unsigned long long y) +{ + return ((__int128)x<<64) | y; +} + +/* { dg-final { scan-assembler-not "xorl" } } */ +/* { dg-final { scan-assembler-not "orq" } } */
[gcc r15-1751] i386: Additional peephole2 to use lea in round-up integer division.
https://gcc.gnu.org/g:142b5263b18be96e5d9ce406ad2c1b6ab35c190f commit r15-1751-g142b5263b18be96e5d9ce406ad2c1b6ab35c190f Author: Roger Sayle Date: Mon Jul 1 12:18:26 2024 +0100 i386: Additional peephole2 to use lea in round-up integer division. A common idiom for implementing an integer division that rounds upwards is to write (x + y - 1) / y. Conveniently on x86, the two additions to form the numerator can be performed by a single lea instruction, and indeed gcc currently generates a lea when both x and y are both registers. int foo(int x, int y) { return (x+y-1)/y; } generates with -O2: foo:leal-1(%rsi,%rdi), %eax // 4 bytes cltd idivl %esi ret Oddly, however, if x is a memory, gcc currently uses two instructions: int m; int bar(int y) { return (m+y-1)/y; } generates: foo:movlm(%rip), %eax addl%edi, %eax // 2 bytes subl$1, %eax// 3 bytes cltd idivl %edi ret This discrepancy is caused by the late decision (in peephole2) to split an addition with a memory operand, into a load followed by a reg-reg addition. This patch improves this situation by adding a peephole2 to recognize consecutive additions and transform them into lea if profitable. My first attempt at fixing this was to use a define_insn_and_split: (define_insn_and_split "*lea3_reg_mem_imm" [(set (match_operand:SWI48 0 "register_operand") (plus:SWI48 (plus:SWI48 (match_operand:SWI48 1 "register_operand") (match_operand:SWI48 2 "memory_operand")) (match_operand:SWI48 3 "x86_64_immediate_operand")))] "ix86_pre_reload_split ()" "#" "&& 1" [(set (match_dup 4) (match_dup 2)) (set (match_dup 0) (plus:SWI48 (plus:SWI48 (match_dup 1) (match_dup 4)) (match_dup 3)))] "operands[4] = gen_reg_rtx (mode);") using combine to combine instructions. Unfortunately, this approach interferes with (reload's) subtle balance of deciding when to use/avoid lea, which can be observed as a code size regression in CSiBE. The peephole2 approach (proposed here) uniformly improves CSiBE results. 
2024-07-01 Roger Sayle gcc/ChangeLog * config/i386/i386.md (peephole2): Transform two consecutive additions into a 3-component lea if !TARGET_AVOID_LEA_FOR_ADDR. gcc/testsuite/ChangeLog * gcc.target/i386/lea-3.c: New test case. Diff: --- gcc/config/i386/i386.md | 15 +++ gcc/testsuite/gcc.target/i386/lea-3.c | 13 + 2 files changed, 28 insertions(+) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 59a889da304..0b6f6e75072 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -6332,6 +6332,21 @@ "TARGET_APX_NF && reload_completed" [(set (match_dup 0) (ashift:SWI48 (match_dup 0) (match_dup 1)))] "operands[1] = GEN_INT (exact_log2 (INTVAL (operands[1])));") + +;; The peephole2 pass may expose consecutive additions suitable for lea. +(define_peephole2 + [(parallel [(set (match_operand:SWI48 0 "register_operand") + (plus:SWI48 (match_dup 0) + (match_operand 1 "register_operand"))) + (clobber (reg:CC FLAGS_REG))]) + (parallel [(set (match_dup 0) + (plus:SWI48 (match_dup 0) + (match_operand 2 "x86_64_immediate_operand"))) + (clobber (reg:CC FLAGS_REG))])] + "!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun)" + [(set (match_dup 0) (plus:SWI48 (plus:SWI48 (match_dup 0) + (match_dup 1)) + (match_dup 2)))]) ;; Add instructions diff --git a/gcc/testsuite/gcc.target/i386/lea-3.c b/gcc/testsuite/gcc.target/i386/lea-3.c new file mode 100644 index 000..84e66b00fc2 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/lea-3.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +int m; + +int foo(int y) +{ + return (m+y-1)/y; +} + +/* { dg-final { scan-assembler "leal" } } */ +/* { dg-final { scan-assembler-not "addl" } } */ +/* { dg-final { scan-assembler-not "subl" } } */
[gcc r15-1752] testsuite: Fix -m32 gcc.target/i386/pr102464-vrndscaleph.c on RedHat.
https://gcc.gnu.org/g:589865a8e4f6bd26c622ea0ee0a38565a0d42e80 commit r15-1752-g589865a8e4f6bd26c622ea0ee0a38565a0d42e80 Author: Roger Sayle Date: Mon Jul 1 12:21:20 2024 +0100 testsuite: Fix -m32 gcc.target/i386/pr102464-vrndscaleph.c on RedHat. This patch fixes the 4 FAILs of gcc.target/i386/pr102464-vrndscaleph.c with --target_board='unix{-m32}' on RedHat 7.x. The issue is that this AVX512 test includes the system math.h, and on older systems this provides inline versions of floor, ceil and rint (for the 387). The work around is to define __NO_MATH_INLINES before #include (or alternatively use __builtin_floor, __builtin_ceil, etc.). 2024-07-01 Roger Sayle gcc/testsuite/ChangeLog PR middle-end/102464 * gcc.target/i386/pr102464-vrndscaleph.c: Define __NO_MATH_INLINES to resolve FAILs with -m32 on older RedHat systems. Diff: --- gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c b/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c index a76d9e7e376..9eb8124e3f5 100644 --- a/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c +++ b/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c @@ -1,6 +1,9 @@ /* PR target/102464. */ /* { dg-do compile } */ /* { dg-options "-Ofast -mavx512fp16 -mavx512vl -mprefer-vector-width=512" } */ +#ifndef __NO_MATH_INLINES +#define __NO_MATH_INLINES +#endif #include void foo (_Float16* __restrict a, _Float16* b)
[gcc r15-1835] i386: Add additional variant of bswaphisi2_lowpart peephole2.
https://gcc.gnu.org/g:727f8b142b7d5442af6c2e903293abc367a8de5f commit r15-1835-g727f8b142b7d5442af6c2e903293abc367a8de5f Author: Roger Sayle Date: Thu Jul 4 07:31:17 2024 +0100 i386: Add additional variant of bswaphisi2_lowpart peephole2. This patch adds an additional variation of the peephole2 used to convert bswaphisi2_lowpart into rotlhi3_1_slp, which converts xchgb %ah,%al into rotw if the flags register isn't live. The motivating example is: void ext(int x); void foo(int x) { ext((x&~0x)|((x>>8)&0xff)|((x&0xff)<<8)); } where GCC with -O2 currently produces: foo:movl%edi, %eax rolw$8, %ax movl%eax, %edi jmp ext The issue is that the original xchgb (bswaphisi2_lowpart) can only be performed in "Q" registers that allow the %?h register to be used, so reload generates the above two movl. However, it's later in peephole2 where we see that CC_FLAGS can be clobbered, so we can use a rotate word, which is more forgiving with register allocations. With the additional peephole2 proposed here, we now generate: foo:rolw$8, %di jmp ext 2024-07-04 Roger Sayle gcc/ChangeLog * config/i386/i386.md (bswaphisi2_lowpart peephole2): New peephole2 variant to eliminate register shuffling. gcc/testsuite/ChangeLog * gcc.target/i386/xchg-4.c: New test case. Diff: --- gcc/config/i386/i386.md| 24 gcc/testsuite/gcc.target/i386/xchg-4.c | 11 +++ 2 files changed, 35 insertions(+) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 4a44b69b5fc..b24c4fe5875 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -21489,6 +21489,30 @@ (clobber (reg:CC FLAGS_REG))])] "operands[0] = gen_lowpart (HImode, operands[0]);") +;; Variant of above peephole2 to improve register allocation. 
+(define_peephole2 + [(set (match_operand:SI 0 "general_reg_operand") +(match_operand:SI 1 "register_operand")) + (set (match_dup 0) + (ior:SI (and:SI (match_dup 0) + (const_int -65536)) + (lshiftrt:SI (bswap:SI (match_dup 0)) +(const_int 16 + (set (match_operand:SI 2 "general_reg_operand") (match_dup 0))] + "!(TARGET_USE_XCHGB || + TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun)) + && peep2_regno_dead_p (0, FLAGS_REG) + && peep2_reg_dead_p(3, operands[0])" + [(parallel +[(set (strict_low_part (match_dup 3)) + (rotate:HI (match_dup 3) (const_int 8))) + (clobber (reg:CC FLAGS_REG))])] +{ + if (!rtx_equal_p (operands[1], operands[2])) +emit_move_insn (operands[2], operands[1]); + operands[3] = gen_lowpart (HImode, operands[2]); +}) + (define_expand "paritydi2" [(set (match_operand:DI 0 "register_operand") (parity:DI (match_operand:DI 1 "register_operand")))] diff --git a/gcc/testsuite/gcc.target/i386/xchg-4.c b/gcc/testsuite/gcc.target/i386/xchg-4.c new file mode 100644 index 000..de099e79f5d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/xchg-4.c @@ -0,0 +1,11 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2" } */ + +void ext(int x); +void foo(int x) +{ +ext((x&~0x)|((x>>8)&0xff)|((x&0xff)<<8)); +} + +/* { dg-final { scan-assembler "rolw" } } */ +/* { dg-final { scan-assembler-not "mov" } } */
[gcc r15-1869] PR target/115751: Avoid force_reg in ix86_expand_ternlog.
https://gcc.gnu.org/g:9a7e3f57e1ab8e6e4cf5ea3c0998aa50c6220579 commit r15-1869-g9a7e3f57e1ab8e6e4cf5ea3c0998aa50c6220579 Author: Roger Sayle Date: Sat Jul 6 05:24:39 2024 +0100 PR target/115751: Avoid force_reg in ix86_expand_ternlog. This patch fixes a problem with splitting of complex AVX512 ternlog instructions on x86_64. A recent change allows the ternlog pattern to have multiple mem-like operands prior to reload, by emitting any "reloads" as necessary during split1, before register allocation. The issue is that this code calls force_reg to place the mem-like operand into a register, but unfortunately the vec_duplicate (broadcast) form of operands supported by ternlog isn't considered a "general_operand", i.e. supported by all instructions. This mismatch triggers an ICE in the middle-end's force_reg, even though the x86 supports loading these vec_duplicate operands into a vector register in a single (move) instruction. This patch resolves this problem by replacing force_reg with calls to gen_reg_rtx and emit_move (as the i386 backend, unlike the middle-end, knows these will be recognized by recog). 2024-07-06 Roger Sayle gcc/ChangeLog PR target/115751 * config/i386/i386-expand.cc (ix86_expand_ternlog): Avoid use of force_reg to "reload" non-register operands, as these may contain vec_duplicate (broadcast) operands that aren't supported by force_reg. Use (safer) gen_reg_rtx and emit_move instead. Diff: --- gcc/config/i386/i386-expand.cc | 15 +-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index a773b45bf03..bf79e59f811 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -26050,14 +26050,25 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, break; } - tmp0 = register_operand (op0, mode) ? op0 : force_reg (mode, op0); + if (!register_operand (op0, mode)) +{ + /* We can't use force_reg (mode, op0). 
*/ + tmp0 = gen_reg_rtx (GET_MODE (op0)); + emit_move_insn (tmp0,op0); +} + else +tmp0 = op0; if (GET_MODE (tmp0) != mode) tmp0 = gen_lowpart (mode, tmp0); if (!op1 || rtx_equal_p (op0, op1)) tmp1 = copy_rtx (tmp0); else if (!register_operand (op1, mode)) -tmp1 = force_reg (mode, op1); +{ + /* We can't use force_reg (mode, op1). */ + tmp1 = gen_reg_rtx (GET_MODE (op1)); + emit_move_insn (tmp1, op1); +} else tmp1 = op1; if (GET_MODE (tmp1) != mode)
[gcc r15-2000] i386: Some AVX512 ternlog expansion refinements.
https://gcc.gnu.org/g:6b5d263f2c90c3e22cdf576970c94bca268c5296 commit r15-2000-g6b5d263f2c90c3e22cdf576970c94bca268c5296 Author: Roger Sayle Date: Fri Jul 12 12:30:56 2024 +0100 i386: Some AVX512 ternlog expansion refinements. This patch replaces the calls to force_reg in ix86_expand_ternlog_binop and ix86_expand_ternlog with gen_reg_rtx and emit_move_insn. This patch also cleans up whitespace, consistently uses CONST_VECTOR_P instead of GET_CODE and tweaks checks for ix86_ternlog_leaf_p (for example where vpandn may take a memory operand). 2024-07-12 Roger Sayle Hongtao Liu gcc/ChangeLog * config/i386/i386-expand.cc (ix86_broadcast_from_constant): Use CONST_VECTOR_P instead of comparison against GET_CODE. (ix86_gen_bcst_mem): Likewise. (ix86_ternlog_leaf_p): Likewise. (ix86_ternlog_operand_p): ix86_ternlog_leaf_p is always true for vector_all_ones_operand. (ix86_expand_ternlog_bin_op): Use CONST_VECTOR_P instead of equality comparison against GET_CODE. Replace call to force_reg with gen_reg_rtx and emit_move_insn (for VEC_DUPLICATE broadcast). Check for !register_operand instead of memory_operand. Support CONST_VECTORs by calling force_const_mem. (ix86_expand_ternlog): Fix indentation whitespace. Allow ix86_ternlog_leaf_p as ix86_expand_ternlog_andnot's second operand. Use CONST_VECTOR_P instead of equality against GET_CODE. Use gen_reg_rtx and emit_move_insn for ~a, ~b and ~c cases. 
Diff: --- gcc/config/i386/i386-expand.cc | 126 + 1 file changed, 78 insertions(+), 48 deletions(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index abc702d3ff27..cfcfdd94e8f0 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -613,7 +613,7 @@ ix86_broadcast_from_constant (machine_mode mode, rtx op) return nullptr; rtx constant = get_pool_constant (XEXP (op, 0)); - if (GET_CODE (constant) != CONST_VECTOR) + if (!CONST_VECTOR_P (constant)) return nullptr; /* There could be some rtx like @@ -623,7 +623,7 @@ ix86_broadcast_from_constant (machine_mode mode, rtx op) { constant = simplify_subreg (mode, constant, GET_MODE (constant), 0); - if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR) + if (constant == nullptr || !CONST_VECTOR_P (constant)) return nullptr; } @@ -25561,7 +25561,7 @@ static rtx ix86_gen_bcst_mem (machine_mode mode, rtx x) { if (!TARGET_AVX512F - || GET_CODE (x) != CONST_VECTOR + || !CONST_VECTOR_P (x) || (!TARGET_AVX512VL && (GET_MODE_SIZE (mode) != 64 || !TARGET_EVEX512)) || !VALID_BCST_MODE_P (GET_MODE_INNER (mode)) @@ -25751,7 +25751,7 @@ ix86_ternlog_leaf_p (rtx op, machine_mode mode) problems splitting instructions. */ return register_operand (op, mode) || MEM_P (op) -|| GET_CODE (op) == CONST_VECTOR +|| CONST_VECTOR_P (op) || bcst_mem_operand (op, mode); } @@ -25801,8 +25801,7 @@ ix86_ternlog_operand_p (rtx op) op1 = XEXP (op, 1); /* Prefer pxor, or one_cmpl2. 
*/ if (ix86_ternlog_leaf_p (XEXP (op, 0), mode) - && (ix86_ternlog_leaf_p (op1, mode) - || vector_all_ones_operand (op1, mode))) + && ix86_ternlog_leaf_p (XEXP (op, 1), mode)) return false; break; @@ -25822,15 +25821,20 @@ ix86_expand_ternlog_binop (enum rtx_code code, machine_mode mode, if (GET_MODE (op1) != mode) op1 = gen_lowpart (mode, op1); - if (GET_CODE (op0) == CONST_VECTOR) + if (CONST_VECTOR_P (op0)) op0 = validize_mem (force_const_mem (mode, op0)); - if (GET_CODE (op1) == CONST_VECTOR) + if (CONST_VECTOR_P (op1)) op1 = validize_mem (force_const_mem (mode, op1)); - if (memory_operand (op0, mode)) + if (!register_operand (op0, mode)) { - if (memory_operand (op1, mode)) - op0 = force_reg (mode, op0); + if (!register_operand (op1, mode)) + { + /* We can't use force_reg (op0, mode). */ + rtx reg = gen_reg_rtx (mode); + emit_move_insn (reg, op0); + op0 = reg; + } else std::swap (op0, op1); } @@ -25849,6 +25853,8 @@ ix86_expand_ternlog_andnot (machine_mode mode, rtx op0, rtx op1, rtx target) op0 = gen_rtx_NOT (mode, op0); if (GET_MODE (op1) != mode) op1 = gen_lowpart (mode, op1); + if (CONST_VECTOR_P (op1)) +op1 = validize_mem (force_const_mem (mode, op1)); emit_move_insn (target, gen_rtx_AND (mode, op0, op1)); return target; } @@ -25885,9 +25891,9 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, { case 0x00: if ((!op0 || !side_effects_p (op0)) -
[gcc r15-2027] i386: Tweak i386-expand.cc to restore bootstrap on RHEL.
https://gcc.gnu.org/g:74e6dfb23163c2dd670d1d60fbf4c782e0b44b94 commit r15-2027-g74e6dfb23163c2dd670d1d60fbf4c782e0b44b94 Author: Roger Sayle Date: Sun Jul 14 17:22:27 2024 +0100 i386: Tweak i386-expand.cc to restore bootstrap on RHEL. This is a minor change to restore bootstrap on systems using gcc 4.8 as a host compiler. The fatal error is: In file included from gcc/gcc/coretypes.h:471:0, from gcc/gcc/config/i386/i386-expand.cc:23: gcc/gcc/config/i386/i386-expand.cc: In function 'void ix86_expand_fp_absneg_operator(rtx_code, machine_mode, rtx_def**)': ./insn-modes.h:315:75: error: temporary of non-literal type 'scalar_float_mode' in a constant expression #define HFmode (scalar_float_mode ((scalar_float_mode::from_int) E_HFmode)) ^ gcc/gcc/config/i386/i386-expand.cc:2179:8: note: in expansion of macro 'HFmode' case HFmode: ^ The solution is to use the E_?Fmode enumeration constants as case values in switch statements. 2024-07-14 Roger Sayle * config/i386/i386-expand.cc (ix86_expand_fp_absneg_operator): Use E_?Fmode enumeration constants in switch statement. (ix86_expand_copysign): Likewise. (ix86_expand_xorsign): Likewise. 
Diff: --- gcc/config/i386/i386-expand.cc | 26 +- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index cfcfdd94e8f0..9a31e6df2aa2 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -2176,19 +2176,19 @@ ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode, switch (mode) { - case HFmode: + case E_HFmode: use_sse = true; vmode = V8HFmode; break; - case BFmode: + case E_BFmode: use_sse = true; vmode = V8BFmode; break; - case SFmode: + case E_SFmode: use_sse = TARGET_SSE_MATH && TARGET_SSE; vmode = V4SFmode; break; - case DFmode: + case E_DFmode: use_sse = TARGET_SSE_MATH && TARGET_SSE2; vmode = V2DFmode; break; @@ -2330,19 +2330,19 @@ ix86_expand_copysign (rtx operands[]) switch (mode) { - case HFmode: + case E_HFmode: vmode = V8HFmode; break; - case BFmode: + case E_BFmode: vmode = V8BFmode; break; - case SFmode: + case E_SFmode: vmode = V4SFmode; break; - case DFmode: + case E_DFmode: vmode = V2DFmode; break; - case TFmode: + case E_TFmode: vmode = mode; break; default: @@ -2410,16 +2410,16 @@ ix86_expand_xorsign (rtx operands[]) switch (mode) { - case HFmode: + case E_HFmode: vmode = V8HFmode; break; - case BFmode: + case E_BFmode: vmode = V8BFmode; break; - case SFmode: + case E_SFmode: vmode = V4SFmode; break; - case DFmode: + case E_DFmode: vmode = V2DFmode; break; default:
[gcc r15-2053] PR tree-optimization/114661: Generalize MULT_EXPR recognition in match.pd.
https://gcc.gnu.org/g:df9451936c6c9e4faea371e3f188e1fc6b6d39e3 commit r15-2053-gdf9451936c6c9e4faea371e3f188e1fc6b6d39e3 Author: Roger Sayle Date: Tue Jul 16 07:58:28 2024 +0100 PR tree-optimization/114661: Generalize MULT_EXPR recognition in match.pd. This patch resolves PR tree-optimization/114661, by generalizing the set of expressions that we canonicalize to multiplication. This extends the optimization(s) contributed (by me) back in July 2021. https://gcc.gnu.org/pipermail/gcc-patches/2021-July/575999.html The existing transformation folds (X*C1)^(X< 3) __builtin_unreachable(); return c << 18 | c << 15 | c << 12 | c << 9 | c << 6 | c << 3 | c; } GCC on x86_64 with -O2 previously generated: mul:movzbl %dil, %edi leal(%rdi,%rdi,8), %edx leal0(,%rdx,8), %eax movl%edx, %ecx sall$15, %edx orl %edi, %eax sall$9, %ecx orl %ecx, %eax orl %edx, %eax ret with this patch we now generate: mul:movzbl %dil, %eax imull $299593, %eax, %eax ret 2024-07-16 Roger Sayle Richard Biener gcc/ChangeLog PR tree-optimization/114661 * match.pd ((X*C1)|(X*C2) to X*(C1+C2)): Allow optional useless type conversions around multiplications, such as those inserted by this transformation. gcc/testsuite/ChangeLog PR tree-optimization/114661 * gcc.dg/pr114661.c: New test case. 
Diff: --- gcc/match.pd| 43 + gcc/testsuite/gcc.dg/pr114661.c | 10 ++ 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/gcc/match.pd b/gcc/match.pd index 3759c64d461f..24a0bbead3e7 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -4171,30 +4171,39 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) Likewise, handle (X< 0 - && (tree_nonzero_bits (@0) & tree_nonzero_bits (@3)) == 0) - (with { wide_int wone = wi::one (TYPE_PRECISION (type)); + && (tree_nonzero_bits (@5) & tree_nonzero_bits (@3)) == 0) + (with { tree t = type; + if (!TYPE_OVERFLOW_WRAPS (t)) +t = unsigned_type_for (t); + wide_int wone = wi::one (TYPE_PRECISION (type)); wide_int c = wi::add (wi::to_wide (@2), wi::lshift (wone, wi::to_wide (@4))); } -(mult @1 { wide_int_to_tree (type, c); } +(convert (mult:t (convert:t @1) { wide_int_to_tree (t, c); }) (simplify - (op:c (mult:s@0 @1 INTEGER_CST@2) + (op:c (nop_convert?:s@3 (mult:s@0 (nop_convert? @1) INTEGER_CST@2)) @1) - (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type) - && (tree_nonzero_bits (@0) & tree_nonzero_bits (@1)) == 0) - (mult @1 -{ wide_int_to_tree (type, -wi::add (wi::to_wide (@2), 1)); }))) + (if (INTEGRAL_TYPE_P (type) + && (tree_nonzero_bits (@3) & tree_nonzero_bits (@1)) == 0) + (with { tree t = type; + if (!TYPE_OVERFLOW_WRAPS (t)) +t = unsigned_type_for (t); + wide_int c = wi::add (wi::to_wide (@2), 1); } +(convert (mult:t (convert:t @1) { wide_int_to_tree (t, c); }) (simplify (op (lshift:s@0 @1 INTEGER_CST@2) (lshift:s@3 @1 INTEGER_CST@4)) diff --git a/gcc/testsuite/gcc.dg/pr114661.c b/gcc/testsuite/gcc.dg/pr114661.c new file mode 100644 index ..e6b5c69dba86 --- /dev/null +++ b/gcc/testsuite/gcc.dg/pr114661.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-evrp" } */ + +unsigned mul(unsigned char c) { +if (c > 3) __builtin_unreachable(); +return c << 18 | c << 15 | +c << 12 | c << 9 | +c << 6 | c << 3 | c; +} +/* { dg-final { scan-tree-dump-times " \\* 299593" 1 "evrp" } } */
[gcc r15-2132] Implement a -ftrapping-math/-fsignaling-nans TODO in match.pd.
https://gcc.gnu.org/g:030186cabe8128e752619e101768cf8823a42c38 commit r15-2132-g030186cabe8128e752619e101768cf8823a42c38 Author: Roger Sayle Date: Thu Jul 18 08:27:36 2024 +0100 Implement a -ftrapping-math/-fsignaling-nans TODO in match.pd. I've been investigating some (float)i == CST optimizations for match.pd, and noticed there's already a TODO comment in match.pd that's relatively easy to implement. When CST is a NaN, we only need to worry about exceptions with flag_trapping_math, and equality/inequality tests for sNaN only behave differently to qNaN with -fsignaling-nans. These issues are related to PR 57371 and PR 106805 in bugzilla. 2024-07-18 Roger Sayle gcc/ChangeLog * match.pd ((FTYPE) N CMP CST): Only worry about exceptions with flag_trapping_math, and about signaling NaNs with HONOR_SNANS. gcc/testsuite/ChangeLog * c-c++-common/pr57371-4.c: Update comment. * c-c++-common/pr57371-5.c: Add missing testcases from pr57371-4.c and update for -fno-signaling-nans -fno-trapping-math. Diff: --- gcc/match.pd | 14 ++-- gcc/testsuite/c-c++-common/pr57371-4.c | 4 +--- gcc/testsuite/c-c++-common/pr57371-5.c | 42 +++--- 3 files changed, 47 insertions(+), 13 deletions(-) diff --git a/gcc/match.pd b/gcc/match.pd index 5cb399b87180..6818856991c6 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -6862,13 +6862,13 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) tree itype = TREE_TYPE (@0); format_helper fmt (REAL_MODE_FORMAT (TYPE_MODE (TREE_TYPE (@1; const REAL_VALUE_TYPE *cst = TREE_REAL_CST_PTR (@1); - /* Be careful to preserve any potential exceptions due to - NaNs. qNaNs are ok in == or != context. - TODO: relax under -fno-trapping-math or - -fno-signaling-nans. */ - bool exception_p - = real_isnan (cst) && (cst->signalling - || (cmp != EQ_EXPR && cmp != NE_EXPR)); + /* Be careful to preserve any potential exceptions due to NaNs. + qNaNs are ok in == or != context. 
*/ + bool exception_p = real_isnan (cst) + && flag_trapping_math + && ((cmp != EQ_EXPR && cmp != NE_EXPR) + || (cst->signalling + && HONOR_SNANS (TREE_TYPE (@1; } /* TODO: allow non-fitting itype and SNaNs when -fno-trapping-math. */ diff --git a/gcc/testsuite/c-c++-common/pr57371-4.c b/gcc/testsuite/c-c++-common/pr57371-4.c index f43f7c22419a..b0e539de4b9f 100644 --- a/gcc/testsuite/c-c++-common/pr57371-4.c +++ b/gcc/testsuite/c-c++-common/pr57371-4.c @@ -2,9 +2,7 @@ /* { dg-options "-O -fsignaling-nans -fdump-tree-original" } */ /* We can not get rid of comparison in tests below because of - pending NaN exceptions. - - TODO: avoid under -fno-trapping-math. */ + pending NaN exceptions. */ #define QNAN __builtin_nanf ("0") #define SNAN __builtin_nansf ("0") diff --git a/gcc/testsuite/c-c++-common/pr57371-5.c b/gcc/testsuite/c-c++-common/pr57371-5.c index 8e18b0a73138..77decbe5dff5 100644 --- a/gcc/testsuite/c-c++-common/pr57371-5.c +++ b/gcc/testsuite/c-c++-common/pr57371-5.c @@ -2,11 +2,10 @@ /* { dg-options "-O -fno-signaling-nans -fno-trapping-math -fdump-tree-original" } */ /* We can not get rid of comparison in tests below because of - pending NaN exceptions. - - TODO: avoid under -fno-trapping-math. */ + pending NaN exceptions. 
*/ #define QNAN __builtin_nanf ("0") +#define SNAN __builtin_nansf ("0") void nonfinite(unsigned short x) { { @@ -33,6 +32,43 @@ void nonfinite(unsigned short x) { /* { dg-final { scan-tree-dump "nonfinite_4 = 0" "original" } } */ } + { +volatile int nonfinite_5; +nonfinite_5 = (float) x > SNAN; +/* { dg-final { scan-tree-dump "nonfinite_5 = 0" "original" } } */ + } + + { +volatile int nonfinite_6; +nonfinite_6 = (float) x >= SNAN; +/* { dg-final { scan-tree-dump "nonfinite_6 = 0" "original" } } */ + } + + { +volatile int nonfinite_7; +nonfinite_7 = (float) x < SNAN; +/* { dg-final { scan-tree-dump "nonfinite_7 = 0" "original" } } */ + } + + { +volatile int nonfinite_8; +nonfinite_8 = (float) x <= SNAN; +/* { dg-final { scan-tree-dump "nonfinite_8 = 0" "original" } } */ + } + + { +volatile int nonfinite_9; +nonfinite_9 = (float) x == SNAN; +/* { dg-final { scan-tree-dump "nonfinite_9 = 0" "original" } } */ + } + + { +volatile int nonfinite_10; +nonfinite_10 = (float) x != SNAN; +/* { dg-final { scan-tree-dump "nonfinite_10 = 1" "original" } } * + */ + } + { volatile int nonfinite_11; nonfinite_11 = (float) x == QNAN;
[gcc r15-2359] Fold ctz(-x) and ctz(abs(x)) as ctz(x) in match.pd.
https://gcc.gnu.org/g:928116e94a5a8a995dffd926af58abfa7286e78e commit r15-2359-g928116e94a5a8a995dffd926af58abfa7286e78e Author: Roger Sayle Date: Sat Jul 27 15:16:19 2024 +0100 Fold ctz(-x) and ctz(abs(x)) as ctz(x) in match.pd. The subject line pretty much says it all; the count-trailing-zeros function of -X and abs(X) produce the same result as count-trailing-zeros of X. This transformation eliminates a negation which may potentially overflow with an equivalent expression that doesn't [much like the analogous abs(-X) simplification in match.pd]. I'd noticed this -X equivalence, which isn't mentioned in Hacker's Delight, investigating whether ranger's non_zero_bits can help determine whether an integer variable may be converted to a floating point type exactly (without raising FE_INEXACT), but it turns out this observation isn't novel, as (disappointingly) LLVM already performs this same folding. 2024-07-27 Roger Sayle Andrew Pinski gcc/ChangeLog * match.pd (ctz (-X) => ctz (X)): New simplification. (ctz (abs (X)) => ctz (X)): Likewise. gcc/testsuite/ChangeLog * gcc.dg/fold-ctz-1.c: New test case. * gcc.dg/fold-ctz-2.c: Likewise. Diff: --- gcc/match.pd | 6 ++ gcc/testsuite/gcc.dg/fold-ctz-1.c | 9 + gcc/testsuite/gcc.dg/fold-ctz-2.c | 9 + 3 files changed, 24 insertions(+) diff --git a/gcc/match.pd b/gcc/match.pd index b2e7d61790df..1c8601229e3d 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -9102,6 +9102,12 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) /* CTZ simplifications. */ (for ctz (CTZ) + /* ctz (-X) => ctz (X). ctz (abs (X)) => ctz (X). 
*/ + (for op (negate abs) + (simplify + (ctz (nop_convert?@0 (op @1))) +(with { tree t = TREE_TYPE (@0); } + (ctz (convert:t @1) (for op (ge gt le lt) cmp (eq eq ne ne) (simplify diff --git a/gcc/testsuite/gcc.dg/fold-ctz-1.c b/gcc/testsuite/gcc.dg/fold-ctz-1.c new file mode 100644 index ..dcc444cbbb6b --- /dev/null +++ b/gcc/testsuite/gcc.dg/fold-ctz-1.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +int foo(int x) +{ + return __builtin_ctz (-x); +} + +/* { dg-final { scan-tree-dump-not "-x_" "optimized"} } */ diff --git a/gcc/testsuite/gcc.dg/fold-ctz-2.c b/gcc/testsuite/gcc.dg/fold-ctz-2.c new file mode 100644 index ..c685698f31e5 --- /dev/null +++ b/gcc/testsuite/gcc.dg/fold-ctz-2.c @@ -0,0 +1,9 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +int foo(int x) +{ + return __builtin_ctz (__builtin_abs (x)); +} + +/* { dg-final { scan-tree-dump-not "ABS_EXPR" "optimized"} } */
[gcc r15-774] Avoid ICE in except.cc on targets that don't support exceptions.
https://gcc.gnu.org/g:26df7b4684e201e66c09dd018603a248ddc5f437 commit r15-774-g26df7b4684e201e66c09dd018603a248ddc5f437 Author: Roger Sayle Date: Wed May 22 13:48:52 2024 +0100 Avoid ICE in except.cc on targets that don't support exceptions. A number of testcases currently fail on nvptx with the ICE: during RTL pass: final openmp-simd-2.c: In function 'foo': openmp-simd-2.c:28:1: internal compiler error: in get_personality_function, at expr.cc:14037 28 | } | ^ 0x98a38f get_personality_function(tree_node*) /home/roger/GCC/nvptx-none/gcc/gcc/expr.cc:14037 0x969d3b output_function_exception_table(int) /home/roger/GCC/nvptx-none/gcc/gcc/except.cc:3226 0x9b760d rest_of_handle_final /home/roger/GCC/nvptx-none/gcc/gcc/final.cc:4252 The simple oversight in output_function_exception_table is that it calls get_personality_function (immediately) before checking the target's except_unwind_info hook (which on nvptx always returns UI_NONE). The (perhaps obvious) fix is to move the assignments of fname and personality after the tests that they are needed, and before their first use. 2024-05-22 Roger Sayle gcc/ChangeLog * except.cc (output_function_exception_table): Move call to get_personality_function after targetm_common.except_unwind_info check, to avoid ICE on targets that don't support exceptions. Diff: --- gcc/except.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gcc/except.cc b/gcc/except.cc index 2080fcc22e6..b5886e97be9 100644 --- a/gcc/except.cc +++ b/gcc/except.cc @@ -3222,9 +3222,6 @@ output_one_function_exception_table (int section) void output_function_exception_table (int section) { - const char *fnname = get_fnname_from_decl (current_function_decl); - rtx personality = get_personality_function (current_function_decl); - /* Not all functions need anything. 
*/ if (!crtl->uses_eh_lsda || targetm_common.except_unwind_info (&global_options) == UI_NONE) @@ -3234,6 +3231,9 @@ output_function_exception_table (int section) if (section == 1 && !crtl->eh.call_site_record_v[1]) return; + const char *fnname = get_fnname_from_decl (current_function_decl); + rtx personality = get_personality_function (current_function_decl); + if (personality) { assemble_external_libcall (personality);
[gcc r15-775] i386: Correct insn_cost of movabsq.
https://gcc.gnu.org/g:a3b16e73a2d5b2d4d20ef6f2fd164cea633bbec8 commit r15-775-ga3b16e73a2d5b2d4d20ef6f2fd164cea633bbec8 Author: Roger Sayle Date: Wed May 22 16:45:48 2024 +0100 i386: Correct insn_cost of movabsq. This single line patch fixes a strange quirk/glitch in i386's rtx_costs, which considers an instruction loading a 64-bit constant to be significantly cheaper than loading a 32-bit (or smaller) constant. Consider the two functions: unsigned long long foo() { return 0x0123456789abcdefULL; } unsigned int bar() { return 10; } and the corresponding lines from combine's dump file: insn_cost 1 for #: r98:DI=0x123456789abcdef insn_cost 4 for #: ax:SI=0xa The same issue can be seen in -dP assembler output. movabsq $81985529216486895, %rax# 5 [c=1 l=10] *movdi_internal/4 The problem is that pattern_costs interpretation of rtx_costs contains "return cost > 0 ? cost : COSTS_N_INSNS (1)" where a zero value (for example a register or small immediate constant) is considered special, and equivalent to a single instruction, but all other values are treated as verbatim. Hence to x86_64's 10-byte long movabsq instruction slightly more expensive than a simple constant, rtx_costs needs to return COSTS_N_INSNS(1)+1 and not 1. With this change, the insn_cost of movabsq is the intended value 5: insn_cost 5 for #: r98:DI=0x123456789abcdef and movabsq $81985529216486895, %rax# 5 [c=5 l=10] *movdi_internal/4 2024-05-22 Roger Sayle gcc/ChangeLog * config/i386/i386.cc (ix86_rtx_costs) : A CONST_INT that isn't x86_64_immediate_operand requires an extra (expensive) movabsq insn to load, so return COSTS_N_INSNS (1) + 1. 
Diff: --- gcc/config/i386/i386.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 69cd4ae05a7..3e2a3a194f1 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -21562,7 +21562,8 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, if (x86_64_immediate_operand (x, VOIDmode)) *total = 0; else - *total = 1; + /* movabsq is slightly more expensive than a simple instruction. */ + *total = COSTS_N_INSNS (1) + 1; return true; case CONST_DOUBLE:
[gcc r15-1100] i386: Improve handling of ternlog instructions in i386/sse.md
https://gcc.gnu.org/g:ec985bc97a01577bca8307f986caba7ba7633cde commit r15-1100-gec985bc97a01577bca8307f986caba7ba7633cde Author: Roger Sayle Date: Fri Jun 7 13:57:23 2024 +0100 i386: Improve handling of ternlog instructions in i386/sse.md This patch improves the way that the x86 backend recognizes and expands AVX512's bitwise ternary logic (vpternlog) instructions. As a motivating example consider the following code which calculates the carry out from a (binary) full adder: typedef unsigned long long v4di __attribute((vector_size(32))); v4di foo(v4di a, v4di b, v4di c) { return (a & b) | ((a ^ b) & c); } with -O2 -march=cascadelake current mainline produces: foo:vpternlogq $96, %ymm0, %ymm1, %ymm2 vmovdqa %ymm0, %ymm3 vmovdqa %ymm2, %ymm0 vpternlogq $248, %ymm3, %ymm1, %ymm0 ret with the patch below, we now generate a single instruction: foo:vpternlogq $232, %ymm2, %ymm1, %ymm0 ret The AVX512 vpternlog[qd] instructions are a very cool addition to the x86 instruction set, that can calculate any Boolean function of three inputs in a single fast instruction. As the truth table for any three-input function has 8 rows, any specific function can be represented by specifying those bits, i.e. by a 8-bit byte, an immediate integer between 0 and 256. Examples of ternary functions and their indices are given below: 0x01 1: ~((b|a)|c) 0x02 2: (~(b|a))&c 0x03 3: ~(b|a) 0x04 4: (~(c|a))&b 0x05 5: ~(c|a) 0x06 6: (c^b)&~a 0x07 7: ~((c&b)|a) 0x08 8: (~a&c)&b (~a&b)&c (c&b)&~a 0x09 9: ~((c^b)|a) 0x0a 10: ~a&c 0x0b 11: ~((~c&b)|a) (~b|c)&~a 0x0c 12: ~a&b 0x0d 13: ~((~b&c)|a) (~c|b)&~a 0x0e 14: (c|b)&~a 0x0f 15: ~a 0x10 16: (~(c|b))&a 0x11 17: ~(c|b) ... 
0xf4 244: (~c&b)|a 0xf5 245: ~c|a 0xf6 246: (c^b)|a 0xf7 247: (~(c&b))|a 0xf8 248: (c&b)|a 0xf9 249: (~(c^b))|a 0xfa 250: c|a 0xfb 251: (c|a)|~b (~b|a)|c (~b|c)|a 0xfc 252: b|a 0xfd 253: (b|a)|~c (~c|a)|b (~c|b)|a 0xfe 254: (b|a)|c (c|a)|b (c|b)|a A naive implementation (in many compilers) might be add define_insn patterns for all 256 different functions. The situation is even worse as many of these Boolean functions don't have a "canonical form" (as produced by simplify_rtx) and would each need multiple patterns. See the space-separated equivalent expressions in the table above. This need to provide instruction "templates" might explain why GCC, LLVM and ICC all exhibit similar coverage problems in their ability to recognize x86 ternlog ternary functions. Perhaps a unique feature of GCC's design is that in addition to regular define_insn templates, machine descriptions can also perform pattern matching via a match_operator (and its corresponding predicate). This patch introduces a ternlog_operand predicate that matches a (possibly infinite) set of expression trees, identifying those that have at most three unique operands. This then allows a define_insn_and_split to recognize suitable expressions and then transform them into the appropriate UNSPEC_VTERNLOG as a pre-reload splitter. This design allows combine to smash together arbitrarily complex Boolean expressions, then transform them into an UNSPEC before register allocation. As an "optimization", where possible ix86_expand_ternlog generates a simpler binary operation, using AND, XOR, IOR or ANDN where possible, and in a few cases attempts to "canonicalize" the ternlog, by reordering or duplicating operands, so that later CSE passes have a hope of spotting equivalent values. This patch leaves the existing ternlog patterns in sse.md (for now), many of which are made obsolete by these changes. In theory we now only need one define_insn for UNSPEC_VTERNLOG. 
One complication from these previous variants was that they inconsistently used decimal vs. hexadecimal to specify the immediate constant operand in assembly language, making the list of tweaks to the testsuite with this patch larger than it might have been. I propose to remove the vestigial patterns in a follow-up patch, once this approach has baked (proven to be stable) on mainline. 2024-06-07 Roger Sayle Hongtao Liu gcc/ChangeLog * config/i386/i386-expand.cc (ix86_expand_args_builtin): Call fixup_modeless_constant before testing predicates. Only call copy_to_mode_reg on memory operands (after the first one). (ix86_gen_bcst_mem): Helper function to convert a CONST_VECTOR into a VEC_DUPLICATE if possible. (ix86_tern
[gcc r15-1101] i386: PR target/115351: RTX costs for *concatditi3 and *insvti_highpart.
https://gcc.gnu.org/g:fb3e4c549d16d5050e10114439ad77149f33c597 commit r15-1101-gfb3e4c549d16d5050e10114439ad77149f33c597 Author: Roger Sayle Date: Fri Jun 7 14:03:20 2024 +0100 i386: PR target/115351: RTX costs for *concatditi3 and *insvti_highpart. This patch addresses PR target/115351, which is a code quality regression on x86 when passing floating point complex numbers. The ABI considers these arguments to have TImode, requiring interunit moves to place the FP values (which are actually passed in SSE registers) into the upper and lower parts of a TImode pseudo, and then similar moves back again before they can be used. The cause of the regression is that changes in how TImode initialization is represented in RTL now prevents the RTL optimizers from eliminating these redundant moves. The specific cause is that the *concatditi3 pattern, (zext(hi)<<64)|zext(lo), has an inappropriately high (default) rtx_cost, preventing fwprop1 from propagating it. This pattern just sets the hipart and lopart of a double-word register, typically two instructions (less if reload can allocate things appropriately) but the current ix86_rtx_costs actually returns INSN_COSTS(13), i.e. 52. propagating insn 5 into insn 6, replacing: (set (reg:TI 110) (ior:TI (and:TI (reg:TI 110) (const_wide_int 0x0)) (ashift:TI (zero_extend:TI (subreg:DI (reg:DF 112 [ zD.2796+8 ]) 0)) (const_int 64 [0x40] successfully matched this instruction to *concatditi3_3: (set (reg:TI 110) (ior:TI (ashift:TI (zero_extend:TI (subreg:DI (reg:DF 112 [ zD.2796+8 ]) 0)) (const_int 64 [0x40])) (zero_extend:TI (subreg:DI (reg:DF 111 [ zD.2796 ]) 0 change not profitable (cost 50 -> cost 52) This issue is resolved by having ix86_rtx_costs return more reasonable values for these (place-holder) patterns. 2024-06-07 Roger Sayle gcc/ChangeLog PR target/115351 * config/i386/i386.cc (ix86_rtx_costs): Provide estimates for the *concatditi3 and *insvti_highpart patterns, about two insns. 
gcc/testsuite/ChangeLog PR target/115351 * g++.target/i386/pr115351.C: New test case. Diff: --- gcc/config/i386/i386.cc | 43 gcc/testsuite/g++.target/i386/pr115351.C | 19 ++ 2 files changed, 62 insertions(+) diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 4126ab24a79..173db213d14 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -21912,6 +21912,49 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno, } *total = ix86_vec_cost (mode, cost->sse_op); } + else if (TARGET_64BIT + && mode == TImode + && GET_CODE (XEXP (x, 0)) == ASHIFT + && GET_CODE (XEXP (XEXP (x, 0), 0)) == ZERO_EXTEND + && GET_MODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == DImode + && CONST_INT_P (XEXP (XEXP (x, 0), 1)) + && INTVAL (XEXP (XEXP (x, 0), 1)) == 64 + && GET_CODE (XEXP (x, 1)) == ZERO_EXTEND + && GET_MODE (XEXP (XEXP (x, 1), 0)) == DImode) + { + /* *concatditi3 is cheap. */ + rtx op0 = XEXP (XEXP (XEXP (x, 0), 0), 0); + rtx op1 = XEXP (XEXP (x, 1), 0); + *total = (SUBREG_P (op0) && GET_MODE (SUBREG_REG (op0)) == DFmode) + ? COSTS_N_INSNS (1)/* movq. */ + : set_src_cost (op0, DImode, speed); + *total += (SUBREG_P (op1) && GET_MODE (SUBREG_REG (op1)) == DFmode) + ? COSTS_N_INSNS (1)/* movq. */ + : set_src_cost (op1, DImode, speed); + return true; + } + else if (TARGET_64BIT + && mode == TImode + && GET_CODE (XEXP (x, 0)) == AND + && REG_P (XEXP (XEXP (x, 0), 0)) + && CONST_WIDE_INT_P (XEXP (XEXP (x, 0), 1)) + && CONST_WIDE_INT_NUNITS (XEXP (XEXP (x, 0), 1)) == 2 + && CONST_WIDE_INT_ELT (XEXP (XEXP (x, 0), 1), 0) == -1 + && CONST_WIDE_INT_ELT (XEXP (XEXP (x, 0), 1), 1) == 0 + && GET_CODE (XEXP (x, 1)) == ASHIFT + && GET_CODE (XEXP (XEXP (x, 1), 0)) == ZERO_EXTEND + && GET_MODE (XEXP (XEXP (XEXP (x, 1), 0), 0)) == DImode + && CONST_INT_P (XEXP (XEXP (x, 1), 1)) + && INTVAL (XEXP (XEXP (x, 1), 1)) == 64) + { + /* *insvti_highpart is cheap. 
*/ + rtx op = XEXP (XEXP (XEXP (x, 1), 0), 0); + *total = COSTS_N_INSNS (1) + 1; + *total += (SUBREG_P (op) && GET_MODE (SUBREG_REG (op)) == DFmode) + ? COSTS_N_INSNS (1)/* movq. */ + : set_src_cost (op, DImode, speed); + return true
[gcc r15-1111] analyzer: Restore g++ 4.8 bootstrap; use std::move to return std::unique_ptr.
https://gcc.gnu.org/g:e22b7f741ab54ff3a3f8a676ce9e7414fe174958 commit r15-1111-ge22b7f741ab54ff3a3f8a676ce9e7414fe174958 Author: Roger Sayle Date: Sat Jun 8 05:01:38 2024 +0100 analyzer: Restore g++ 4.8 bootstrap; use std::move to return std::unique_ptr. This patch restores bootstrap when using g++ 4.8 as a host compiler. Returning a std::unique_ptr requires a std::move on C++ compilers (pre-C++17) that don't guarantee copy elision/return value optimization. 2024-06-08 Roger Sayle gcc/analyzer/ChangeLog * constraint-manager.cc (equiv_class::make_dump_widget): Use std::move to return a std::unique_ptr. (bounded_ranges_constraint::make_dump_widget): Likewise. (constraint_manager::make_dump_widget): Likewise. * program-state.cc (sm_state_map::make_dump_widget): Likewise. (program_state::make_dump_widget): Likewise. * region-model.cc (region_to_value_map::make_dump_widget): Likewise. (region_model::make_dump_widget): Likewise. * region.cc (region::make_dump_widget): Likewise. * store.cc (binding_cluster::make_dump_widget): Likewise. (store::make_dump_widget): Likewise. * svalue.cc (svalue::make_dump_widget): Likewise. Diff: --- gcc/analyzer/constraint-manager.cc | 6 +++--- gcc/analyzer/program-state.cc | 4 ++-- gcc/analyzer/region-model.cc | 4 ++-- gcc/analyzer/region.cc | 2 +- gcc/analyzer/store.cc | 4 ++-- gcc/analyzer/svalue.cc | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/gcc/analyzer/constraint-manager.cc b/gcc/analyzer/constraint-manager.cc index 707385d3fa6..883f33b2cdd 100644 --- a/gcc/analyzer/constraint-manager.cc +++ b/gcc/analyzer/constraint-manager.cc @@ -1176,7 +1176,7 @@ equiv_class::make_dump_widget (const text_art::dump_widget_info &dwi, ec_widget->add_child (tree_widget::make (dwi, &pp)); } - return ec_widget; + return std::move (ec_widget); } /* Generate a hash value for this equiv_class. 
@@ -1500,7 +1500,7 @@ make_dump_widget (const text_art::dump_widget_info &dwi) const (tree_widget::from_fmt (dwi, nullptr, "ec%i bounded ranges", m_ec_id.as_int ())); m_ranges->add_to_dump_widget (*brc_widget.get (), dwi); - return brc_widget; + return std::move (brc_widget); } bool @@ -1853,7 +1853,7 @@ constraint_manager::make_dump_widget (const text_art::dump_widget_info &dwi) con if (cm_widget->get_num_children () == 0) return nullptr; - return cm_widget; + return std::move (cm_widget); } /* Attempt to add the constraint LHS OP RHS to this constraint_manager. diff --git a/gcc/analyzer/program-state.cc b/gcc/analyzer/program-state.cc index dc2d4bdf7b0..efaf569a490 100644 --- a/gcc/analyzer/program-state.cc +++ b/gcc/analyzer/program-state.cc @@ -382,7 +382,7 @@ sm_state_map::make_dump_widget (const text_art::dump_widget_info &dwi, state_widget->add_child (tree_widget::make (dwi, pp)); } - return state_widget; + return std::move (state_widget); } /* Return true if no states have been set within this map @@ -1247,7 +1247,7 @@ program_state::make_dump_widget (const text_art::dump_widget_info &dwi) const state_widget->add_child (smap->make_dump_widget (dwi, m_region_model)); } - return state_widget; + return std::move (state_widget); } /* Update this program_state to reflect a top-level call to FUN. 
diff --git a/gcc/analyzer/region-model.cc b/gcc/analyzer/region-model.cc index a25181f2a3e..1a44ff073bd 100644 --- a/gcc/analyzer/region-model.cc +++ b/gcc/analyzer/region-model.cc @@ -288,7 +288,7 @@ make_dump_widget (const text_art::dump_widget_info &dwi) const sval->dump_to_pp (pp, true); w->add_child (text_art::tree_widget::make (dwi, pp)); } - return w; + return std::move (w); } /* Attempt to merge THIS with OTHER, writing the result @@ -556,7 +556,7 @@ region_model::make_dump_widget (const text_art::dump_widget_info &dwi) const m_mgr->get_store_manager ())); model_widget->add_child (m_constraints->make_dump_widget (dwi)); model_widget->add_child (m_dynamic_extents.make_dump_widget (dwi)); - return model_widget; + return std::move (model_widget); } /* Assert that this object is valid. */ diff --git a/gcc/analyzer/region.cc b/gcc/analyzer/region.cc index 1fc42f2cd97..d5cfd476fd8 100644 --- a/gcc/analyzer/region.cc +++ b/gcc/analyzer/region.cc @@ -1101,7 +1101,7 @@ region::make_dump_widget (const text_art::dump_widget_info &dwi, if (m_parent) w->add_child (m_parent->make_dump_widget (dwi, "parent")); - return w; + return std::move (w); } void diff --git a/gcc/analyzer/store.cc b/gcc/analyzer/store.cc index d5c1a9f6aff..5a33d740ce2 100644 --- a/gcc/analyzer/store.cc +++ b/gcc/analyzer/store.cc @@ -1489,7 +1489,7 @@ binding_cluster::mak
[gcc r15-1175] i386: PR target/115397: AVX512 ternlog vs. -m32 -fPIC constant pool.
https://gcc.gnu.org/g:a797398cfbc75899fdb7d97436c0c89c02b133c0 commit r15-1175-ga797398cfbc75899fdb7d97436c0c89c02b133c0 Author: Roger Sayle Date: Tue Jun 11 09:31:34 2024 +0100 i386: PR target/115397: AVX512 ternlog vs. -m32 -fPIC constant pool. This patch fixes PR target/115397, a recent regression caused by my ternlog patch that results in an ICE (building numpy) with -m32 -fPIC. The problem is that ix86_broadcast_from_constant, which calls get_pool_constant, doesn't handle the UNSPEC_GOTOFF that's created by calling validize_mem when using -fPIC on i686. The logic here is a bit convoluted (and my future patches will clean some of this up), but the simplest fix is to call ix86_broadcast_from_constant between the calls to force_const_mem and the call to validize_mem. Perhaps a better solution might be to call targetm.delegitimize_address from the middle-end's get_pool_constant, but ultimately the best approach would be to not place things in the constant pool if we don't need to. My plans to move (broadcast) constant handling from expand to split1 should simplify this. 2024-06-11 Roger Sayle gcc/ChangeLog PR target/115397 * config/i386/i386-expand.cc (ix86_expand_ternlog): Move call to ix86_broadcast_from_constant before call to validize_mem, but after call to force_const_mem. gcc/testsuite/ChangeLog PR target/115397 * gcc.target/i386/pr115397.c: New test case. 
Diff: --- gcc/config/i386/i386-expand.cc | 3 ++- gcc/testsuite/gcc.target/i386/pr115397.c | 17 + 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 9b60264dce2..312329e550b 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -26041,8 +26041,9 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, tmp2 = ix86_gen_bcst_mem (mode, op2); if (!tmp2) { - tmp2 = validize_mem (force_const_mem (mode, op2)); + tmp2 = force_const_mem (mode, op2); rtx bcast = ix86_broadcast_from_constant (mode, tmp2); + tmp2 = validize_mem (tmp2); if (bcast) { rtx reg2 = gen_reg_rtx (mode); diff --git a/gcc/testsuite/gcc.target/i386/pr115397.c b/gcc/testsuite/gcc.target/i386/pr115397.c new file mode 100644 index 000..27835782b78 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr115397.c @@ -0,0 +1,17 @@ +/* { dg-do compile { target ia32 } } */ +/* { dg-options "-fPIC -mavx512f -O3" } */ + +int LONG_divide_AVX512F_dimensions_0; +void npy_set_floatstatus_overflow(); +void LONG_divide_AVX512F() { + long *src; + int raise_err = 0; + for (; LONG_divide_AVX512F_dimensions_0; + --LONG_divide_AVX512F_dimensions_0, ++src) { +long a = *src; +if (a) + raise_err = 1; + } + if (raise_err) +npy_set_floatstatus_overflow(); +}
[gcc r15-1306] i386: More use of m{32, 64}bcst addressing modes with ternlog.
https://gcc.gnu.org/g:c129a34dc8e69f7b34cf72835aeba2cefbb8673a commit r15-1306-gc129a34dc8e69f7b34cf72835aeba2cefbb8673a Author: Roger Sayle Date: Fri Jun 14 06:29:27 2024 +0100 i386: More use of m{32,64}bcst addressing modes with ternlog. This patch makes more use of m32bcst and m64bcst addressing modes in ix86_expand_ternlog. Previously, the i386 backend would only consider using a m32bcst if the inner mode of the vector was 32-bits, or using m64bcst if the inner mode was 64-bits. For ternlog (and other logic operations) this is a strange restriction, as how the same constant is materialized is dependent upon the mode it is used/operated on. Hence, the V16QI constant {2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2} wouldn't use m??bcst, but (V4SI){0x02020202,0x02020202,0x02020202,0x02020202} which has the same bit pattern would. This can optimized by (re)checking whether a CONST_VECTOR can be broadcast from memory after casting it to VxSI (or for m64bst to VxDI) where x has the appropriate vector size. 
Taking the test case from pr115407: __attribute__((__vector_size__(64))) char v; void foo() { v = v | v << 7; } Compiled with -O2 -mcmodel=large -mavx512bw GCC 14 generates a 64-byte (512-bit) load from the constant pool: foo:movabsq $v, %rax// 10 movabsq $.LC0, %rdx // 10 vpsllw $7, (%rax), %zmm1 // 7 vmovdqa64 (%rax), %zmm0 // 6 vpternlogd $248, (%rdx), %zmm1, %zmm0 // 7 vmovdqa64 %zmm0, (%rax) // 6 vzeroupper // 3 ret // 1 .LC0: .byte -12 // 64 = 114 bytes .byte -128 ;; repeated another 62 times mainline currently generates two instructions, using interunit broadcast: foo:movabsq $v, %rdx// 10 movl$-2139062144, %eax // 5 vmovdqa64 (%rdx), %zmm2 // 6 vpbroadcastd%eax, %zmm0 // 6 vpsllw $7, %zmm2, %zmm1// 7 vpternlogd $236, %zmm0, %zmm2, %zmm1 // 7 vmovdqa64 %zmm1, (%rdx) // 6 vzeroupper // 3 ret // 1 = 51 bytes With this patch, we now generate a broadcast addressing mode: foo:movabsq $v, %rax // 10 movabsq $.LC1, %rdx// 10 vmovdqa64 (%rax), %zmm1 // 6 vpsllw $7, %zmm1, %zmm0 // 7 vpternlogd $236, (%rdx){1to16}, %zmm1, %zmm0 // 7 vmovdqa64 %zmm0, (%rax) // 6 vzeroupper // 3 ret// 1 = 50 total Without -mcmodel=large, the benefit is two instructions: foo:vmovdqa64 v(%rip), %zmm1 // 10 vpsllw $7, %zmm1, %zmm0 // 7 vpternlogd $236, .LC2(%rip){1to16}, %zmm1, %zmm0 // 11 vmovdqa64 %zmm0, v(%rip) // 10 vzeroupper // 3 ret// 1 = 42 total 2024-06-14 Roger Sayle gcc/ChangeLog * config/i386/i386-expand.cc (ix86_expand_ternlog): Try performing logic operation in a different vector mode if that enables use of a 32-bit or 64-bit broadcast addressing mode. gcc/testsuite/ChangeLog * gcc.target/i386/pr115407.c: New test case. 
Diff: --- gcc/config/i386/i386-expand.cc | 63 gcc/testsuite/gcc.target/i386/pr115407.c | 9 + 2 files changed, 72 insertions(+) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 312329e550b6..a4379b863170 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -26041,6 +26041,69 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx, tmp2 = ix86_gen_bcst_mem (mode, op2); if (!tmp2) { + machine_mode bcst32_mode = mode; + machine_mode bcst64_mode = mode; + switch (mode) + { + case V1TImode: + case V4SImode: + case V4SFmode: + case V8HImode: + case V
[gcc r15-1502] i386: Allow all register_operand SUBREGs in x86_ternlog_idx.
https://gcc.gnu.org/g:9a76db24e044c8058497051a652cca4228cbc8e9 commit r15-1502-g9a76db24e044c8058497051a652cca4228cbc8e9 Author: Roger Sayle Date: Thu Jun 20 16:30:15 2024 +0100 i386: Allow all register_operand SUBREGs in x86_ternlog_idx. This patch tweaks ix86_ternlog_idx to allow any SUBREG that matches the register_operand predicate, and is split out as an independent piece of a patch that I have to clean-up redundant ternlog patterns in sse.md. It turns out that some of these patterns aren't (yet) sufficiently redundant to be obsolete. The problem is that the "new" ternlog pattern has the restriction that it allows SUBREGs, but only those where the inner and outer modes are the same size, where regular patterns use "register_operand" which allows arbitrary including paradoxical SUBREGs. A motivating example is f2 in gcc.target/i386/avx512dq-abs-copysign-1.c void f2 (float x, float y) { register float a __asm ("xmm16"), b __asm ("xmm17"); a = x; b = y; asm volatile ("" : "+v" (a), "+v" (b)); a = __builtin_copysignf (a, b); asm volatile ("" : "+v" (a)); } for which combine tries: (set (subreg:V4SF (reg:SF 100 [ _3 ]) 0) (ior:V4SF (and:V4SF (not:V4SF (reg:V4SF 104)) (subreg:V4SF (reg:SF 110) 0)) (reg:V4SF 106))) where the SUBREG is paradoxical, with inner mode SF and outer mode V4SF. This patch allows the recently added ternlog_operand to accept this case. 2024-06-20 Roger Sayle gcc/ChangeLog * config/i386/i386-expand.cc (ix86_ternlog_idx): Allow any SUBREG that matches register_operand. Use rtx_equal_p to compare REG or SUBREG "leaf" operands. 
Diff: --- gcc/config/i386/i386-expand.cc | 17 - 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 5c29ee1353f7..ac423000ce67 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -25576,27 +25576,32 @@ ix86_ternlog_idx (rtx op, rtx *args) switch (GET_CODE (op)) { +case SUBREG: + if (!register_operand (op, GET_MODE (op))) + return -1; + /* FALLTHRU */ + case REG: if (!args[0]) { args[0] = op; return 0xf0; } - if (REGNO (op) == REGNO (args[0])) + if (rtx_equal_p (op, args[0])) return 0xf0; if (!args[1]) { args[1] = op; return 0xcc; } - if (REGNO (op) == REGNO (args[1])) + if (rtx_equal_p (op, args[1])) return 0xcc; if (!args[2]) { args[2] = op; return 0xaa; } - if (REG_P (args[2]) && REGNO (op) == REGNO (args[2])) + if (rtx_equal_p (op, args[2])) return 0xaa; return -1; @@ -25634,12 +25639,6 @@ ix86_ternlog_idx (rtx op, rtx *args) return 0x55; return -1; -case SUBREG: - if (GET_MODE_SIZE (GET_MODE (SUBREG_REG (op))) - != GET_MODE_SIZE (GET_MODE (op))) - return -1; - return ix86_ternlog_idx (SUBREG_REG (op), args); - case NOT: idx0 = ix86_ternlog_idx (XEXP (op, 0), args); return (idx0 >= 0) ? idx0 ^ 0xff : -1;
[gcc r15-1584] PR tree-optimization/113673: Avoid load merging when potentially trapping.
https://gcc.gnu.org/g:d8b05aef77443e1d3d8f3f5d2c56ac49a503fee3 commit r15-1584-gd8b05aef77443e1d3d8f3f5d2c56ac49a503fee3 Author: Roger Sayle Date: Mon Jun 24 15:34:03 2024 +0100 PR tree-optimization/113673: Avoid load merging when potentially trapping. This patch fixes PR tree-optimization/113673, a P2 ice-on-valid regression caused by load merging of (ptr[0]<<8)+ptr[1] when -ftrapv has been specified. When the operator is | or ^ this is safe, but for addition of signed integer types, a trap may be generated/required, so merging this idiom into a single non-trapping instruction is inappropriate, confusing the compiler by transforming a basic block with an exception edge into one without. This revision implements Richard Biener's feedback to add an early check for stmt_can_throw_internal (cfun, stmt) to prevent transforming in the presence of any statement that could trap, not just overflow on addition. The one other tweak included in this patch is to mark the local function find_bswap_or_nop_load as static ensuring that it isn't called from outside this file, and guaranteeing that it is dominated by stmt_can_throw_internal checking. 2024-06-24 Roger Sayle Richard Biener gcc/ChangeLog PR tree-optimization/113673 * gimple-ssa-store-merging.cc (find_bswap_or_nop_load): Make static. (find_bswap_or_nop_1): Avoid transformations (load merging) when stmt_can_throw_internal indicates that a statement can trap. gcc/testsuite/ChangeLog PR tree-optimization/113673 * g++.dg/pr113673.C: New test case. Diff: --- gcc/gimple-ssa-store-merging.cc | 6 -- gcc/testsuite/g++.dg/pr113673.C | 14 ++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/gcc/gimple-ssa-store-merging.cc b/gcc/gimple-ssa-store-merging.cc index cb0cb5f42f6..7dba4a7a781 100644 --- a/gcc/gimple-ssa-store-merging.cc +++ b/gcc/gimple-ssa-store-merging.cc @@ -363,7 +363,7 @@ init_symbolic_number (struct symbolic_number *n, tree src) the answer. 
If so, REF is that memory source and the base of the memory area accessed and the offset of the access from that base are recorded in N. */ -bool +static bool find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n) { /* Leaf node is an array or component ref. Memorize its base and @@ -610,7 +610,9 @@ find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit) gimple *rhs1_stmt, *rhs2_stmt, *source_stmt1; enum gimple_rhs_class rhs_class; - if (!limit || !is_gimple_assign (stmt)) + if (!limit + || !is_gimple_assign (stmt) + || stmt_can_throw_internal (cfun, stmt)) return NULL; rhs1 = gimple_assign_rhs1 (stmt); diff --git a/gcc/testsuite/g++.dg/pr113673.C b/gcc/testsuite/g++.dg/pr113673.C new file mode 100644 index 000..11489777f5b --- /dev/null +++ b/gcc/testsuite/g++.dg/pr113673.C @@ -0,0 +1,14 @@ +/* { dg-do compile } */ +/* { dg-options "-Os -fnon-call-exceptions -ftrapv" } */ + +struct s { ~s(); }; +void +h (unsigned char *data, int c) +{ + s a1; + while (c) +{ + int m = *data++ << 8; + m += *data++; +} +}
[gcc r15-2758] i386: Refactor V2DI arithmetic right shift expansion for STV.
https://gcc.gnu.org/g:2f759fa9f4dd78ae8d86482ccda72a335aaac404 commit r15-2758-g2f759fa9f4dd78ae8d86482ccda72a335aaac404 Author: Roger Sayle Date: Tue Aug 6 17:19:29 2024 +0100 i386: Refactor V2DI arithmetic right shift expansion for STV. This patch refactors ashrv2di RTL expansion into a function so that it may be reused by a pre-reload splitter, such that DImode right shifts may be considered candidates during the Scalar-To-Vector (STV) pass. Currently DImode arithmetic right shifts are not considered potential candidates during STV, so for the following testcase: long long m; typedef long long v2di __attribute__((vector_size (16))); void foo(v2di x) { m = x[0]>>63; } We currently see the following warning/error during STV2 > r101 use in insn 7 isn't convertible And end up generating scalar code with an interunit move: foo:movq%xmm0, %rax sarq$63, %rax movq%rax, m(%rip) ret With this patch, we can reuse the RTL expansion logic and produce: foo:psrad $31, %xmm0 pshufd $245, %xmm0, %xmm0 movq%xmm0, m(%rip) ret Or with the addition of -mavx2, the equivalent: foo:vpxor %xmm1, %xmm1, %xmm1 vpcmpgtq%xmm0, %xmm1, %xmm0 vmovq %xmm0, m(%rip) ret The only design decision of note is the choice to continue lowering V2DI into vector sequences during RTL expansion, to enable combine to optimize things if possible. Using just define_insn_and_split potentially misses optimizations, such as reusing the zero vector produced by vpxor above. It may be necessary to tweak STV's compute gain at some point, but this patch controls what's possible (rather than what's beneficial). 2024-08-06 Roger Sayle gcc/ChangeLog * config/i386/i386-expand.cc (ix86_expand_v2di_ashiftrt): New function refactored from define_expand ashrv2di3. * config/i386/i386-features.cc (general_scalar_to_vector_candidate_p) : Handle like other shifts and rotates. * config/i386/i386-protos.h (ix86_expand_v2di_ashiftrt): Prototype. * config/i386/sse.md (ashrv2di3): Call ix86_expand_v2di_ashiftrt. 
(*ashrv2di3): New define_insn_and_split to enable creation by stv2 pass, and splitting during split1 reusing ix86_expand_v2di_ashiftrt. gcc/testsuite/ChangeLog * gcc.target/i386/sse2-stv-2.c: New test case. Diff: --- gcc/config/i386/i386-expand.cc | 156 gcc/config/i386/i386-features.cc | 6 +- gcc/config/i386/i386-protos.h | 1 + gcc/config/i386/sse.md | 159 +++-- gcc/testsuite/gcc.target/i386/sse2-stv-2.c | 10 ++ 5 files changed, 180 insertions(+), 152 deletions(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index d9ad06264aaf..bdbc14232679 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -7471,6 +7471,162 @@ ix86_expand_v1ti_ashiftrt (rtx operands[]) } } +/* Expand V2DI mode ashiftrt. */ +void +ix86_expand_v2di_ashiftrt (rtx operands[]) +{ + if (operands[2] == const0_rtx) +{ + emit_move_insn (operands[0], operands[1]); + return; +} + + if (TARGET_SSE4_2 + && CONST_INT_P (operands[2]) + && UINTVAL (operands[2]) >= 63 + && !optimize_insn_for_size_p ()) +{ + rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode)); + emit_insn (gen_sse4_2_gtv2di3 (operands[0], zero, operands[1])); + return; +} + + if (CONST_INT_P (operands[2]) + && (!TARGET_XOP || UINTVAL (operands[2]) >= 63)) +{ + vec_perm_builder sel (4, 4, 1); + sel.quick_grow (4); + rtx arg0, arg1; + rtx op1 = lowpart_subreg (V4SImode, + force_reg (V2DImode, operands[1]), + V2DImode); + rtx target = gen_reg_rtx (V4SImode); + if (UINTVAL (operands[2]) >= 63) + { + arg0 = arg1 = gen_reg_rtx (V4SImode); + emit_insn (gen_ashrv4si3 (arg0, op1, GEN_INT (31))); + sel[0] = 1; + sel[1] = 1; + sel[2] = 3; + sel[3] = 3; + } + else if (INTVAL (operands[2]) > 32) + { + arg0 = gen_reg_rtx (V4SImode); + arg1 = gen_reg_rtx (V4SImode); + emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31))); + emit_insn (gen_ashrv4si3 (arg0, op1, + GEN_INT (INTVAL (operands[2]) - 32))); + sel[0] = 1; + sel[1] = 5; + sel[2] = 3; + sel[3] = 7; + } + else if (INTVAL (operands[2]) == 32) 
+ { + arg0 = op1; + arg1 = gen_reg_rtx (V4SImode); + emit_insn (gen_ashrv4si3 (arg1,
[gcc r15-2793] testsuite: Fix recent regression of g++.dg/other/sse2-pr85572-1.C
https://gcc.gnu.org/g:990a65fb1aa5d1b05a7737df879afb6900e2ce96 commit r15-2793-g990a65fb1aa5d1b05a7737df879afb6900e2ce96 Author: Roger Sayle Date: Wed Aug 7 12:52:26 2024 +0100 testsuite: Fix recent regression of g++.dg/other/sse2-pr85572-1.C My sincere apologies for not noticing that g++.dg/other/sse2-pr85572-1.C was FAILing with my recent ashrv2di patch. I'm not sure how that happened. Many thanks to Andrew Pinski for alerting me, and confirming that the changes are harmless/beneficial. Sorry again for the inconvenience. 2024-08-07 Roger Sayle gcc/testsuite/ChangeLog * g++.dg/other/sse2-pr85572-1.C: Update expected output after my recent patch for ashrv2di3. Now with one less instruction. Diff: --- gcc/testsuite/g++.dg/other/sse2-pr85572-1.C | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gcc/testsuite/g++.dg/other/sse2-pr85572-1.C b/gcc/testsuite/g++.dg/other/sse2-pr85572-1.C index e4c442394243..46edc065c33c 100644 --- a/gcc/testsuite/g++.dg/other/sse2-pr85572-1.C +++ b/gcc/testsuite/g++.dg/other/sse2-pr85572-1.C @@ -1,9 +1,10 @@ // PR target/85572 // { dg-do compile { target i?86-*-* x86_64-*-* } } // { dg-options "-O2 -msse2 -mno-sse3" } -// { dg-final { scan-assembler-times {\mpxor\M} 2 } } -// { dg-final { scan-assembler-times {\mpsubq\M} 2 } } -// { dg-final { scan-assembler-times {\mpsrlq\M} 1 } } +// { dg-final { scan-assembler-times {\mpsrad\M} 1 } } +// { dg-final { scan-assembler-times {\mpshufd\M} 1 } } +// { dg-final { scan-assembler-times {\mpxor\M} 1 } } +// { dg-final { scan-assembler-times {\mpsubq\M} 1 } } typedef long long V __attribute__((vector_size (16)));
[gcc r15-2816] i386: Tweak ix86_mode_can_transfer_bits to restore bootstrap on RHEL.
https://gcc.gnu.org/g:4d44f3fc387815eb232d7757352857993a1d21d9 commit r15-2816-g4d44f3fc387815eb232d7757352857993a1d21d9 Author: Roger Sayle Date: Thu Aug 8 11:16:29 2024 +0100 i386: Tweak ix86_mode_can_transfer_bits to restore bootstrap on RHEL. This minor patch, very similar to one posted and approved previously at https://gcc.gnu.org/pipermail/gcc-patches/2024-July/657229.html is required to restore builds on systems using gcc 4.8 as a host compiler. Using the enumeration constants E_SFmode and E_DFmode avoids issues with SFmode and DFmode being "non-literal types in constant expressions". 2024-08-08 Roger Sayle gcc/ChangeLog * config/i386/i386.cc (ix86_mode_can_transfer_bits): Use E_?Fmode enumeration constants in switch statement. Diff: --- gcc/config/i386/i386.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 8f289b5bc228..02e282904410 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -26113,8 +26113,8 @@ ix86_mode_can_transfer_bits (machine_mode mode) || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT) switch (GET_MODE_INNER (mode)) { - case SFmode: - case DFmode: + case E_SFmode: + case E_DFmode: /* These suffer from normalization upon load when not using SSE. */ return !(ix86_fpmath & FPMATH_387); default:
[gcc r15-2880] PR target/116275: Handle STV of *extenddi2_doubleword_highpart on i386.
https://gcc.gnu.org/g:7a970bd03f1d8eed7703db8a8db3c753ea68899f commit r15-2880-g7a970bd03f1d8eed7703db8a8db3c753ea68899f Author: Roger Sayle Date: Mon Aug 12 06:52:48 2024 +0100 PR target/116275: Handle STV of *extenddi2_doubleword_highpart on i386. This patch resolves PR target/116275, a recent ICE-on-valid regression on -m32 caused by my recent change to enable STV of DImode arithmetic right shift on non-AVX512VL targets. The oversight is that the i386 backend contains an *extenddi2_doubleword_highpart instruction (whose pattern is an arithmetic right shift of a left shift) that optimizes the case where sign-extension need only update the highpart word of a DImode value when generating 32-bit code (!TARGET_64BIT). STV accepts this pattern as a candidate, as there are patterns to handle this form of extension on SSE using AVX512VL instructions (and previously ASHIFTRT was only allowed on AVX512VL). Now that ASHIFTRT is a candidate on non-AVX512VL targets, we either need to check that the first operand is a register, or as done below provide the define_insn_and_split that provides a non-AVX512VL implementation of *extendv2di_highpart_stv. The new testcase only ICEed with -m32, so this test could be limited to target ia32, but there's no harm also running this test on -m64 to provide a little extra test coverage. 2024-08-12 Roger Sayle gcc/ChangeLog PR target/116275 * config/i386/i386.md (*extendv2di2_highpart_stv_noavx512vl): New define_insn_and_split to handle the STV conversion of the DImode pattern *extendsi2_doubleword_highpart. gcc/testsuite/ChangeLog PR target/116275 * g++.target/i386/pr116275.C: New test case.
Diff: --- gcc/config/i386/i386.md | 18 ++ gcc/testsuite/g++.target/i386/pr116275.C | 15 +++ 2 files changed, 33 insertions(+) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index db7789c17d2a..1a6188f5161b 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -17393,6 +17393,24 @@ (ashift:V2DI (match_dup 1) (match_dup 2))) (set (match_dup 0) (ashiftrt:V2DI (match_dup 0) (match_dup 2)))]) + +;; Without AVX512VL, split this instruction before reload. +(define_insn_and_split "*extendv2di2_highpart_stv_noavx512vl" + [(set (match_operand:V2DI 0 "register_operand" "=v") + (ashiftrt:V2DI + (ashift:V2DI (match_operand:V2DI 1 "nonimmediate_operand" "vm") + (match_operand:QI 2 "const_int_operand")) + (match_operand:QI 3 "const_int_operand")))] + "!TARGET_AVX512VL + && INTVAL (operands[2]) == INTVAL (operands[3]) + && UINTVAL (operands[2]) < 32 + && ix86_pre_reload_split ()" + "#" + "&& 1" + [(set (match_dup 0) + (ashift:V2DI (match_dup 1) (match_dup 2))) + (set (match_dup 0) + (ashiftrt:V2DI (match_dup 0) (match_dup 2)))]) ;; Rotate instructions diff --git a/gcc/testsuite/g++.target/i386/pr116275.C b/gcc/testsuite/g++.target/i386/pr116275.C new file mode 100644 index ..69c5b5a2ef9f --- /dev/null +++ b/gcc/testsuite/g++.target/i386/pr116275.C @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx -std=c++11" } */ + +struct SymbolDesc push_back(SymbolDesc); +struct SymbolDesc { + long long ELFLocalSymIdx; +}; +struct Expected { + long long &operator*(); +}; +void SymbolizableObjectFileaddSymbol() { + Expected SymbolAddressOrErr; + long long SymbolAddress = *SymbolAddressOrErr << 8 >> 8; + push_back({SymbolAddress}); +}
[gcc r15-2940] i386: Improve split of *extendv2di2_highpart_stv_noavx512vl.
https://gcc.gnu.org/g:b6fb4f7f651d2aa89548c5833fe2679af2638df5 commit r15-2940-gb6fb4f7f651d2aa89548c5833fe2679af2638df5 Author: Roger Sayle Date: Thu Aug 15 22:02:05 2024 +0100 i386: Improve split of *extendv2di2_highpart_stv_noavx512vl. This patch follows up on the previous patch to fix PR target/116275 by improving the code STV (ultimately) generates for highpart sign extensions like (x<<8)>>8. The arithmetic right shift is able to take advantage of the available common subexpressions from the preceding left shift. Hence previously with -O2 -m32 -mavx -mno-avx512vl we'd generate: vpsllq $8, %xmm0, %xmm0 vpsrad $8, %xmm0, %xmm1 vpsrlq $8, %xmm0, %xmm0 vpblendw$51, %xmm0, %xmm1, %xmm0 But with improved splitting, we now generate three instructions: vpslld $8, %xmm1, %xmm0 vpsrad $8, %xmm0, %xmm0 vpblendw$51, %xmm1, %xmm0, %xmm0 This patch also implements Uros' suggestion that the pre-reload splitter could introduce a new pseudo to hold the intermediate to potentially help reload with register allocation, which applies when not performing the above optimization, i.e. on TARGET_XOP. 2024-08-15 Roger Sayle Uros Bizjak gcc/ChangeLog * config/i386/i386.md (*extendv2di2_highpart_stv_noavx512vl): Split to an improved implementation on !TARGET_XOP. On TARGET_XOP, use a new pseudo for the intermediate to simplify register allocation. gcc/testsuite/ChangeLog * g++.target/i386/pr116275-2.C: New test case. 
Diff: --- gcc/config/i386/i386.md| 32 -- gcc/testsuite/g++.target/i386/pr116275-2.C | 19 ++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index efbab2f25ec..36108e5c2c9 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -17872,10 +17872,38 @@ && ix86_pre_reload_split ()" "#" "&& 1" - [(set (match_dup 0) + [(set (match_dup 4) (ashift:V2DI (match_dup 1) (match_dup 2))) (set (match_dup 0) - (ashiftrt:V2DI (match_dup 0) (match_dup 2)))]) + (ashiftrt:V2DI (match_dup 4) (match_dup 2)))] +{ + if (!TARGET_XOP) +{ + rtx op0 = operands[0]; + rtx op2 = operands[2]; + rtx tmp1 = gen_reg_rtx (V4SImode); + rtx tmp2 = gen_reg_rtx (V4SImode); + rtx tmp3 = gen_reg_rtx (V4SImode); + rtx tmp4 = gen_reg_rtx (V4SImode); + emit_move_insn (tmp1, lowpart_subreg (V4SImode, operands[1], V2DImode)); + emit_insn (gen_ashlv4si3 (tmp2, tmp1, op2)); + emit_insn (gen_ashrv4si3 (tmp3, tmp2, op2)); + vec_perm_builder sel (4, 4, 1); + sel.quick_grow (4); + sel[0] = 0; + sel[1] = 5; + sel[2] = 2; + sel[3] = 7; + vec_perm_indices indices(sel, 2, 4); + bool ok = targetm.vectorize.vec_perm_const (V4SImode, V4SImode, tmp4, + tmp1, tmp3, indices); + gcc_assert (ok); + emit_move_insn (op0, lowpart_subreg (V2DImode, tmp4, V4SImode)); + DONE; +} + else +operands[4] = gen_reg_rtx (V2DImode); +}) ;; Rotate instructions diff --git a/gcc/testsuite/g++.target/i386/pr116275-2.C b/gcc/testsuite/g++.target/i386/pr116275-2.C new file mode 100644 index 000..98d3c19e59c --- /dev/null +++ b/gcc/testsuite/g++.target/i386/pr116275-2.C @@ -0,0 +1,19 @@ +/* { dg-do compile { target ia32 } } */ +/* { dg-options "-O2 -mavx -mno-avx512vl -std=c++11" } */ + +struct SymbolDesc push_back(SymbolDesc); +struct SymbolDesc { + long long ELFLocalSymIdx; +}; +struct Expected { + long long &operator*(); +}; +void SymbolizableObjectFileaddSymbol() { + Expected SymbolAddressOrErr; + long long SymbolAddress = *SymbolAddressOrErr << 8 >> 8; + 
push_back({SymbolAddress}); +} + +/* { dg-final { scan-assembler "vpslld" } } */ +/* { dg-final { scan-assembler-not "vpsllq" } } */ +/* { dg-final { scan-assembler-not "vpsrlq" } } */
[gcc r15-222] PR target/106060: Improved SSE vector constant materialization on x86.
https://gcc.gnu.org/g:79649a5dcd81bc05c0ba591068c9075de43bd417 commit r15-222-g79649a5dcd81bc05c0ba591068c9075de43bd417 Author: Roger Sayle Date: Tue May 7 07:14:40 2024 +0100 PR target/106060: Improved SSE vector constant materialization on x86. This patch resolves PR target/106060 by providing efficient methods for materializing/synthesizing special "vector" constants on x86. Currently there are three methods of materializing a vector constant; the most general is to load a vector from the constant pool, secondly "duplicated" constants can be synthesized by moving an integer between units and broadcasting (of shuffling it), and finally the special cases of the all-zeros vector and all-ones vectors can be loaded via a single SSE instruction. This patch handle additional cases that can be synthesized in two instructions, loading an all-ones vector followed by another SSE instruction. Following my recent patch for PR target/112992, there's conveniently a single place in i386-expand.cc where these special cases can be handled. Two examples are given in the original bugzilla PR for 106060. __m256i should_be_cmpeq_abs () { return _mm256_set1_epi8 (1); } is now generated (with -O3 -march=x86-64-v3) as: vpcmpeqd%ymm0, %ymm0, %ymm0 vpabsb %ymm0, %ymm0 ret and __m256i should_be_cmpeq_add () { return _mm256_set1_epi8 (-2); } is now generated as: vpcmpeqd%ymm0, %ymm0, %ymm0 vpaddb %ymm0, %ymm0, %ymm0 ret 2024-05-07 Roger Sayle Hongtao Liu gcc/ChangeLog PR target/106060 * config/i386/i386-expand.cc (enum ix86_vec_bcast_alg): New. (struct ix86_vec_bcast_map_simode_t): New type for table below. (ix86_vec_bcast_map_simode): Table of SImode constants that may be efficiently synthesized by a ix86_vec_bcast_alg method. (ix86_vec_bcast_map_simode_cmp): New comparator for bsearch. (ix86_vector_duplicate_simode_const): Efficiently synthesize V4SImode and V8SImode constants that duplicate special constants. 
(ix86_vector_duplicate_value): Attempt to synthesize "special" vector constants using ix86_vector_duplicate_simode_const. * config/i386/i386.cc (ix86_rtx_costs) : ABS of a vector integer mode costs with a single SSE instruction. gcc/testsuite/ChangeLog PR target/106060 * gcc.target/i386/auto-init-8.c: Update test case. * gcc.target/i386/avx512fp16-13.c: Likewise. * gcc.target/i386/pr100865-9a.c: Likewise. * gcc.target/i386/pr101796-1.c: Likewise. * gcc.target/i386/pr106060-1.c: New test case. * gcc.target/i386/pr106060-2.c: Likewise. * gcc.target/i386/pr106060-3.c: Likewise. * gcc.target/i386/pr70314.c: Update test case. * gcc.target/i386/vect-shiftv4qi.c: Likewise. * gcc.target/i386/vect-shiftv8qi.c: Likewise. Diff: --- gcc/config/i386/i386-expand.cc | 364 - gcc/config/i386/i386.cc| 2 + gcc/testsuite/gcc.target/i386/auto-init-8.c| 2 +- gcc/testsuite/gcc.target/i386/avx512fp16-13.c | 3 - gcc/testsuite/gcc.target/i386/pr100865-9a.c| 2 +- gcc/testsuite/gcc.target/i386/pr101796-1.c | 6 +- gcc/testsuite/gcc.target/i386/pr106060-1.c | 12 + gcc/testsuite/gcc.target/i386/pr106060-2.c | 13 + gcc/testsuite/gcc.target/i386/pr106060-3.c | 14 + gcc/testsuite/gcc.target/i386/pr70314.c| 2 +- gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c | 2 +- gcc/testsuite/gcc.target/i386/vect-shiftv8qi.c | 2 +- 12 files changed, 411 insertions(+), 13 deletions(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 8bb8f21e686..a6132911e6a 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -15696,6 +15696,332 @@ s4fma_expand: gcc_unreachable (); } +/* See below where shifts are handled for explanation of this enum. 
*/ +enum ix86_vec_bcast_alg +{ + VEC_BCAST_PXOR, + VEC_BCAST_PCMPEQ, + VEC_BCAST_PABSB, + VEC_BCAST_PADDB, + VEC_BCAST_PSRLW, + VEC_BCAST_PSRLD, + VEC_BCAST_PSLLW, + VEC_BCAST_PSLLD +}; + +struct ix86_vec_bcast_map_simode_t +{ + unsigned int key; + enum ix86_vec_bcast_alg alg; + unsigned int arg; +}; + +/* This table must be kept sorted as values are looked-up using bsearch. */ +static const ix86_vec_bcast_map_simode_t ix86_vec_bcast_map_simode[] = { + { 0x, VEC_BCAST_PXOR,0 }, + { 0x0001, VEC_BCAST_PSRLD, 31 }, + { 0x0003, VEC_BCAST_PSRLD, 30 }, + { 0x0007, VEC_BCAST_PSRLD, 29 }, + { 0x000f, VEC_BCAST_PSRLD, 28 }, + { 0x001f, VE
[gcc r15-352] Constant fold {-1,-1} << 1 in simplify-rtx.cc
https://gcc.gnu.org/g:f2449b55fb2d32fc4200667ba79847db31f6530d commit r15-352-gf2449b55fb2d32fc4200667ba79847db31f6530d Author: Roger Sayle Date: Thu May 9 22:45:54 2024 +0100 Constant fold {-1,-1} << 1 in simplify-rtx.cc This patch addresses a missed optimization opportunity in the RTL optimization passes. The function simplify_const_binary_operation will constant fold binary operators with two CONST_INT operands, and those with two CONST_VECTOR operands, but is missing compile-time evaluation of binary operators with a CONST_VECTOR and a CONST_INT, such as vector shifts and rotates. The first version of this patch didn't contain a switch statement to explicitly check for valid binary opcodes, which bootstrapped and regression tested fine, but my paranoia has got the better of me, so this version now checks that VEC_SELECT or some funky (future) rtx_code doesn't cause problems. 2024-05-09 Roger Sayle gcc/ChangeLog * simplify-rtx.cc (simplify_const_binary_operation): Constant fold binary operations where the LHS is CONST_VECTOR and the RHS is CONST_INT (or CONST_DOUBLE) such as vector shifts. 
Diff: --- gcc/simplify-rtx.cc | 54 + 1 file changed, 54 insertions(+) diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc index dceaa1ca..53f54d1d3928 100644 --- a/gcc/simplify-rtx.cc +++ b/gcc/simplify-rtx.cc @@ -5021,6 +5021,60 @@ simplify_const_binary_operation (enum rtx_code code, machine_mode mode, return gen_rtx_CONST_VECTOR (mode, v); } + if (VECTOR_MODE_P (mode) + && GET_CODE (op0) == CONST_VECTOR + && (CONST_SCALAR_INT_P (op1) || CONST_DOUBLE_AS_FLOAT_P (op1)) + && (CONST_VECTOR_DUPLICATE_P (op0) + || CONST_VECTOR_NUNITS (op0).is_constant ())) +{ + switch (code) + { + case PLUS: + case MINUS: + case MULT: + case DIV: + case MOD: + case UDIV: + case UMOD: + case AND: + case IOR: + case XOR: + case SMIN: + case SMAX: + case UMIN: + case UMAX: + case LSHIFTRT: + case ASHIFTRT: + case ASHIFT: + case ROTATE: + case ROTATERT: + case SS_PLUS: + case US_PLUS: + case SS_MINUS: + case US_MINUS: + case SS_ASHIFT: + case US_ASHIFT: + case COPYSIGN: + break; + default: + return NULL_RTX; + } + + unsigned int npatterns = (CONST_VECTOR_DUPLICATE_P (op0) + ? CONST_VECTOR_NPATTERNS (op0) + : CONST_VECTOR_NUNITS (op0).to_constant ()); + rtx_vector_builder builder (mode, npatterns, 1); + for (unsigned i = 0; i < npatterns; i++) + { + rtx x = simplify_binary_operation (code, GET_MODE_INNER (mode), +CONST_VECTOR_ELT (op0, i), op1); + if (!x || !valid_for_const_vector_p (mode, x)) + return 0; + builder.quick_push (x); + } + return builder.build (); +} + if (SCALAR_FLOAT_MODE_P (mode) && CONST_DOUBLE_AS_FLOAT_P (op0) && CONST_DOUBLE_AS_FLOAT_P (op1)
[gcc r15-366] i386: Improve V[48]QI shifts on AVX512/SSE4.1
https://gcc.gnu.org/g:f5a8cdc1ef5d6aa2de60849c23658ac5298df7bb commit r15-366-gf5a8cdc1ef5d6aa2de60849c23658ac5298df7bb Author: Roger Sayle Date: Fri May 10 20:26:40 2024 +0100 i386: Improve V[48]QI shifts on AVX512/SSE4.1 The following one line patch improves the code generated for V8QI and V4QI shifts when AVX512BW and AVX512VL functionality is available. For the testcase (from gcc.target/i386/vect-shiftv8qi.c): typedef signed char v8qi __attribute__ ((__vector_size__ (8))); v8qi foo (v8qi x) { return x >> 5; } GCC with -O2 -march=cascadelake currently generates: foo:movl$67372036, %eax vpsraw $5, %xmm0, %xmm2 vpbroadcastd%eax, %xmm1 movl$117901063, %eax vpbroadcastd%eax, %xmm3 vmovdqa %xmm1, %xmm0 vmovdqa %xmm3, -24(%rsp) vpternlogd $120, -24(%rsp), %xmm2, %xmm0 vpsubb %xmm1, %xmm0, %xmm0 ret with this patch we now generate the much improved: foo:vpmovsxbw %xmm0, %xmm0 vpsraw $5, %xmm0, %xmm0 vpmovwb %xmm0, %xmm0 ret This patch also fixes the FAILs of gcc.target/i386/vect-shiftv[48]qi.c when run with the additional -march=cascadelake flag, by splitting these tests into two; one form testing code generation with -msse2 (and -mno-avx512vl) as originally intended, and the other testing AVX512 code generation with an explicit -march=cascadelake. 2024-05-10 Roger Sayle Hongtao Liu gcc/ChangeLog * config/i386/i386-expand.cc (ix86_expand_vecop_qihi_partial): Don't attempt ix86_expand_vec_shift_qihi_constant on SSE4.1. gcc/testsuite/ChangeLog * gcc.target/i386/vect-shiftv4qi.c: Specify -mno-avx512vl. * gcc.target/i386/vect-shiftv8qi.c: Likewise. * gcc.target/i386/vect-shiftv4qi-2.c: New test case. * gcc.target/i386/vect-shiftv8qi-2.c: Likewise. 
Diff: --- gcc/config/i386/i386-expand.cc | 3 ++ gcc/testsuite/gcc.target/i386/vect-shiftv4qi-2.c | 43 gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c | 2 +- gcc/testsuite/gcc.target/i386/vect-shiftv8qi-2.c | 43 gcc/testsuite/gcc.target/i386/vect-shiftv8qi.c | 2 +- 5 files changed, 91 insertions(+), 2 deletions(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 2f27bfb484c2..1ab22fe79736 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -24283,6 +24283,9 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2) if (CONST_INT_P (op2) && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT) + /* With AVX512 it's cheaper to do vpmovsxbw/op/vpmovwb. + Even with SSE4.1 the alternative is better. */ + && !TARGET_SSE4_1 && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2)) { emit_move_insn (dest, gen_lowpart (qimode, qdest)); diff --git a/gcc/testsuite/gcc.target/i386/vect-shiftv4qi-2.c b/gcc/testsuite/gcc.target/i386/vect-shiftv4qi-2.c new file mode 100644 index ..abc1a276b043 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-shiftv4qi-2.c @@ -0,0 +1,43 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=cascadelake" } */ + +#define N 4 + +typedef unsigned char __vu __attribute__ ((__vector_size__ (N))); +typedef signed char __vi __attribute__ ((__vector_size__ (N))); + +__vu sll (__vu a, int n) +{ + return a << n; +} + +__vu sll_c (__vu a) +{ + return a << 5; +} + +/* { dg-final { scan-assembler-times "vpsllw" 2 } } */ + +__vu srl (__vu a, int n) +{ + return a >> n; +} + +__vu srl_c (__vu a) +{ + return a >> 5; +} + +/* { dg-final { scan-assembler-times "vpsrlw" 2 } } */ + +__vi sra (__vi a, int n) +{ + return a >> n; +} + +__vi sra_c (__vi a) +{ + return a >> 5; +} + +/* { dg-final { scan-assembler-times "vpsraw" 2 } } */ diff --git a/gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c b/gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c index 
b7e45c2e8799..9b52582d01f8 100644 --- a/gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c +++ b/gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -msse2" } */ +/* { dg-options "-O2 -msse2 -mno-avx2 -mno-avx512vl" } */ #define N 4 diff --git a/gcc/testsuite/gcc.target/i386/vect-shiftv8qi-2.c b/gcc/testsuite/gcc.target/i386/vect-shiftv8qi-2.c new file mode 100644 index ..52760f5a0607 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-shiftv8qi-2.c @@ -0,0 +1,43 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2 -march=cascadelake" } */ + +#define N 8 + +typedef unsigned char __vu __attribute__ ((__vector_size__ (N))); +typedef signed char __vi __attribute__ ((__vector_size__ (N))); +
[gcc r15-390] arm: Use utxb rN, rM, ror #8 to implement zero_extract on armv6.
https://gcc.gnu.org/g:46077992180d6d86c86544df5e8cb943492d3b01 commit r15-390-g46077992180d6d86c86544df5e8cb943492d3b01 Author: Roger Sayle Date: Sun May 12 16:27:22 2024 +0100 arm: Use utxb rN, rM, ror #8 to implement zero_extract on armv6. Examining the code generated for the following C snippet on a raspberry pi: int popcount_lut8(unsigned *buf, int n) { int cnt=0; unsigned int i; do { i = *buf; cnt += lut[i&255]; cnt += lut[i>>8&255]; cnt += lut[i>>16&255]; cnt += lut[i>>24]; buf++; } while(--n); return cnt; } I was surprised to see following instruction sequence generated by the compiler: movr5, r2, lsr #8 uxtb r5, r5 This sequence can be performed by a single ARM instruction: uxtb r5, r2, ror #8 The attached patch allows GCC's combine pass to take advantage of ARM's uxtb with rotate functionality to implement the above zero_extract, and likewise to use the sxtb with rotate to implement sign_extract. ARM's uxtb and sxtb can only be used with rotates of 0, 8, 16 and 24, and of these only the 8 and 16 are useful [ror #0 is a nop, and extends with ror #24 can be implemented using regular shifts], so the approach here is to add the six missing but useful instructions as 6 different define_insn in arm.md, rather than try to be clever with new predicates. Later ARM hardware has advanced bit field instructions, and earlier ARM cores didn't support extend-with-rotate, so this appears to only benefit armv6 era CPUs (e.g. the raspberry pi). Patch posted: https://gcc.gnu.org/legacy-ml/gcc-patches/2018-01/msg01339.html Approved by Kyrill Tkachov: https://gcc.gnu.org/legacy-ml/gcc-patches/2018-01/msg01881.html 2024-05-12 Roger Sayle Kyrill Tkachov * config/arm/arm.md (*arm_zeroextractsi2_8_8, *arm_signextractsi2_8_8, *arm_zeroextractsi2_8_16, *arm_signextractsi2_8_16, *arm_zeroextractsi2_16_8, *arm_signextractsi2_16_8): New. 2024-05-12 Roger Sayle Kyrill Tkachov * gcc.target/arm/extend-ror.c: New test. 
Diff: --- gcc/config/arm/arm.md | 66 +++ gcc/testsuite/gcc.target/arm/extend-ror.c | 38 ++ 2 files changed, 104 insertions(+) diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md index 1fd00146ca9e..f47e036a8034 100644 --- a/gcc/config/arm/arm.md +++ b/gcc/config/arm/arm.md @@ -12647,6 +12647,72 @@ "" ) +;; Implement zero_extract using uxtb/uxth instruction with +;; the ror #N qualifier when applicable. + +(define_insn "*arm_zeroextractsi2_8_8" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (zero_extract:SI (match_operand:SI 1 "s_register_operand" "r") +(const_int 8) (const_int 8)))] + "TARGET_ARM && arm_arch6" + "uxtb%?\\t%0, %1, ror #8" + [(set_attr "predicable" "yes") + (set_attr "type" "extend")] +) + +(define_insn "*arm_zeroextractsi2_8_16" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (zero_extract:SI (match_operand:SI 1 "s_register_operand" "r") +(const_int 8) (const_int 16)))] + "TARGET_ARM && arm_arch6" + "uxtb%?\\t%0, %1, ror #16" + [(set_attr "predicable" "yes") + (set_attr "type" "extend")] +) + +(define_insn "*arm_zeroextractsi2_16_8" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (zero_extract:SI (match_operand:SI 1 "s_register_operand" "r") +(const_int 16) (const_int 8)))] + "TARGET_ARM && arm_arch6" + "uxth%?\\t%0, %1, ror #8" + [(set_attr "predicable" "yes") + (set_attr "type" "extend")] +) + +;; Implement sign_extract using sxtb/sxth instruction with +;; the ror #N qualifier when applicable. 
+ +(define_insn "*arm_signextractsi2_8_8" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (sign_extract:SI (match_operand:SI 1 "s_register_operand" "r") +(const_int 8) (const_int 8)))] + "TARGET_ARM && arm_arch6" + "sxtb%?\\t%0, %1, ror #8" + [(set_attr "predicable" "yes") + (set_attr "type" "extend")] +) + +(define_insn "*arm_signextractsi2_8_16" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (sign_extract:SI (match_operand:SI 1 "s_register_operand" "r") +(const_int 8) (const_int 16)))] + "TARGET_ARM && arm_arch6" + "sxtb%?\\t%0, %1, ror #16" + [(set_attr "predicable" "yes") + (set_attr "type" "extend")] +) + +(define_insn "*arm_signextractsi2_16_8" + [(set (match_operand:SI 0 "s_register_operand" "=r") + (sign_extract:SI (match_operand:SI 1 "s_register_operand" "r") +(const_int 16) (const_int 8)))] + "TARGET_ARM && arm_arch6" + "sxth%?\\t%0, %1, ror #8" + [(set_attr "predicable" "yes") + (set_attr "type"
[gcc r15-648] nvptx: Correct pattern for popcountdi2 insn in nvptx.md.
https://gcc.gnu.org/g:1676ef6e91b902f592270e4bcf10b4fc342e200d commit r15-648-g1676ef6e91b902f592270e4bcf10b4fc342e200d Author: Roger Sayle Date: Sun May 19 09:49:45 2024 +0100 nvptx: Correct pattern for popcountdi2 insn in nvptx.md. The result of a POPCOUNT operation in RTL should have the same mode as its operand. This corrects the specification of popcount in the nvptx backend, splitting the current generic define_insn into two, one for popcountsi2 and the other for popcountdi2 (the latter with an explicit truncate). 2024-05-19 Roger Sayle gcc/ChangeLog * config/nvptx/nvptx.md (popcount<mode>2): Split into... (popcountsi2): define_insn handling SImode popcount. (popcountdi2): define_insn handling DImode popcount, with an explicit truncate:SI to produce an SImode result. Diff: --- gcc/config/nvptx/nvptx.md | 13 ++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md index 96e6c9116080..ef7e3fb00fac 100644 --- a/gcc/config/nvptx/nvptx.md +++ b/gcc/config/nvptx/nvptx.md @@ -655,11 +655,18 @@ DONE; }) -(define_insn "popcount<mode>2" +(define_insn "popcountsi2" [(set (match_operand:SI 0 "nvptx_register_operand" "=R") - (popcount:SI (match_operand:SDIM 1 "nvptx_register_operand" "R")))] + (popcount:SI (match_operand:SI 1 "nvptx_register_operand" "R")))] "" - "%.\\tpopc.b%T1\\t%0, %1;") + "%.\\tpopc.b32\\t%0, %1;") + +(define_insn "popcountdi2" + [(set (match_operand:SI 0 "nvptx_register_operand" "=R") + (truncate:SI + (popcount:DI (match_operand:DI 1 "nvptx_register_operand" "R"] + "" + "%.\\tpopc.b64\\t%0, %1;") ;; Multiplication variants
[gcc r15-3162] i386: Update STV's gains for TImode arithmetic right shifts on AVX2.
https://gcc.gnu.org/g:07d62a1711f3e3bbdd2146ab5914d3bc5e246509 commit r15-3162-g07d62a1711f3e3bbdd2146ab5914d3bc5e246509 Author: Roger Sayle Date: Sun Aug 25 09:14:34 2024 -0600 i386: Update STV's gains for TImode arithmetic right shifts on AVX2. This patch tweaks timode_scalar_chain::compute_convert_gain to better reflect the expansion of V1TImode arithmetic right shifts by the i386 backend. The comment "see ix86_expand_v1ti_ashiftrt" appears after "case ASHIFTRT" in compute_convert_gain, and the changes below attempt to better match the logic used there. The original motivating example is: __int128 m1; void foo() { m1 = (m1 << 8) >> 8; } which with -O2 -mavx2 we fail to convert to vector form due to the inappropriate cost of the arithmetic right shift. Instruction gain -16 for 7: {r103:TI=r101:TI>>0x8;clobber flags:CC;} Total gain: -3 Chain #1 conversion is not profitable This is reporting that the ASHIFTRT is four instructions worse using vectors than in scalar form, which is incorrect as the AVX2 expansion of this shift only requires three instructions (and the scalar form requires two). With more accurate costs in timode_scalar_chain::compute_convert_gain we now see (with -O2 -mavx2): Instruction gain -4 for 7: {r103:TI=r101:TI>>0x8;clobber flags:CC;} Total gain: 9 Converting chain #1... which results in: foo:vmovdqa m1(%rip), %xmm0 vpslldq $1, %xmm0, %xmm0 vpsrad $8, %xmm0, %xmm1 vpsrldq $1, %xmm0, %xmm0 vpblendd$7, %xmm0, %xmm1, %xmm0 vmovdqa %xmm0, m1(%rip) ret 2024-08-25 Roger Sayle Uros Bizjak gcc/ChangeLog * config/i386/i386-features.cc (compute_convert_gain) : Update to match ix86_expand_v1ti_ashiftrt. 
Diff: --- gcc/config/i386/i386-features.cc | 21 + 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index 7e80e7b0103f..ca902ecf0de5 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -1650,23 +1650,28 @@ timode_scalar_chain::compute_convert_gain () else if (op1val == 64) vcost = COSTS_N_INSNS (3); else if (op1val == 96) - vcost = COSTS_N_INSNS (4); - else if (op1val >= 111) vcost = COSTS_N_INSNS (3); - else if (TARGET_AVX2 && op1val == 32) + else if (op1val >= 111) vcost = COSTS_N_INSNS (3); else if (TARGET_SSE4_1 && op1val == 32) - vcost = COSTS_N_INSNS (4); + vcost = COSTS_N_INSNS (3); + else if (TARGET_SSE4_1 + && (op1val == 8 || op1val == 16 || op1val == 24)) + vcost = COSTS_N_INSNS (3); else if (op1val >= 96) - vcost = COSTS_N_INSNS (5); + vcost = COSTS_N_INSNS (4); + else if (TARGET_SSE4_1 && (op1val == 28 || op1val == 80)) + vcost = COSTS_N_INSNS (4); else if ((op1val & 7) == 0) - vcost = COSTS_N_INSNS (6); + vcost = COSTS_N_INSNS (5); else if (TARGET_AVX2 && op1val < 32) vcost = COSTS_N_INSNS (6); + else if (TARGET_SSE4_1 && op1val < 15) + vcost = COSTS_N_INSNS (6); else if (op1val == 1 || op1val >= 64) - vcost = COSTS_N_INSNS (9); + vcost = COSTS_N_INSNS (8); else - vcost = COSTS_N_INSNS (10); + vcost = COSTS_N_INSNS (9); } igain = scost - vcost; break;
[gcc r15-3281] i386: Support wide immediate constants in STV.
https://gcc.gnu.org/g:3cb92be94e6581697369eeafdb67057c8cfba73f commit r15-3281-g3cb92be94e6581697369eeafdb67057c8cfba73f Author: Roger Sayle Date: Wed Aug 28 21:19:28 2024 -0600 i386: Support wide immediate constants in STV. This patch provides more accurate costs/gains for (wide) immediate constants in STV, suitably adjusting the costs/gains when the highpart and lowpart words are the same. 2024-08-28 Roger Sayle gcc/ChangeLog * config/i386/i386-features.cc (timode_immed_const_gain): New function to determine the gain/cost on a CONST_WIDE_INT. (timode_scalar_chain::compute_convert_gain): Fix whitespace. : Provide more accurate estimates using timode_immed_const_gain. : Handle CONSTANT_SCALAR_INT_P (src). Diff: --- gcc/config/i386/i386-features.cc | 28 +++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index ca902ecf0de5..c09a5c73a8e3 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -1503,6 +1503,23 @@ general_scalar_chain::convert_insn (rtx_insn *insn) df_insn_rescan (insn); } +/* Helper function to compute gain for loading an immediate constant. + Typically, two movabsq for TImode vs. vmovdqa for V1TImode, but + with numerous special cases. */ + +static int +timode_immed_const_gain (rtx cst) +{ + /* movabsq vs. movabsq+vmovq+vunpacklqdq. */ + if (CONST_WIDE_INT_P (cst) + && CONST_WIDE_INT_NUNITS (cst) == 2 + && CONST_WIDE_INT_ELT (cst, 0) == CONST_WIDE_INT_ELT (cst, 1)) +return optimize_insn_for_size_p () ? -COSTS_N_BYTES (9) + : -COSTS_N_INSNS (2); + /* 2x movabsq ~ vmovdqa. */ + return 0; +} + /* Compute a gain for chain conversion. */ int @@ -1549,7 +1566,14 @@ timode_scalar_chain::compute_convert_gain () case CONST_INT: if (MEM_P (dst) && standard_sse_constant_p (src, V1TImode)) - igain = optimize_insn_for_size_p() ? COSTS_N_BYTES (11) : 1; + igain = optimize_insn_for_size_p () ? 
COSTS_N_BYTES (11) : 1; + break; + + case CONST_WIDE_INT: + /* 2 x mov vs. vmovdqa. */ + if (MEM_P (dst)) + igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (3) + : COSTS_N_INSNS (1); break; case NOT: @@ -1562,6 +1586,8 @@ timode_scalar_chain::compute_convert_gain () case IOR: if (!MEM_P (dst)) igain = COSTS_N_INSNS (1); + if (CONST_SCALAR_INT_P (XEXP (src, 1))) + igain += timode_immed_const_gain (XEXP (src, 1)); break; case ASHIFT:
[gcc r15-3342] i386: Support read-modify-write memory operands in STV.
https://gcc.gnu.org/g:bac00c34226bac3a95979b21dc2d668a96b14f6e commit r15-3342-gbac00c34226bac3a95979b21dc2d668a96b14f6e Author: Roger Sayle Date: Sat Aug 31 14:17:18 2024 -0600 i386: Support read-modify-write memory operands in STV. This patch enables STV when the first operand of a TImode binary logic operand (AND, IOR or XOR) is a memory operand, which is commonly the case with read-modify-write instructions. A different motivating example from the one given previously is: __int128 m, p, q; void foo() { m ^= (p & q); } Currently with -O2 -mavx the RMW instructions are rejected by STV, resulting in scalar code: foo:movqp(%rip), %rax movqp+8(%rip), %rdx andqq(%rip), %rax andqq+8(%rip), %rdx xorq%rax, m(%rip) xorq%rdx, m+8(%rip) ret With this patch they become scalar-to-vector candidates: foo:vmovdqa p(%rip), %xmm0 vpand q(%rip), %xmm0, %xmm0 vpxor m(%rip), %xmm0, %xmm0 vmovdqa %xmm0, m(%rip) ret 2024-08-31 Roger Sayle gcc/ChangeLog * config/i386/i386-features.cc (timode_scalar_to_vector_candidate_p): Support the first operand of AND, IOR and XOR being MEM_P, i.e. a read-modify-write insn. gcc/testsuite/ChangeLog * gcc.target/i386/movti-2.c: Change dg-options to -Os. * gcc.target/i386/movti-4.c: Expected output of original movti-2.c. 
Diff: --- gcc/config/i386/i386-features.cc| 6 -- gcc/testsuite/gcc.target/i386/movti-2.c | 2 +- gcc/testsuite/gcc.target/i386/movti-4.c | 11 +++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index c09a5c73a8e3..3434d0069439 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -2330,14 +2330,16 @@ timode_scalar_to_vector_candidate_p (rtx_insn *insn) || CONST_SCALAR_INT_P (XEXP (src, 1)) || timode_mem_p (XEXP (src, 1 return true; - return REG_P (XEXP (src, 0)) + return (REG_P (XEXP (src, 0)) + || timode_mem_p (XEXP (src, 0))) && (REG_P (XEXP (src, 1)) || CONST_SCALAR_INT_P (XEXP (src, 1)) || timode_mem_p (XEXP (src, 1))); case IOR: case XOR: - return REG_P (XEXP (src, 0)) + return (REG_P (XEXP (src, 0)) + || timode_mem_p (XEXP (src, 0))) && (REG_P (XEXP (src, 1)) || CONST_SCALAR_INT_P (XEXP (src, 1)) || timode_mem_p (XEXP (src, 1))); diff --git a/gcc/testsuite/gcc.target/i386/movti-2.c b/gcc/testsuite/gcc.target/i386/movti-2.c index 73f69d290cbd..c3a6ae3c51de 100644 --- a/gcc/testsuite/gcc.target/i386/movti-2.c +++ b/gcc/testsuite/gcc.target/i386/movti-2.c @@ -1,5 +1,5 @@ /* { dg-do compile { target int128 } } */ -/* { dg-options "-O2 -mavx" } */ +/* { dg-options "-Os -mavx" } */ __int128 m; void foo() diff --git a/gcc/testsuite/gcc.target/i386/movti-4.c b/gcc/testsuite/gcc.target/i386/movti-4.c new file mode 100644 index ..eac66fcbf3d1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/movti-4.c @@ -0,0 +1,11 @@ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2 -mavx" } */ +__int128 m; + +void foo() +{ +m &= ((__int128)0x0123456789abcdefULL<<64) | 0x0123456789abcdefULL; +} + +/* { dg-final { scan-assembler-times "movabsq" 1 } } */ +/* { dg-final { scan-assembler-times "vpand" 1 } } */