[gcc r15-1701] i386: Some additional AVX512 ternlog refinements.

2024-06-27 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:5938cf021e95b40b040974c9cbe7860399247f7f

commit r15-1701-g5938cf021e95b40b040974c9cbe7860399247f7f
Author: Roger Sayle 
Date:   Fri Jun 28 07:12:53 2024 +0100

i386: Some additional AVX512 ternlog refinements.

This patch is another round of refinements to fine tune the new ternlog
infrastructure in i386's sse.md.  This patch tweaks ix86_ternlog_idx
to allow multiple MEM/CONST_VECTOR/VEC_DUPLICATE operands prior to
splitting (before reload), when force_reg is called on all but
one of these operands.  Conceptually during the dynamic programming,
registers fill the args slots in the order 0, 1, 2, and mem-like
operands fill the slots in the order 2, 0, 1 [preferring the memory
operand to come last].
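
    To illustrate (a sketch for exposition, not code from the patch), each
    arg slot has a fixed truth-table mask, and a ones-complement operand
    simply uses the complemented mask:

    #include <stdio.h>

    int main (void)
    {
      /* Slot masks for args[0], args[1] and args[2].  */
      unsigned char mask[3] = { 0xf0, 0xcc, 0xaa };
      for (int i = 0; i < 3; i++)
        printf ("args[%d]: 0x%02x  ~args[%d]: 0x%02x\n",
                i, mask[i], i, (unsigned char) ~mask[i]);
      /* Prints 0xf0/0x0f, 0xcc/0x33 and 0xaa/0x55, matching the values
         returned by ix86_ternlog_idx in the diff below.  */
      return 0;
    }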

This patch allows us to remove some of the legacy ternlog patterns
in sse.md without regressions [which is left to the next and final
patch in this series].  An indication that these patterns are no
longer required is shown by the necessary testsuite tweaks below,
where the output assembler for the legacy instructions used hexadecimal,
but the new ternlog infrastructure now consistently uses decimal.

2024-06-28  Roger Sayle  

gcc/ChangeLog
* config/i386/i386-expand.cc (ix86_ternlog_idx) <case VEC_DUPLICATE>:
Add a "goto do_mem_operand" as this need not match memory_operand.
<case MEM>: Only args[2] may be a volatile memory operand.
Allow MEM/VEC_DUPLICATE/CONST_VECTOR as args[0] and args[1].

gcc/testsuite/ChangeLog
* gcc.target/i386/avx512f-andn-di-zmm-2.c: Match decimal instead
of hexadecimal immediate operand to ternlog.
* gcc.target/i386/avx512f-andn-si-zmm-2.c: Likewise.
* gcc.target/i386/avx512f-orn-si-zmm-1.c: Likewise.
* gcc.target/i386/avx512f-orn-si-zmm-2.c: Likewise.
* gcc.target/i386/pr100711-3.c: Likewise.
* gcc.target/i386/pr100711-4.c: Likewise.
* gcc.target/i386/pr100711-5.c: Likewise.

Diff:
---
 gcc/config/i386/i386-expand.cc | 35 ++++++++++++++++++++++++++++++++---
 .../gcc.target/i386/avx512f-andn-di-zmm-2.c|  2 +-
 .../gcc.target/i386/avx512f-andn-si-zmm-2.c|  2 +-
 .../gcc.target/i386/avx512f-orn-si-zmm-1.c |  2 +-
 .../gcc.target/i386/avx512f-orn-si-zmm-2.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr100711-3.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr100711-4.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr100711-5.c |  2 +-
 8 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index eccad080f7c..dd2c3a8718e 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -25606,7 +25606,7 @@ ix86_ternlog_idx (rtx op, rtx *args)
 case VEC_DUPLICATE:
   if (!bcst_mem_operand (op, GET_MODE (op)))
return -1;
-  /* FALLTHRU */
+  goto do_mem_operand;
 
 case MEM:
   if (!memory_operand (op, GET_MODE (op)))
@@ -25618,23 +25618,52 @@ ix86_ternlog_idx (rtx op, rtx *args)
   /* FALLTHRU */
 
 case CONST_VECTOR:
+do_mem_operand:
   if (!args[2])
{
  args[2] = op;
  return 0xaa;
}
   /* Maximum of one volatile memory reference per expression.  */
-  if (side_effects_p (op) && side_effects_p (args[2]))
+  if (side_effects_p (op))
return -1;
   if (rtx_equal_p (op, args[2]))
return 0xaa;
-  /* Check if one CONST_VECTOR is the ones-complement of the other.  */
+  /* Check if CONST_VECTOR is the ones-complement of args[2].  */
   if (GET_CODE (op) == CONST_VECTOR
  && GET_CODE (args[2]) == CONST_VECTOR
  && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op),
  op, GET_MODE (op)),
  args[2]))
return 0x55;
+  if (!args[0])
+   {
+ args[0] = op;
+ return 0xf0;
+   }
+  if (rtx_equal_p (op, args[0]))
+   return 0xf0;
+  /* Check if CONST_VECTOR is the ones-complement of args[0].  */
+  if (GET_CODE (op) == CONST_VECTOR
+ && GET_CODE (args[0]) == CONST_VECTOR
+ && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op),
+ op, GET_MODE (op)),
+ args[0]))
+   return 0x0f;
+  if (!args[1])
+   {
+ args[1] = op;
+ return 0xcc;
+   }
+  if (rtx_equal_p (op, args[1]))
+   return 0xcc;
+  /* Check if CONST_VECTOR is the ones-complement of args[1].  */
+  if (GET_CODE (op) == CONST_VECTOR
+ && GET_CODE (args[1]) == CONST_VECTOR
+ && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op),
+ op, GET_MODE (op)),
+			args[1]))
+	return 0x33;

[gcc r15-1702] i386: Handle sign_extend like zero_extend in *concatditi3_[346]

2024-06-27 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:07e915913b6b3d4e6e210f6dbc8e7e0e8ea594c4

commit r15-1702-g07e915913b6b3d4e6e210f6dbc8e7e0e8ea594c4
Author: Roger Sayle 
Date:   Fri Jun 28 07:16:07 2024 +0100

i386: Handle sign_extend like zero_extend in *concatditi3_[346]

This patch generalizes some of the patterns in i386.md that recognize
double word concatenation, so they handle sign_extend the same way that
they handle zero_extend in appropriate contexts.

As a motivating example consider the following function:

__int128 foo(long long x, unsigned long long y)
{
  return ((__int128)x<<64) | y;
}

when compiled with -O2, x86_64 currently generates:

foo:	movq	%rdi, %rdx
	xorl	%eax, %eax
	xorl	%edi, %edi
	orq	%rsi, %rax
	orq	%rdi, %rdx
	ret

with this patch we now generate (the same as if x is unsigned):

foo:	movq	%rsi, %rax
	movq	%rdi, %rdx
	ret

Treating both extensions the same way using any_extend is valid as
the top (extended) bits are "unused" after the shift by 64 (or more).
In theory, the RTL optimizers might consider canonicalizing the form
of extension used in these cases, but zero_extend is faster on some
machines, whereas sign extension is supported via addressing modes on
others, so handling both in the machine description is probably best.
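
A quick way to convince oneself (an illustrative check, not part of the
patch; requires a 64-bit target with __int128):

#include <assert.h>

int main (void)
{
  long long x = -42;
  /* The bits above 64 are discarded by the shift, so sign- and
     zero-extending x beforehand yield the same 128-bit value.  */
  unsigned __int128 sext = (unsigned __int128) (__int128) x << 64;
  unsigned __int128 zext = (unsigned __int128) (unsigned long long) x << 64;
  assert (sext == zext);
  return 0;
}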

2024-06-28  Roger Sayle  

gcc/ChangeLog
* config/i386/i386.md (*concat<mode>3_3): Change zero_extend
to any_extend in first operand to left shift by mode precision.
(*concat<mode>3_4): Likewise.
(*concat<mode>3_6): Likewise.

gcc/testsuite/ChangeLog
* gcc.target/i386/concatditi-1.c: New test case.

Diff:
---
 gcc/config/i386/i386.md  |  6 +++---
 gcc/testsuite/gcc.target/i386/concatditi-1.c | 10 ++
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index fd48e764469..b6ccb1e798d 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -13446,7 +13446,7 @@
  [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=ro,r,r,&r,x")
	(any_or_plus:<DWI>
	  (ashift:<DWI>
-	    (zero_extend:<DWI>
+	    (any_extend:<DWI>
	      (match_operand:DWIH 1 "nonimmediate_operand" "r,m,r,m,x"))
	    (match_operand:QI 2 "const_int_operand"))
	  (zero_extend:<DWI>
@@ -13473,7 +13473,7 @@
	  (zero_extend:<DWI>
	    (match_operand:DWIH 1 "nonimmediate_operand" "r,m,r,m"))
	  (ashift:<DWI>
-	    (zero_extend:<DWI>
+	    (any_extend:<DWI>
	      (match_operand:DWIH 2 "nonimmediate_operand" "r,r,m,m"))
	    (match_operand:QI 3 "const_int_operand"))))]
   "INTVAL (operands[3]) == <MODE_SIZE> * BITS_PER_UNIT"
@@ -13520,7 +13520,7 @@
   [(set (match_operand:<DWI> 0 "nonimmediate_operand" "=r,o,o,r")
	(any_or_plus:<DWI>
	  (ashift:<DWI>
-	    (zero_extend:<DWI>
+	    (any_extend:<DWI>
	      (match_operand:DWIH 1 "nonimmediate_operand" "r,r,r,m"))
	    (match_operand:QI 2 "const_int_operand"))
	  (match_operand:<DWI> 3 "const_scalar_int_operand" "n,n,Wd,n")))]
diff --git a/gcc/testsuite/gcc.target/i386/concatditi-1.c b/gcc/testsuite/gcc.target/i386/concatditi-1.c
new file mode 100644
index 000..25c2a95586b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/concatditi-1.c
@@ -0,0 +1,10 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2" } */
+
+__int128 foo(long long x, unsigned long long y)
+{
+  return ((__int128)x<<64) | y;
+}
+
+/* { dg-final { scan-assembler-not "xorl" } } */
+/* { dg-final { scan-assembler-not "orq" } } */


[gcc r15-1751] i386: Additional peephole2 to use lea in round-up integer division.

2024-07-01 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:142b5263b18be96e5d9ce406ad2c1b6ab35c190f

commit r15-1751-g142b5263b18be96e5d9ce406ad2c1b6ab35c190f
Author: Roger Sayle 
Date:   Mon Jul 1 12:18:26 2024 +0100

i386: Additional peephole2 to use lea in round-up integer division.

A common idiom for implementing an integer division that rounds upwards is
to write (x + y - 1) / y.  Conveniently on x86, the two additions to form
the numerator can be performed by a single lea instruction, and indeed gcc
currently generates a lea when both x and y are both registers.

int foo(int x, int y) {
  return (x+y-1)/y;
}

generates with -O2:

foo:	leal	-1(%rsi,%rdi), %eax	// 4 bytes
	cltd
	idivl	%esi
	ret

Oddly, however, if x is a memory, gcc currently uses two instructions:

int m;
int bar(int y) {
  return (m+y-1)/y;
}

generates:

bar:	movl	m(%rip), %eax
	addl	%edi, %eax		// 2 bytes
	subl	$1, %eax		// 3 bytes
	cltd
	idivl	%edi
	ret

This discrepancy is caused by the late decision (in peephole2) to split
an addition with a memory operand, into a load followed by a reg-reg
addition.  This patch improves this situation by adding a peephole2
to recognize consecutive additions and transform them into lea if
profitable.
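
For reference, the idiom itself (an illustrative sketch, not from the
patch; valid for y > 0 when x + y - 1 doesn't overflow):

#include <assert.h>

/* Round-up division: divide and round towards +infinity, for
   non-negative x and positive y.  */
static int ceil_div (int x, int y)
{
  return (x + y - 1) / y;
}

int main (void)
{
  assert (ceil_div (10, 5) == 2);
  assert (ceil_div (11, 5) == 3);
  assert (ceil_div (15, 5) == 3);
  return 0;
}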

My first attempt at fixing this was to use a define_insn_and_split:

(define_insn_and_split "*lea<mode>3_reg_mem_imm"
  [(set (match_operand:SWI48 0 "register_operand")
	(plus:SWI48 (plus:SWI48 (match_operand:SWI48 1 "register_operand")
				(match_operand:SWI48 2 "memory_operand"))
		    (match_operand:SWI48 3 "x86_64_immediate_operand")))]
  "ix86_pre_reload_split ()"
  "#"
  "&& 1"
  [(set (match_dup 4) (match_dup 2))
   (set (match_dup 0) (plus:SWI48 (plus:SWI48 (match_dup 1) (match_dup 4))
				  (match_dup 3)))]
  "operands[4] = gen_reg_rtx (<MODE>mode);")

using combine to merge the instructions.  Unfortunately, this approach
interferes with (reload's) subtle balance of deciding when to use/avoid lea,
which can be observed as a code size regression in CSiBE.  The peephole2
approach (proposed here) uniformly improves CSiBE results.

2024-07-01  Roger Sayle  

gcc/ChangeLog
* config/i386/i386.md (peephole2): Transform two consecutive
additions into a 3-component lea if !TARGET_AVOID_LEA_FOR_ADDR.

gcc/testsuite/ChangeLog
* gcc.target/i386/lea-3.c: New test case.

Diff:
---
 gcc/config/i386/i386.md               | 15 +++++++++++++++
 gcc/testsuite/gcc.target/i386/lea-3.c | 13 +++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 59a889da304..0b6f6e75072 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -6332,6 +6332,21 @@
   "TARGET_APX_NF && reload_completed"
   [(set (match_dup 0) (ashift:SWI48 (match_dup 0) (match_dup 1)))]
   "operands[1] = GEN_INT (exact_log2 (INTVAL (operands[1])));")
+
+;; The peephole2 pass may expose consecutive additions suitable for lea.
+(define_peephole2
+  [(parallel [(set (match_operand:SWI48 0 "register_operand")
+  (plus:SWI48 (match_dup 0)
+  (match_operand 1 "register_operand")))
+ (clobber (reg:CC FLAGS_REG))])
+   (parallel [(set (match_dup 0)
+  (plus:SWI48 (match_dup 0)
+  (match_operand 2 "x86_64_immediate_operand")))
+ (clobber (reg:CC FLAGS_REG))])]
+  "!TARGET_AVOID_LEA_FOR_ADDR || optimize_function_for_size_p (cfun)"
+  [(set (match_dup 0) (plus:SWI48 (plus:SWI48 (match_dup 0)
+ (match_dup 1))
+ (match_dup 2)))])
 
 ;; Add instructions
 
diff --git a/gcc/testsuite/gcc.target/i386/lea-3.c b/gcc/testsuite/gcc.target/i386/lea-3.c
new file mode 100644
index 000..84e66b00fc2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/lea-3.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+int m;
+
+int foo(int y)
+{
+  return (m+y-1)/y;
+}
+
+/* { dg-final { scan-assembler "leal" } } */
+/* { dg-final { scan-assembler-not "addl" } } */
+/* { dg-final { scan-assembler-not "subl" } } */


[gcc r15-1752] testsuite: Fix -m32 gcc.target/i386/pr102464-vrndscaleph.c on RedHat.

2024-07-01 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:589865a8e4f6bd26c622ea0ee0a38565a0d42e80

commit r15-1752-g589865a8e4f6bd26c622ea0ee0a38565a0d42e80
Author: Roger Sayle 
Date:   Mon Jul 1 12:21:20 2024 +0100

testsuite: Fix -m32 gcc.target/i386/pr102464-vrndscaleph.c on RedHat.

This patch fixes the 4 FAILs of gcc.target/i386/pr102464-vrndscaleph.c
with --target_board='unix{-m32}' on RedHat 7.x.  The issue is that this
AVX512 test includes the system math.h, and on older systems this provides
inline versions of floor, ceil and rint (for the 387).  The workaround
is to define __NO_MATH_INLINES before #include <math.h> (or alternatively
use __builtin_floor, __builtin_ceil, etc.).

2024-07-01  Roger Sayle  

gcc/testsuite/ChangeLog
PR middle-end/102464
* gcc.target/i386/pr102464-vrndscaleph.c: Define __NO_MATH_INLINES
to resolve FAILs with -m32 on older RedHat systems.

Diff:
---
 gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c b/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c
index a76d9e7e376..9eb8124e3f5 100644
--- a/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c
+++ b/gcc/testsuite/gcc.target/i386/pr102464-vrndscaleph.c
@@ -1,6 +1,9 @@
 /* PR target/102464.  */
 /* { dg-do compile } */
 /* { dg-options "-Ofast -mavx512fp16 -mavx512vl -mprefer-vector-width=512" } */
+#ifndef __NO_MATH_INLINES
+#define __NO_MATH_INLINES
+#endif
 #include <math.h>
 void
 foo (_Float16* __restrict a, _Float16* b)


[gcc r15-1835] i386: Add additional variant of bswaphisi2_lowpart peephole2.

2024-07-03 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:727f8b142b7d5442af6c2e903293abc367a8de5f

commit r15-1835-g727f8b142b7d5442af6c2e903293abc367a8de5f
Author: Roger Sayle 
Date:   Thu Jul 4 07:31:17 2024 +0100

i386: Add additional variant of bswaphisi2_lowpart peephole2.

This patch adds an additional variation of the peephole2 used to convert
bswaphisi2_lowpart into rotlhi3_1_slp, which converts xchgb %ah,%al into
rotw if the flags register isn't live.  The motivating example is:

void ext(int x);
void foo(int x)
{
  ext((x&~0xffff)|((x>>8)&0xff)|((x&0xff)<<8));
}

where GCC with -O2 currently produces:

foo:	movl	%edi, %eax
	rolw	$8, %ax
	movl	%eax, %edi
	jmp	ext

The issue is that the original xchgb (bswaphisi2_lowpart) can only be
performed in "Q" registers that allow the %?h register to be used, so
reload generates the above two movl.  However, it's later in peephole2
where we see that CC_FLAGS can be clobbered, so we can use a rotate word,
which is more forgiving with register allocations.  With the additional
peephole2 proposed here, we now generate:

foo:	rolw	$8, %di
	jmp	ext
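
The source expression really is a byte swap of the low 16 bits, which is
why a 16-bit rotate by 8 implements it (an illustrative check, not part
of the commit):

#include <assert.h>

static int swap_low_bytes (int x)
{
  return (x & ~0xffff) | ((x >> 8) & 0xff) | ((x & 0xff) << 8);
}

int main (void)
{
  /* The low word 0x5678 becomes 0x7856; the high word is untouched.  */
  assert (swap_low_bytes (0x12345678) == 0x12347856);
  return 0;
}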

2024-07-04  Roger Sayle  

gcc/ChangeLog
* config/i386/i386.md (bswaphisi2_lowpart peephole2): New
peephole2 variant to eliminate register shuffling.

gcc/testsuite/ChangeLog
* gcc.target/i386/xchg-4.c: New test case.

Diff:
---
 gcc/config/i386/i386.md                | 24 ++++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/xchg-4.c | 11 +++++++++++
 2 files changed, 35 insertions(+)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 4a44b69b5fc..b24c4fe5875 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -21489,6 +21489,30 @@
  (clobber (reg:CC FLAGS_REG))])]
   "operands[0] = gen_lowpart (HImode, operands[0]);")
 
+;; Variant of above peephole2 to improve register allocation.
+(define_peephole2
+  [(set (match_operand:SI 0 "general_reg_operand")
+	(match_operand:SI 1 "register_operand"))
+   (set (match_dup 0)
+	(ior:SI (and:SI (match_dup 0)
+			(const_int -65536))
+		(lshiftrt:SI (bswap:SI (match_dup 0))
+			     (const_int 16))))
+   (set (match_operand:SI 2 "general_reg_operand") (match_dup 0))]
+  "!(TARGET_USE_XCHGB ||
+     TARGET_PARTIAL_REG_STALL || optimize_function_for_size_p (cfun))
+   && peep2_regno_dead_p (0, FLAGS_REG)
+   && peep2_reg_dead_p (3, operands[0])"
+  [(parallel
+[(set (strict_low_part (match_dup 3))
+ (rotate:HI (match_dup 3) (const_int 8)))
+ (clobber (reg:CC FLAGS_REG))])]
+{
+  if (!rtx_equal_p (operands[1], operands[2]))
+emit_move_insn (operands[2], operands[1]);
+  operands[3] = gen_lowpart (HImode, operands[2]);
+})
+
 (define_expand "paritydi2"
   [(set (match_operand:DI 0 "register_operand")
(parity:DI (match_operand:DI 1 "register_operand")))]
diff --git a/gcc/testsuite/gcc.target/i386/xchg-4.c b/gcc/testsuite/gcc.target/i386/xchg-4.c
new file mode 100644
index 000..de099e79f5d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/xchg-4.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2" } */
+
+void ext(int x);
+void foo(int x) 
+{
+    ext((x&~0xffff)|((x>>8)&0xff)|((x&0xff)<<8));
+}
+
+/* { dg-final { scan-assembler "rolw" } } */
+/* { dg-final { scan-assembler-not "mov" } } */


[gcc r15-1869] PR target/115751: Avoid force_reg in ix86_expand_ternlog.

2024-07-05 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:9a7e3f57e1ab8e6e4cf5ea3c0998aa50c6220579

commit r15-1869-g9a7e3f57e1ab8e6e4cf5ea3c0998aa50c6220579
Author: Roger Sayle 
Date:   Sat Jul 6 05:24:39 2024 +0100

PR target/115751: Avoid force_reg in ix86_expand_ternlog.

This patch fixes a problem with splitting of complex AVX512 ternlog
instructions on x86_64.  A recent change allows the ternlog pattern
to have multiple mem-like operands prior to reload, by emitting any
"reloads" as necessary during split1, before register allocation.
The issue is that this code calls force_reg to place the mem-like
operand into a register, but unfortunately the vec_duplicate (broadcast)
form of operands supported by ternlog isn't considered a "general_operand",
i.e. supported by all instructions.  This mismatch triggers an ICE in
the middle-end's force_reg, even though the x86 supports loading these
vec_duplicate operands into a vector register in a single (move)
instruction.

This patch resolves this problem by replacing force_reg with calls
to gen_reg_rtx and emit_move (as the i386 backend, unlike the middle-end,
knows these will be recognized by recog).

2024-07-06  Roger Sayle  

gcc/ChangeLog
PR target/115751
* config/i386/i386-expand.cc (ix86_expand_ternlog): Avoid use of
force_reg to "reload" non-register operands, as these may contain
vec_duplicate (broadcast) operands that aren't supported by
force_reg.  Use (safer) gen_reg_rtx and emit_move instead.

Diff:
---
 gcc/config/i386/i386-expand.cc | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index a773b45bf03..bf79e59f811 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -26050,14 +26050,25 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx,
   break;
 }
 
-  tmp0 = register_operand (op0, mode) ? op0 : force_reg (mode, op0);
+  if (!register_operand (op0, mode))
+{
+  /* We can't use force_reg (mode, op0).  */
+  tmp0 = gen_reg_rtx (GET_MODE (op0));
+  emit_move_insn (tmp0, op0);
+}
+  else
+tmp0 = op0;
   if (GET_MODE (tmp0) != mode)
 tmp0 = gen_lowpart (mode, tmp0);
 
   if (!op1 || rtx_equal_p (op0, op1))
 tmp1 = copy_rtx (tmp0);
   else if (!register_operand (op1, mode))
-tmp1 = force_reg (mode, op1);
+{
+  /* We can't use force_reg (mode, op1).  */
+  tmp1 = gen_reg_rtx (GET_MODE (op1));
+  emit_move_insn (tmp1, op1);
+}
   else
 tmp1 = op1;
   if (GET_MODE (tmp1) != mode)


[gcc r15-2000] i386: Some AVX512 ternlog expansion refinements.

2024-07-12 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:6b5d263f2c90c3e22cdf576970c94bca268c5296

commit r15-2000-g6b5d263f2c90c3e22cdf576970c94bca268c5296
Author: Roger Sayle 
Date:   Fri Jul 12 12:30:56 2024 +0100

i386: Some AVX512 ternlog expansion refinements.

This patch replaces the calls to force_reg in ix86_expand_ternlog_binop
and ix86_expand_ternlog with gen_reg_rtx and emit_move_insn.
This patch also cleans up whitespace, consistently uses CONST_VECTOR_P
instead of GET_CODE and tweaks checks for ix86_ternlog_leaf_p (for
example where vpandn may take a memory operand).

2024-07-12  Roger Sayle  
Hongtao Liu  

gcc/ChangeLog
* config/i386/i386-expand.cc (ix86_broadcast_from_constant):
Use CONST_VECTOR_P instead of comparison against GET_CODE.
(ix86_gen_bcst_mem): Likewise.
(ix86_ternlog_leaf_p): Likewise.
(ix86_ternlog_operand_p): ix86_ternlog_leaf_p is always true for
vector_all_ones_operand.
(ix86_expand_ternlog_binop): Use CONST_VECTOR_P instead of
equality comparison against GET_CODE.  Replace call to force_reg
with gen_reg_rtx and emit_move_insn (for VEC_DUPLICATE broadcast).
Check for !register_operand instead of memory_operand.
Support CONST_VECTORs by calling force_const_mem.
(ix86_expand_ternlog): Fix indentation whitespace.
Allow ix86_ternlog_leaf_p as ix86_expand_ternlog_andnot's second
operand. Use CONST_VECTOR_P instead of equality against GET_CODE.
Use gen_reg_rtx and emit_move_insn for ~a, ~b and ~c cases.

Diff:
---
 gcc/config/i386/i386-expand.cc | 126 +
 1 file changed, 78 insertions(+), 48 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index abc702d3ff27..cfcfdd94e8f0 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -613,7 +613,7 @@ ix86_broadcast_from_constant (machine_mode mode, rtx op)
 return nullptr;
 
   rtx constant = get_pool_constant (XEXP (op, 0));
-  if (GET_CODE (constant) != CONST_VECTOR)
+  if (!CONST_VECTOR_P (constant))
 return nullptr;
 
   /* There could be some rtx like
@@ -623,7 +623,7 @@ ix86_broadcast_from_constant (machine_mode mode, rtx op)
 {
   constant = simplify_subreg (mode, constant, GET_MODE (constant),
  0);
-  if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
+  if (constant == nullptr || !CONST_VECTOR_P (constant))
return nullptr;
 }
 
@@ -25561,7 +25561,7 @@ static rtx
 ix86_gen_bcst_mem (machine_mode mode, rtx x)
 {
   if (!TARGET_AVX512F
-  || GET_CODE (x) != CONST_VECTOR
+  || !CONST_VECTOR_P (x)
   || (!TARGET_AVX512VL
  && (GET_MODE_SIZE (mode) != 64 || !TARGET_EVEX512))
   || !VALID_BCST_MODE_P (GET_MODE_INNER (mode))
@@ -25751,7 +25751,7 @@ ix86_ternlog_leaf_p (rtx op, machine_mode mode)
  problems splitting instructions.  */
   return register_operand (op, mode)
 || MEM_P (op)
-|| GET_CODE (op) == CONST_VECTOR
+|| CONST_VECTOR_P (op)
 || bcst_mem_operand (op, mode);
 }
 
@@ -25801,8 +25801,7 @@ ix86_ternlog_operand_p (rtx op)
   op1 = XEXP (op, 1);
   /* Prefer pxor, or one_cmpl2.  */
   if (ix86_ternlog_leaf_p (XEXP (op, 0), mode)
- && (ix86_ternlog_leaf_p (op1, mode)
- || vector_all_ones_operand (op1, mode)))
+ && ix86_ternlog_leaf_p (XEXP (op, 1), mode))
return false;
   break;
 
@@ -25822,15 +25821,20 @@ ix86_expand_ternlog_binop (enum rtx_code code, machine_mode mode,
   if (GET_MODE (op1) != mode)
 op1 = gen_lowpart (mode, op1);
 
-  if (GET_CODE (op0) == CONST_VECTOR)
+  if (CONST_VECTOR_P (op0))
 op0 = validize_mem (force_const_mem (mode, op0));
-  if (GET_CODE (op1) == CONST_VECTOR)
+  if (CONST_VECTOR_P (op1))
 op1 = validize_mem (force_const_mem (mode, op1));
 
-  if (memory_operand (op0, mode))
+  if (!register_operand (op0, mode))
 {
-  if (memory_operand (op1, mode))
-   op0 = force_reg (mode, op0);
+  if (!register_operand (op1, mode))
+   {
+ /* We can't use force_reg (mode, op0).  */
+ rtx reg = gen_reg_rtx (mode);
+ emit_move_insn (reg, op0);
+ op0 = reg;
+   }
   else
std::swap (op0, op1);
 }
@@ -25849,6 +25853,8 @@ ix86_expand_ternlog_andnot (machine_mode mode, rtx op0, rtx op1, rtx target)
   op0 = gen_rtx_NOT (mode, op0);
   if (GET_MODE (op1) != mode)
 op1 = gen_lowpart (mode, op1);
+  if (CONST_VECTOR_P (op1))
+op1 = validize_mem (force_const_mem (mode, op1));
   emit_move_insn (target, gen_rtx_AND (mode, op0, op1));
   return target;
 }
@@ -25885,9 +25891,9 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx,
 {
 case 0x00:
   if ((!op0 || !side_effects_p (op0))
-   

[gcc r15-2027] i386: Tweak i386-expand.cc to restore bootstrap on RHEL.

2024-07-14 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:74e6dfb23163c2dd670d1d60fbf4c782e0b44b94

commit r15-2027-g74e6dfb23163c2dd670d1d60fbf4c782e0b44b94
Author: Roger Sayle 
Date:   Sun Jul 14 17:22:27 2024 +0100

i386: Tweak i386-expand.cc to restore bootstrap on RHEL.

This is a minor change to restore bootstrap on systems using gcc 4.8
as a host compiler.  The fatal error is:

In file included from gcc/gcc/coretypes.h:471:0,
 from gcc/gcc/config/i386/i386-expand.cc:23:
gcc/gcc/config/i386/i386-expand.cc: In function 'void ix86_expand_fp_absneg_operator(rtx_code, machine_mode, rtx_def**)':
./insn-modes.h:315:75: error: temporary of non-literal type 'scalar_float_mode' in a constant expression
 #define HFmode (scalar_float_mode ((scalar_float_mode::from_int) E_HFmode))
                                                                           ^
gcc/gcc/config/i386/i386-expand.cc:2179:8: note: in expansion of macro 'HFmode'
   case HFmode:
        ^

The solution is to use the E_?Fmode enumeration constants as case values
in switch statements.

2024-07-14  Roger Sayle  

* config/i386/i386-expand.cc (ix86_expand_fp_absneg_operator):
Use E_?Fmode enumeration constants in switch statement.
(ix86_expand_copysign): Likewise.
(ix86_expand_xorsign): Likewise.

Diff:
---
 gcc/config/i386/i386-expand.cc | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index cfcfdd94e8f0..9a31e6df2aa2 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -2176,19 +2176,19 @@ ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
 
   switch (mode)
   {
-  case HFmode:
+  case E_HFmode:
 use_sse = true;
 vmode = V8HFmode;
 break;
-  case BFmode:
+  case E_BFmode:
 use_sse = true;
 vmode = V8BFmode;
 break;
-  case SFmode:
+  case E_SFmode:
 use_sse = TARGET_SSE_MATH && TARGET_SSE;
 vmode = V4SFmode;
 break;
-  case DFmode:
+  case E_DFmode:
 use_sse = TARGET_SSE_MATH && TARGET_SSE2;
 vmode = V2DFmode;
 break;
@@ -2330,19 +2330,19 @@ ix86_expand_copysign (rtx operands[])
 
   switch (mode)
   {
-  case HFmode:
+  case E_HFmode:
 vmode = V8HFmode;
 break;
-  case BFmode:
+  case E_BFmode:
 vmode = V8BFmode;
 break;
-  case SFmode:
+  case E_SFmode:
 vmode = V4SFmode;
 break;
-  case DFmode:
+  case E_DFmode:
 vmode = V2DFmode;
 break;
-  case TFmode:
+  case E_TFmode:
 vmode = mode;
 break;
   default:
@@ -2410,16 +2410,16 @@ ix86_expand_xorsign (rtx operands[])
 
   switch (mode)
   {
-  case HFmode:
+  case E_HFmode:
 vmode = V8HFmode;
 break;
-  case BFmode:
+  case E_BFmode:
 vmode = V8BFmode;
 break;
-  case SFmode:
+  case E_SFmode:
 vmode = V4SFmode;
 break;
-  case DFmode:
+  case E_DFmode:
 vmode = V2DFmode;
 break;
   default:


[gcc r15-2053] PR tree-optimization/114661: Generalize MULT_EXPR recognition in match.pd.

2024-07-16 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:df9451936c6c9e4faea371e3f188e1fc6b6d39e3

commit r15-2053-gdf9451936c6c9e4faea371e3f188e1fc6b6d39e3
Author: Roger Sayle 
Date:   Tue Jul 16 07:58:28 2024 +0100

PR tree-optimization/114661: Generalize MULT_EXPR recognition in match.pd.

This patch resolves PR tree-optimization/114661, by generalizing the set
of expressions that we canonicalize to multiplication.  This extends the
optimization(s) contributed (by me) back in July 2021.
https://gcc.gnu.org/pipermail/gcc-patches/2021-July/575999.html

The existing transformation folds (X*C1)^(X<<C2), (X*C1)|(X*C2) and
related disjoint-bit forms into a single multiplication X*C3.  For the
motivating testcase:

unsigned mul(unsigned char c) {
    if (c > 3) __builtin_unreachable();
    return c << 18 | c << 15 |
           c << 12 | c << 9 |
           c << 6 | c << 3 | c;
}

GCC on x86_64 with -O2 previously generated:

mul:	movzbl	%dil, %edi
	leal	(%rdi,%rdi,8), %edx
	leal	0(,%rdx,8), %eax
	movl	%edx, %ecx
	sall	$15, %edx
	orl	%edi, %eax
	sall	$9, %ecx
	orl	%ecx, %eax
	orl	%edx, %eax
	ret

with this patch we now generate:

mul:	movzbl	%dil, %eax
	imull	$299593, %eax, %eax
	ret
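
The multiplier is simply the sum of the shifted powers of two (a quick
check, not part of the commit):

#include <assert.h>

int main (void)
{
  /* (1<<18)+(1<<15)+(1<<12)+(1<<9)+(1<<6)+(1<<3)+1 == 299593, and for
     c <= 3 the ORed fields are disjoint, so OR equals the sum.  */
  for (unsigned c = 0; c <= 3; c++)
    assert ((c << 18 | c << 15 | c << 12 | c << 9 | c << 6 | c << 3 | c)
            == c * 299593);
  return 0;
}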

2024-07-16  Roger Sayle  
Richard Biener  

gcc/ChangeLog
PR tree-optimization/114661
* match.pd ((X*C1)|(X*C2) to X*(C1+C2)): Allow optional useless
type conversions around multiplications, such as those inserted
by this transformation.

gcc/testsuite/ChangeLog
PR tree-optimization/114661
* gcc.dg/pr114661.c: New test case.

Diff:
---
 gcc/match.pd                    | 43 ++++++++++++++++++++++++++-----------------
 gcc/testsuite/gcc.dg/pr114661.c | 10 ++++++++++
 2 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index 3759c64d461f..24a0bbead3e7 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -4171,30 +4171,39 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
Likewise, handle (X< 0
-   && (tree_nonzero_bits (@0) & tree_nonzero_bits (@3)) == 0)
-   (with { wide_int wone = wi::one (TYPE_PRECISION (type));
+   && (tree_nonzero_bits (@5) & tree_nonzero_bits (@3)) == 0)
+   (with { tree t = type;
+  if (!TYPE_OVERFLOW_WRAPS (t))
+t = unsigned_type_for (t);
+  wide_int wone = wi::one (TYPE_PRECISION (type));
   wide_int c = wi::add (wi::to_wide (@2),
 wi::lshift (wone, wi::to_wide (@4))); }
-(mult @1 { wide_int_to_tree (type, c); }
+(convert (mult:t (convert:t @1) { wide_int_to_tree (t, c); })
  (simplify
-  (op:c (mult:s@0 @1 INTEGER_CST@2)
+  (op:c (nop_convert?:s@3 (mult:s@0 (nop_convert? @1) INTEGER_CST@2))
@1)
-  (if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_WRAPS (type)
-   && (tree_nonzero_bits (@0) & tree_nonzero_bits (@1)) == 0)
-   (mult @1
-{ wide_int_to_tree (type,
-wi::add (wi::to_wide (@2), 1)); })))
+  (if (INTEGRAL_TYPE_P (type)
+   && (tree_nonzero_bits (@3) & tree_nonzero_bits (@1)) == 0)
+   (with { tree t = type;
+  if (!TYPE_OVERFLOW_WRAPS (t))
+t = unsigned_type_for (t);
+  wide_int c = wi::add (wi::to_wide (@2), 1); }
+(convert (mult:t (convert:t @1) { wide_int_to_tree (t, c); })
  (simplify
   (op (lshift:s@0 @1 INTEGER_CST@2)
   (lshift:s@3 @1 INTEGER_CST@4))
diff --git a/gcc/testsuite/gcc.dg/pr114661.c b/gcc/testsuite/gcc.dg/pr114661.c
new file mode 100644
index ..e6b5c69dba86
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr114661.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-evrp" } */
+
+unsigned mul(unsigned char c) {
+    if (c > 3) __builtin_unreachable();
+    return c << 18 | c << 15 |
+           c << 12 | c << 9 |
+           c << 6 | c << 3 | c;
+}
+/* { dg-final { scan-tree-dump-times " \\* 299593" 1 "evrp" } } */


[gcc r15-2132] Implement a -ftrapping-math/-fsignaling-nans TODO in match.pd.

2024-07-18 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:030186cabe8128e752619e101768cf8823a42c38

commit r15-2132-g030186cabe8128e752619e101768cf8823a42c38
Author: Roger Sayle 
Date:   Thu Jul 18 08:27:36 2024 +0100

Implement a -ftrapping-math/-fsignaling-nans TODO in match.pd.

I've been investigating some (float)i == CST optimizations for match.pd,
and noticed there's already a TODO comment in match.pd that's relatively
easy to implement.  When CST is a NaN, we only need to worry about
exceptions with flag_trapping_math, and equality/inequality tests for
sNaN only behave differently to qNaN with -fsignaling-nans.  These
issues are related to PR 57371 and PR 106805 in bugzilla.
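
The sort of comparisons this allows us to fold at compile time, along
the lines of the testsuite changes below (illustrative):

#define QNAN __builtin_nanf ("0")
#define SNAN __builtin_nansf ("0")

/* With -fno-signaling-nans -fno-trapping-math these all fold: ordered
   comparisons and == against a NaN are false, != is true.  */
int f1 (unsigned short x) { return (float) x >  SNAN; }  /* 0 */
int f2 (unsigned short x) { return (float) x == QNAN; }  /* 0 */
int f3 (unsigned short x) { return (float) x != SNAN; }  /* 1 */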

2024-07-18  Roger Sayle  

gcc/ChangeLog
* match.pd ((FTYPE) N CMP CST): Only worry about exceptions with
flag_trapping_math, and about signaling NaNs with HONOR_SNANS.

gcc/testsuite/ChangeLog
* c-c++-common/pr57371-4.c: Update comment.
* c-c++-common/pr57371-5.c: Add missing testcases from pr57371-4.c
and update for -fno-signaling-nans -fno-trapping-math.

Diff:
---
 gcc/match.pd                           | 14 +++++++-------
 gcc/testsuite/c-c++-common/pr57371-4.c |  4 +---
 gcc/testsuite/c-c++-common/pr57371-5.c | 42 +++++++++++++++++++++++++++++++++++++++---
 3 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index 5cb399b87180..6818856991c6 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -6862,13 +6862,13 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
tree itype = TREE_TYPE (@0);
format_helper fmt (REAL_MODE_FORMAT (TYPE_MODE (TREE_TYPE (@1;
const REAL_VALUE_TYPE *cst = TREE_REAL_CST_PTR (@1);
-   /* Be careful to preserve any potential exceptions due to
- NaNs.  qNaNs are ok in == or != context.
- TODO: relax under -fno-trapping-math or
- -fno-signaling-nans.  */
-   bool exception_p
- = real_isnan (cst) && (cst->signalling
-   || (cmp != EQ_EXPR && cmp != NE_EXPR));
+   /* Be careful to preserve any potential exceptions due to NaNs.
+ qNaNs are ok in == or != context.  */
+   bool exception_p = real_isnan (cst)
+ && flag_trapping_math
+ && ((cmp != EQ_EXPR && cmp != NE_EXPR)
+ || (cst->signalling
+	     && HONOR_SNANS (TREE_TYPE (@1))));
  }
  /* TODO: allow non-fitting itype and SNaNs when
-fno-trapping-math.  */
diff --git a/gcc/testsuite/c-c++-common/pr57371-4.c b/gcc/testsuite/c-c++-common/pr57371-4.c
index f43f7c22419a..b0e539de4b9f 100644
--- a/gcc/testsuite/c-c++-common/pr57371-4.c
+++ b/gcc/testsuite/c-c++-common/pr57371-4.c
@@ -2,9 +2,7 @@
 /* { dg-options "-O -fsignaling-nans -fdump-tree-original" } */
 
 /* We can not get rid of comparison in tests below because of
-   pending NaN exceptions.
-
-   TODO: avoid under -fno-trapping-math.  */
+   pending NaN exceptions.  */
 
 #define QNAN __builtin_nanf ("0")
 #define SNAN __builtin_nansf ("0")
diff --git a/gcc/testsuite/c-c++-common/pr57371-5.c b/gcc/testsuite/c-c++-common/pr57371-5.c
index 8e18b0a73138..77decbe5dff5 100644
--- a/gcc/testsuite/c-c++-common/pr57371-5.c
+++ b/gcc/testsuite/c-c++-common/pr57371-5.c
@@ -2,11 +2,10 @@
 /* { dg-options "-O -fno-signaling-nans -fno-trapping-math -fdump-tree-original" } */
 
 /* We can not get rid of comparison in tests below because of
-   pending NaN exceptions.
-
-   TODO: avoid under -fno-trapping-math.  */
+   pending NaN exceptions.  */
 
 #define QNAN __builtin_nanf ("0")
+#define SNAN __builtin_nansf ("0")
 
 void nonfinite(unsigned short x) {
   {
@@ -33,6 +32,43 @@ void nonfinite(unsigned short x) {
 /* { dg-final { scan-tree-dump "nonfinite_4 = 0" "original" } } */
   }
 
+  {
+volatile int nonfinite_5;
+nonfinite_5 = (float) x > SNAN;
+/* { dg-final { scan-tree-dump "nonfinite_5 = 0" "original" } } */
+  }
+
+  {
+volatile int nonfinite_6;
+nonfinite_6 = (float) x >= SNAN;
+/* { dg-final { scan-tree-dump "nonfinite_6 = 0" "original" } } */
+  }
+
+  {
+volatile int nonfinite_7;
+nonfinite_7 = (float) x < SNAN;
+/* { dg-final { scan-tree-dump "nonfinite_7 = 0" "original" } } */
+  }
+
+  {
+volatile int nonfinite_8;
+nonfinite_8 = (float) x <= SNAN;
+/* { dg-final { scan-tree-dump "nonfinite_8 = 0" "original" } } */
+  }
+
+  {
+volatile int nonfinite_9;
+nonfinite_9 = (float) x == SNAN;
+/* { dg-final { scan-tree-dump "nonfinite_9 = 0" "original" } } */
+  }
+
+  {
+volatile int nonfinite_10;
+nonfinite_10 = (float) x != SNAN;
+/* { dg-final { scan-tree-dump "nonfinite_10 = 1" "original" } } *
+ */
+  }
+
   {
 volatile int nonfinite_11;
 nonfinite_11 = (float) x == QNAN;


[gcc r15-2359] Fold ctz(-x) and ctz(abs(x)) as ctz(x) in match.pd.

2024-07-27 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:928116e94a5a8a995dffd926af58abfa7286e78e

commit r15-2359-g928116e94a5a8a995dffd926af58abfa7286e78e
Author: Roger Sayle 
Date:   Sat Jul 27 15:16:19 2024 +0100

Fold ctz(-x) and ctz(abs(x)) as ctz(x) in match.pd.

The subject line pretty much says it all; the count-trailing-zeros function
of -X and abs(X) produces the same result as count-trailing-zeros of X.
This transformation eliminates a negation which may potentially overflow
with an equivalent expression that doesn't [much like the analogous
abs(-X) simplification in match.pd].
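
The -X identity holds because negation (~X + 1) leaves the lowest set
bit, and everything below it, unchanged; an exhaustive check over a
small range (illustrative, not from the patch):

#include <assert.h>
#include <stdlib.h>

int main (void)
{
  for (int x = -0xffff; x <= 0xffff; x++)
    {
      if (x == 0)
        continue;  /* __builtin_ctz (0) is undefined.  */
      assert (__builtin_ctz (-x) == __builtin_ctz (x));
      assert (__builtin_ctz (abs (x)) == __builtin_ctz (x));
    }
  return 0;
}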

I'd noticed this -X equivalence, which isn't mentioned in Hacker's Delight,
investigating whether ranger's non_zero_bits can help determine whether
an integer variable may be converted to a floating point type exactly
(without raising FE_INEXACT), but it turns out this observation isn't
novel, as (disappointingly) LLVM already performs this same folding.

2024-07-27  Roger Sayle  
Andrew Pinski  

gcc/ChangeLog
* match.pd (ctz (-X) => ctz (X)): New simplification.
(ctz (abs (X)) => ctz (X)): Likewise.

gcc/testsuite/ChangeLog
* gcc.dg/fold-ctz-1.c: New test case.
* gcc.dg/fold-ctz-2.c: Likewise.

Diff:
---
 gcc/match.pd                      | 6 ++++++
 gcc/testsuite/gcc.dg/fold-ctz-1.c | 9 +++++++++
 gcc/testsuite/gcc.dg/fold-ctz-2.c | 9 +++++++++
 3 files changed, 24 insertions(+)

diff --git a/gcc/match.pd b/gcc/match.pd
index b2e7d61790df..1c8601229e3d 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -9102,6 +9102,12 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 
 /* CTZ simplifications.  */
 (for ctz (CTZ)
+ /* ctz (-X) => ctz (X).  ctz (abs (X)) => ctz (X).  */
+ (for op (negate abs)
+  (simplify
+   (ctz (nop_convert?@0 (op @1)))
+(with { tree t = TREE_TYPE (@0); }
+ (ctz (convert:t @1)
  (for op (ge gt le lt)
   cmp (eq eq ne ne)
   (simplify
diff --git a/gcc/testsuite/gcc.dg/fold-ctz-1.c b/gcc/testsuite/gcc.dg/fold-ctz-1.c
new file mode 100644
index ..dcc444cbbb6b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-ctz-1.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+int foo(int x)
+{
+  return __builtin_ctz (-x);
+}
+
+/* { dg-final { scan-tree-dump-not "-x_" "optimized"} } */
diff --git a/gcc/testsuite/gcc.dg/fold-ctz-2.c b/gcc/testsuite/gcc.dg/fold-ctz-2.c
new file mode 100644
index ..c685698f31e5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-ctz-2.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+int foo(int x)
+{
+  return __builtin_ctz (__builtin_abs (x));
+}
+
+/* { dg-final { scan-tree-dump-not "ABS_EXPR" "optimized"} } */


[gcc r15-774] Avoid ICE in except.cc on targets that don't support exceptions.

2024-05-22 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:26df7b4684e201e66c09dd018603a248ddc5f437

commit r15-774-g26df7b4684e201e66c09dd018603a248ddc5f437
Author: Roger Sayle 
Date:   Wed May 22 13:48:52 2024 +0100

Avoid ICE in except.cc on targets that don't support exceptions.

A number of testcases currently fail on nvptx with the ICE:

during RTL pass: final
openmp-simd-2.c: In function 'foo':
openmp-simd-2.c:28:1: internal compiler error: in get_personality_function, at expr.cc:14037
   28 | }
  | ^
0x98a38f get_personality_function(tree_node*)
/home/roger/GCC/nvptx-none/gcc/gcc/expr.cc:14037
0x969d3b output_function_exception_table(int)
/home/roger/GCC/nvptx-none/gcc/gcc/except.cc:3226
0x9b760d rest_of_handle_final
/home/roger/GCC/nvptx-none/gcc/gcc/final.cc:4252

The simple oversight in output_function_exception_table is that it calls
get_personality_function (immediately) before checking the target's
except_unwind_info hook (which on nvptx always returns UI_NONE).
The (perhaps obvious) fix is to move the assignments of fname and
personality after the tests that they are needed, and before their
first use.

2024-05-22  Roger Sayle  

gcc/ChangeLog
* except.cc (output_function_exception_table): Move call to
get_personality_function after targetm_common.except_unwind_info
check, to avoid ICE on targets that don't support exceptions.

Diff:
---
 gcc/except.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/except.cc b/gcc/except.cc
index 2080fcc22e6..b5886e97be9 100644
--- a/gcc/except.cc
+++ b/gcc/except.cc
@@ -3222,9 +3222,6 @@ output_one_function_exception_table (int section)
 void
 output_function_exception_table (int section)
 {
-  const char *fnname = get_fnname_from_decl (current_function_decl);
-  rtx personality = get_personality_function (current_function_decl);
-
   /* Not all functions need anything.  */
   if (!crtl->uses_eh_lsda
   || targetm_common.except_unwind_info (&global_options) == UI_NONE)
@@ -3234,6 +3231,9 @@ output_function_exception_table (int section)
   if (section == 1 && !crtl->eh.call_site_record_v[1])
 return;
 
+  const char *fnname = get_fnname_from_decl (current_function_decl);
+  rtx personality = get_personality_function (current_function_decl);
+
   if (personality)
 {
   assemble_external_libcall (personality);


[gcc r15-775] i386: Correct insn_cost of movabsq.

2024-05-22 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:a3b16e73a2d5b2d4d20ef6f2fd164cea633bbec8

commit r15-775-ga3b16e73a2d5b2d4d20ef6f2fd164cea633bbec8
Author: Roger Sayle 
Date:   Wed May 22 16:45:48 2024 +0100

i386: Correct insn_cost of movabsq.

This single line patch fixes a strange quirk/glitch in i386's rtx_costs,
which considers an instruction loading a 64-bit constant to be significantly
cheaper than loading a 32-bit (or smaller) constant.

Consider the two functions:
unsigned long long foo() { return 0x0123456789abcdefULL; }
unsigned int bar() { return 10; }

and the corresponding lines from combine's dump file:
  insn_cost 1 for #: r98:DI=0x123456789abcdef
  insn_cost 4 for #: ax:SI=0xa

The same issue can be seen in -dP assembler output.
  movabsq $81985529216486895, %rax	# 5  [c=1 l=10]  *movdi_internal/4

The problem is that pattern_cost's interpretation of rtx_costs contains
"return cost > 0 ? cost : COSTS_N_INSNS (1)" where a zero value (for
example a register or small immediate constant) is considered special,
and equivalent to a single instruction, but all other values are treated
as verbatim.  Hence, to make x86_64's 10-byte long movabsq instruction slightly
more expensive than a simple constant, rtx_costs needs to return
COSTS_N_INSNS(1)+1 and not 1.  With this change, the insn_cost of
movabsq is the intended value 5:
  insn_cost 5 for #: r98:DI=0x123456789abcdef
and
  movabsq $81985529216486895, %rax	# 5  [c=5 l=10]  *movdi_internal/4
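
For context (not part of the patch), COSTS_N_INSNS in gcc/rtl.h scales
instruction counts by four, which is what made the returned value 1
anomalously cheap:

#include <stdio.h>

#define COSTS_N_INSNS(N) ((N) * 4)  /* as defined in gcc/rtl.h */

static int pattern_cost_of (int rtx_cost)
{
  /* pattern_cost: "return cost > 0 ? cost : COSTS_N_INSNS (1)".  */
  return rtx_cost > 0 ? rtx_cost : COSTS_N_INSNS (1);
}

int main (void)
{
  printf ("old %d, new %d\n",
          pattern_cost_of (1),                      /* 1: cheaper than 4!  */
          pattern_cost_of (COSTS_N_INSNS (1) + 1)); /* 5, as intended.  */
  return 0;
}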

2024-05-22  Roger Sayle  

gcc/ChangeLog
* config/i386/i386.cc (ix86_rtx_costs) <case CONST_INT>:
A CONST_INT that isn't x86_64_immediate_operand requires an extra
(expensive) movabsq insn to load, so return COSTS_N_INSNS (1) + 1.

Diff:
---
 gcc/config/i386/i386.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 69cd4ae05a7..3e2a3a194f1 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21562,7 +21562,8 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
   if (x86_64_immediate_operand (x, VOIDmode))
*total = 0;
  else
-   *total = 1;
+   /* movabsq is slightly more expensive than a simple instruction. */
+   *total = COSTS_N_INSNS (1) + 1;
   return true;
 
 case CONST_DOUBLE:


[gcc r15-1100] i386: Improve handling of ternlog instructions in i386/sse.md

2024-06-07 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:ec985bc97a01577bca8307f986caba7ba7633cde

commit r15-1100-gec985bc97a01577bca8307f986caba7ba7633cde
Author: Roger Sayle 
Date:   Fri Jun 7 13:57:23 2024 +0100

i386: Improve handling of ternlog instructions in i386/sse.md

This patch improves the way that the x86 backend recognizes and
expands AVX512's bitwise ternary logic (vpternlog) instructions.

As a motivating example consider the following code which calculates
the carry out from a (binary) full adder:

typedef unsigned long long v4di __attribute((vector_size(32)));

v4di foo(v4di a, v4di b, v4di c)
{
return (a & b) | ((a ^ b) & c);
}

with -O2 -march=cascadelake current mainline produces:

foo:	vpternlogq	$96, %ymm0, %ymm1, %ymm2
	vmovdqa	%ymm0, %ymm3
	vmovdqa	%ymm2, %ymm0
	vpternlogq	$248, %ymm3, %ymm1, %ymm0
	ret

with the patch below, we now generate a single instruction:

foo:	vpternlogq	$232, %ymm2, %ymm1, %ymm0
	ret

The AVX512 vpternlog[qd] instructions are a very cool addition to the
x86 instruction set, that can calculate any Boolean function of three
inputs in a single fast instruction.  As the truth table for any
three-input function has 8 rows, any specific function can be represented
by specifying those bits, i.e. by an 8-bit byte, an immediate integer
between 0 and 255.

Examples of ternary functions and their indices are given below:

0x01   1:  ~((b|a)|c)
0x02   2:  (~(b|a))&c
0x03   3:  ~(b|a)
0x04   4:  (~(c|a))&b
0x05   5:  ~(c|a)
0x06   6:  (c^b)&~a
0x07   7:  ~((c&b)|a)
0x08   8:  (~a&c)&b (~a&b)&c (c&b)&~a
0x09   9:  ~((c^b)|a)
0x0a  10:  ~a&c
0x0b  11:  ~((~c&b)|a) (~b|c)&~a
0x0c  12:  ~a&b
0x0d  13:  ~((~b&c)|a) (~c|b)&~a
0x0e  14:  (c|b)&~a
0x0f  15:  ~a
0x10  16:  (~(c|b))&a
0x11  17:  ~(c|b)
...
0xf4 244:  (~c&b)|a
0xf5 245:  ~c|a
0xf6 246:  (c^b)|a
0xf7 247:  (~(c&b))|a
0xf8 248:  (c&b)|a
0xf9 249:  (~(c^b))|a
0xfa 250:  c|a
0xfb 251:  (c|a)|~b (~b|a)|c (~b|c)|a
0xfc 252:  b|a
0xfd 253:  (b|a)|~c (~c|a)|b (~c|b)|a
0xfe 254:  (b|a)|c (c|a)|b (c|b)|a
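
A handy way to derive any of these indices (an illustrative sketch, not
part of the commit) is to evaluate the function on the canonical operand
masks a = 0xf0, b = 0xcc and c = 0xaa; for the carry-out example above:

#include <stdio.h>

int main (void)
{
  unsigned a = 0xf0, b = 0xcc, c = 0xaa;
  unsigned idx = (a & b) | ((a ^ b) & c);  /* full-adder carry out */
  printf ("%u\n", idx);                    /* prints 232, as in $232 */
  return 0;
}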

A naive implementation (in many compilers) might be to add define_insn
patterns for all 256 different functions.  The situation is even
worse as many of these Boolean functions don't have a "canonical form"
(as produced by simplify_rtx) and would each need multiple patterns.
See the space-separated equivalent expressions in the table above.

This need to provide instruction "templates" might explain why GCC,
LLVM and ICC all exhibit similar coverage problems in their ability
to recognize x86 ternlog ternary functions.

Perhaps a unique feature of GCC's design is that in addition to regular
define_insn templates, machine descriptions can also perform pattern
matching via a match_operator (and its corresponding predicate).
This patch introduces a ternlog_operand predicate that matches a
(possibly infinite) set of expression trees, identifying those that
have at most three unique operands.  This then allows a
define_insn_and_split to recognize suitable expressions and then
transform them into the appropriate UNSPEC_VTERNLOG as a pre-reload
splitter.  This design allows combine to smash together arbitrarily
complex Boolean expressions, then transform them into an UNSPEC
before register allocation.  As an "optimization", where possible
ix86_expand_ternlog generates a simpler binary operation, using
AND, XOR, IOR or ANDN where possible, and in a few cases attempts
to "canonicalize" the ternlog, by reordering or duplicating operands,
so that later CSE passes have a hope of spotting equivalent values.

This patch leaves the existing ternlog patterns in sse.md (for now),
many of which are made obsolete by these changes.  In theory we now
only need one define_insn for UNSPEC_VTERNLOG.  One complication from
these previous variants was that they inconsistently used decimal vs.
hexadecimal to specify the immediate constant operand in assembly
language, making the list of tweaks to the testsuite with this patch
larger than it might have been.  I propose to remove the vestigial
patterns in a follow-up patch, once this approach has baked (proven
to be stable) on mainline.

2024-06-07  Roger Sayle  
Hongtao Liu  

gcc/ChangeLog
* config/i386/i386-expand.cc (ix86_expand_args_builtin): Call
fixup_modeless_constant before testing predicates.  Only call
copy_to_mode_reg on memory operands (after the first one).
(ix86_gen_bcst_mem): Helper function to convert a CONST_VECTOR
into a VEC_DUPLICATE if possible.
(ix86_tern

[gcc r15-1101] i386: PR target/115351: RTX costs for *concatditi3 and *insvti_highpart.

2024-06-07 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:fb3e4c549d16d5050e10114439ad77149f33c597

commit r15-1101-gfb3e4c549d16d5050e10114439ad77149f33c597
Author: Roger Sayle 
Date:   Fri Jun 7 14:03:20 2024 +0100

i386: PR target/115351: RTX costs for *concatditi3 and *insvti_highpart.

This patch addresses PR target/115351, which is a code quality regression
on x86 when passing floating point complex numbers.  The ABI considers
these arguments to have TImode, requiring interunit moves to place the
FP values (which are actually passed in SSE registers) into the upper
and lower parts of a TImode pseudo, and then similar moves back again
before they can be used.
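
A minimal example of the kind of function affected (illustrative; the
commit's own testcase is g++.target/i386/pr115351.C):

/* The _Complex double argument and result have TImode in the ABI,
   while the real and imaginary halves live in SSE registers.  */
_Complex double cadd (_Complex double x, _Complex double y)
{
  return x + y;
}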

The cause of the regression is that changes in how TImode initialization
is represented in RTL now prevents the RTL optimizers from eliminating
these redundant moves.  The specific cause is that the *concatditi3
pattern, (zext(hi)<<64)|zext(lo), has an inappropriately high (default)
rtx_cost, preventing fwprop1 from propagating it.  This pattern just
sets the hipart and lopart of a double-word register, typically two
instructions (less if reload can allocate things appropriately) but
the current ix86_rtx_costs actually returns COSTS_N_INSNS(13), i.e. 52.

propagating insn 5 into insn 6, replacing:
(set (reg:TI 110)
    (ior:TI (and:TI (reg:TI 110)
            (const_wide_int 0x0ffffffffffffffff))
        (ashift:TI (zero_extend:TI (subreg:DI (reg:DF 112 [ zD.2796+8 ]) 0))
            (const_int 64 [0x40]))))
successfully matched this instruction to *concatditi3_3:
(set (reg:TI 110)
    (ior:TI (ashift:TI (zero_extend:TI (subreg:DI (reg:DF 112 [ zD.2796+8 ]) 0))
            (const_int 64 [0x40]))
        (zero_extend:TI (subreg:DI (reg:DF 111 [ zD.2796 ]) 0))))
change not profitable (cost 50 -> cost 52)

This issue is resolved by having ix86_rtx_costs return more reasonable
values for these (place-holder) patterns.

2024-06-07  Roger Sayle  

gcc/ChangeLog
PR target/115351
* config/i386/i386.cc (ix86_rtx_costs): Provide estimates for
the *concatditi3 and *insvti_highpart patterns, about two insns.

gcc/testsuite/ChangeLog
PR target/115351
* g++.target/i386/pr115351.C: New test case.

Diff:
---
 gcc/config/i386/i386.cc                  | 43 +++++++++++++++++++++++++++++++++++++++++++
 gcc/testsuite/g++.target/i386/pr115351.C | 19 +++++++++++++++++++
 2 files changed, 62 insertions(+)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 4126ab24a79..173db213d14 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21912,6 +21912,49 @@ ix86_rtx_costs (rtx x, machine_mode mode, int outer_code_i, int opno,
}
  *total = ix86_vec_cost (mode, cost->sse_op);
}
+  else if (TARGET_64BIT
+  && mode == TImode
+  && GET_CODE (XEXP (x, 0)) == ASHIFT
+  && GET_CODE (XEXP (XEXP (x, 0), 0)) == ZERO_EXTEND
+  && GET_MODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == DImode
+  && CONST_INT_P (XEXP (XEXP (x, 0), 1))
+  && INTVAL (XEXP (XEXP (x, 0), 1)) == 64
+  && GET_CODE (XEXP (x, 1)) == ZERO_EXTEND
+  && GET_MODE (XEXP (XEXP (x, 1), 0)) == DImode)
+   {
+ /* *concatditi3 is cheap.  */
+ rtx op0 = XEXP (XEXP (XEXP (x, 0), 0), 0);
+ rtx op1 = XEXP (XEXP (x, 1), 0);
+ *total = (SUBREG_P (op0) && GET_MODE (SUBREG_REG (op0)) == DFmode)
+  ? COSTS_N_INSNS (1)/* movq.  */
+  : set_src_cost (op0, DImode, speed);
+ *total += (SUBREG_P (op1) && GET_MODE (SUBREG_REG (op1)) == DFmode)
+   ? COSTS_N_INSNS (1)/* movq.  */
+   : set_src_cost (op1, DImode, speed);
+ return true;
+   }
+  else if (TARGET_64BIT
+  && mode == TImode
+  && GET_CODE (XEXP (x, 0)) == AND
+  && REG_P (XEXP (XEXP (x, 0), 0))
+  && CONST_WIDE_INT_P (XEXP (XEXP (x, 0), 1))
+  && CONST_WIDE_INT_NUNITS (XEXP (XEXP (x, 0), 1)) == 2
+  && CONST_WIDE_INT_ELT (XEXP (XEXP (x, 0), 1), 0) == -1
+  && CONST_WIDE_INT_ELT (XEXP (XEXP (x, 0), 1), 1) == 0
+  && GET_CODE (XEXP (x, 1)) == ASHIFT
+  && GET_CODE (XEXP (XEXP (x, 1), 0)) == ZERO_EXTEND
+  && GET_MODE (XEXP (XEXP (XEXP (x, 1), 0), 0)) == DImode
+  && CONST_INT_P (XEXP (XEXP (x, 1), 1))
+  && INTVAL (XEXP (XEXP (x, 1), 1)) == 64)
+   {
+ /* *insvti_highpart is cheap.  */
+ rtx op = XEXP (XEXP (XEXP (x, 1), 0), 0);
+ *total = COSTS_N_INSNS (1) + 1;
+ *total += (SUBREG_P (op) && GET_MODE (SUBREG_REG (op)) == DFmode)
+   ? COSTS_N_INSNS (1)/* movq.  */
+   : set_src_cost (op, DImode, speed);
+ return true

[gcc r15-1111] analyzer: Restore g++ 4.8 bootstrap; use std::move to return std::unique_ptr.

2024-06-07 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:e22b7f741ab54ff3a3f8a676ce9e7414fe174958

commit r15-1111-ge22b7f741ab54ff3a3f8a676ce9e7414fe174958
Author: Roger Sayle 
Date:   Sat Jun 8 05:01:38 2024 +0100

analyzer: Restore g++ 4.8 bootstrap; use std::move to return std::unique_ptr.

This patch restores bootstrap when using g++ 4.8 as a host compiler.
Returning a std::unique_ptr requires a std::move on C++ compilers
(pre-C++17) that don't guarantee copy elision/return value optimization.
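
A minimal illustration of the host-compiler issue (a sketch with made-up
class names, not code from the patch):

#include <memory>

struct widget { virtual ~widget () {} };
struct tree_widget : widget {};

std::unique_ptr<widget> make_dump_widget ()
{
  std::unique_ptr<tree_widget> w (new tree_widget);
  /* g++ 4.8 only treats "return w;" as a move when the source and
     return types match exactly; the conversion from
     unique_ptr<tree_widget> to unique_ptr<widget> needs std::move.  */
  return std::move (w);
}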

2024-06-08  Roger Sayle  

gcc/analyzer/ChangeLog
* constraint-manager.cc (equiv_class::make_dump_widget): Use
std::move to return a std::unique_ptr.
(bounded_ranges_constraint::make_dump_widget): Likewise.
(constraint_manager::make_dump_widget): Likewise.
* program-state.cc (sm_state_map::make_dump_widget): Likewise.
(program_state::make_dump_widget): Likewise.
* region-model.cc (region_to_value_map::make_dump_widget): Likewise.
(region_model::make_dump_widget): Likewise.
* region.cc (region::make_dump_widget): Likewise.
* store.cc (binding_cluster::make_dump_widget): Likewise.
(store::make_dump_widget): Likewise.
* svalue.cc (svalue::make_dump_widget): Likewise.

Diff:
---
 gcc/analyzer/constraint-manager.cc | 6 +++---
 gcc/analyzer/program-state.cc  | 4 ++--
 gcc/analyzer/region-model.cc   | 4 ++--
 gcc/analyzer/region.cc | 2 +-
 gcc/analyzer/store.cc  | 4 ++--
 gcc/analyzer/svalue.cc | 2 +-
 6 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/gcc/analyzer/constraint-manager.cc b/gcc/analyzer/constraint-manager.cc
index 707385d3fa6..883f33b2cdd 100644
--- a/gcc/analyzer/constraint-manager.cc
+++ b/gcc/analyzer/constraint-manager.cc
@@ -1176,7 +1176,7 @@ equiv_class::make_dump_widget (const text_art::dump_widget_info &dwi,
   ec_widget->add_child (tree_widget::make (dwi, &pp));
 }
 
-  return ec_widget;
+  return std::move (ec_widget);
 }
 
 /* Generate a hash value for this equiv_class.
@@ -1500,7 +1500,7 @@ make_dump_widget (const text_art::dump_widget_info &dwi) const
 (tree_widget::from_fmt (dwi, nullptr,
"ec%i bounded ranges", m_ec_id.as_int ()));
   m_ranges->add_to_dump_widget (*brc_widget.get (), dwi);
-  return brc_widget;
+  return std::move (brc_widget);
 }
 
 bool
@@ -1853,7 +1853,7 @@ constraint_manager::make_dump_widget (const text_art::dump_widget_info &dwi) const
   if (cm_widget->get_num_children () == 0)
 return nullptr;
 
-  return cm_widget;
+  return std::move (cm_widget);
 }
 
 /* Attempt to add the constraint LHS OP RHS to this constraint_manager.
diff --git a/gcc/analyzer/program-state.cc b/gcc/analyzer/program-state.cc
index dc2d4bdf7b0..efaf569a490 100644
--- a/gcc/analyzer/program-state.cc
+++ b/gcc/analyzer/program-state.cc
@@ -382,7 +382,7 @@ sm_state_map::make_dump_widget (const text_art::dump_widget_info &dwi,
   state_widget->add_child (tree_widget::make (dwi, pp));
 }
 
-  return state_widget;
+  return std::move (state_widget);
 }
 
 /* Return true if no states have been set within this map
@@ -1247,7 +1247,7 @@ program_state::make_dump_widget (const text_art::dump_widget_info &dwi) const
state_widget->add_child (smap->make_dump_widget (dwi, m_region_model));
   }
 
-  return state_widget;
+  return std::move (state_widget);
 }
 
 /* Update this program_state to reflect a top-level call to FUN.
diff --git a/gcc/analyzer/region-model.cc b/gcc/analyzer/region-model.cc
index a25181f2a3e..1a44ff073bd 100644
--- a/gcc/analyzer/region-model.cc
+++ b/gcc/analyzer/region-model.cc
@@ -288,7 +288,7 @@ make_dump_widget (const text_art::dump_widget_info &dwi) const
   sval->dump_to_pp (pp, true);
   w->add_child (text_art::tree_widget::make (dwi, pp));
 }
-  return w;
+  return std::move (w);
 }
 
 /* Attempt to merge THIS with OTHER, writing the result
@@ -556,7 +556,7 @@ region_model::make_dump_widget (const text_art::dump_widget_info &dwi) const
   m_mgr->get_store_manager ()));
   model_widget->add_child (m_constraints->make_dump_widget (dwi));
   model_widget->add_child (m_dynamic_extents.make_dump_widget (dwi));
-  return model_widget;
+  return std::move (model_widget);
 }
 
 /* Assert that this object is valid.  */
diff --git a/gcc/analyzer/region.cc b/gcc/analyzer/region.cc
index 1fc42f2cd97..d5cfd476fd8 100644
--- a/gcc/analyzer/region.cc
+++ b/gcc/analyzer/region.cc
@@ -1101,7 +1101,7 @@ region::make_dump_widget (const text_art::dump_widget_info &dwi,
   if (m_parent)
 w->add_child (m_parent->make_dump_widget (dwi, "parent"));
 
-  return w;
+  return std::move (w);
 }
 
 void
diff --git a/gcc/analyzer/store.cc b/gcc/analyzer/store.cc
index d5c1a9f6aff..5a33d740ce2 100644
--- a/gcc/analyzer/store.cc
+++ b/gcc/analyzer/store.cc
@@ -1489,7 +1489,7 @@ binding_cluster::mak

[gcc r15-1175] i386: PR target/115397: AVX512 ternlog vs. -m32 -fPIC constant pool.

2024-06-11 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:a797398cfbc75899fdb7d97436c0c89c02b133c0

commit r15-1175-ga797398cfbc75899fdb7d97436c0c89c02b133c0
Author: Roger Sayle 
Date:   Tue Jun 11 09:31:34 2024 +0100

i386: PR target/115397: AVX512 ternlog vs. -m32 -fPIC constant pool.

This patch fixes PR target/115397, a recent regression caused by my
ternlog patch that results in an ICE (building numpy) with -m32 -fPIC.
The problem is that ix86_broadcast_from_constant, which calls
get_pool_constant, doesn't handle the UNSPEC_GOTOFF that's created by
calling validize_mem when using -fPIC on i686.  The logic here is a bit
convoluted (and my future patches will clean some of this up), but the
simplest fix is to call ix86_broadcast_from_constant between the calls
to force_const_mem and the call to validize_mem.

Perhaps a better solution might be to call targetm.delegitimize_address
from the middle-end's get_pool_constant, but ultimately the best approach
would be to not place things in the constant pool if we don't need to.
My plans to move (broadcast) constant handling from expand to split1
should simplify this.

2024-06-11  Roger Sayle  

gcc/ChangeLog
PR target/115397
* config/i386/i386-expand.cc (ix86_expand_ternlog): Move call to
ix86_broadcast_from_constant before call to validize_mem, but after
call to force_const_mem.

gcc/testsuite/ChangeLog
PR target/115397
* gcc.target/i386/pr115397.c: New test case.

Diff:
---
 gcc/config/i386/i386-expand.cc   |  3 ++-
 gcc/testsuite/gcc.target/i386/pr115397.c | 17 +
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 9b60264dce2..312329e550b 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -26041,8 +26041,9 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx 
op1, rtx op2, int idx,
   tmp2 = ix86_gen_bcst_mem (mode, op2);
   if (!tmp2)
{
- tmp2 = validize_mem (force_const_mem (mode, op2));
+ tmp2 = force_const_mem (mode, op2);
  rtx bcast = ix86_broadcast_from_constant (mode, tmp2);
+ tmp2 = validize_mem (tmp2);
  if (bcast)
{
  rtx reg2 = gen_reg_rtx (mode);
diff --git a/gcc/testsuite/gcc.target/i386/pr115397.c 
b/gcc/testsuite/gcc.target/i386/pr115397.c
new file mode 100644
index 00000000000..27835782b78
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115397.c
@@ -0,0 +1,17 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-fPIC -mavx512f -O3" } */
+
+int LONG_divide_AVX512F_dimensions_0;
+void npy_set_floatstatus_overflow();
+void LONG_divide_AVX512F() {
+  long *src;
+  int raise_err = 0;
+  for (; LONG_divide_AVX512F_dimensions_0;
+   --LONG_divide_AVX512F_dimensions_0, ++src) {
+long a = *src;
+if (a)
+  raise_err = 1;
+  }
+  if (raise_err)
+npy_set_floatstatus_overflow();
+}


[gcc r15-1306] i386: More use of m{32, 64}bcst addressing modes with ternlog.

2024-06-13 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:c129a34dc8e69f7b34cf72835aeba2cefbb8673a

commit r15-1306-gc129a34dc8e69f7b34cf72835aeba2cefbb8673a
Author: Roger Sayle 
Date:   Fri Jun 14 06:29:27 2024 +0100

i386: More use of m{32,64}bcst addressing modes with ternlog.

This patch makes more use of m32bcst and m64bcst addressing modes in
ix86_expand_ternlog.  Previously, the i386 backend would only consider
using a m32bcst if the inner mode of the vector was 32-bits, or using
m64bcst if the inner mode was 64-bits.  For ternlog (and other logic
operations) this is a strange restriction, as how the same constant
is materialized is dependent upon the mode it is used/operated on.
Hence, the V16QI constant {2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2} wouldn't
use m??bcst, but (V4SI){0x02020202,0x02020202,0x02020202,0x02020202}
which has the same bit pattern would.  This can be optimized by (re)checking
whether a CONST_VECTOR can be broadcast from memory after casting it
to VxSI (or for m64bcst to VxDI) where x has the appropriate vector size.
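
As a source-level illustration (a hypothetical snippet, not taken from
the commit), the two globals below share the same 128-bit pattern, and
after this patch both can be materialized via a 32-bit broadcast:

typedef char v16qi __attribute__ ((__vector_size__ (16)));
typedef int v4si __attribute__ ((__vector_size__ (16)));

v16qi a = { 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2 };
v4si b = { 0x02020202, 0x02020202, 0x02020202, 0x02020202 };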

Taking the test case from pr115407:

__attribute__((__vector_size__(64))) char v;
void foo() {
  v = v | v << 7;
}

Compiled with -O2 -mcmodel=large -mavx512bw
GCC 14 generates a 64-byte (512-bit) load from the constant pool:

foo:    movabsq $v, %rax                        // 10
        movabsq $.LC0, %rdx                     // 10
        vpsllw  $7, (%rax), %zmm1               // 7
        vmovdqa64       (%rax), %zmm0           // 6
        vpternlogd      $248, (%rdx), %zmm1, %zmm0      // 7
        vmovdqa64       %zmm0, (%rax)           // 6
        vzeroupper                              // 3
        ret                                     // 1
.LC0:   .byte   -128                            // 64 = 114 bytes
        .byte   -128
        ;; repeated another 62 times

mainline currently generates two instructions, using interunit broadcast:

foo:    movabsq $v, %rdx                        // 10
        movl    $-2139062144, %eax              // 5
        vmovdqa64       (%rdx), %zmm2           // 6
        vpbroadcastd    %eax, %zmm0             // 6
        vpsllw  $7, %zmm2, %zmm1                // 7
        vpternlogd      $236, %zmm0, %zmm2, %zmm1       // 7
        vmovdqa64       %zmm1, (%rdx)           // 6
        vzeroupper                              // 3
        ret                                     // 1 = 51 bytes

With this patch, we now generate a broadcast addressing mode:

foo:    movabsq $v, %rax                        // 10
        movabsq $.LC1, %rdx                     // 10
        vmovdqa64       (%rax), %zmm1           // 6
        vpsllw  $7, %zmm1, %zmm0                // 7
        vpternlogd      $236, (%rdx){1to16}, %zmm1, %zmm0       // 7
        vmovdqa64       %zmm0, (%rax)           // 6
        vzeroupper                              // 3
        ret                                     // 1 = 50 total

Without -mcmodel=large, the benefit is two instructions:

foo:    vmovdqa64       v(%rip), %zmm1                          // 10
        vpsllw  $7, %zmm1, %zmm0                                // 7
        vpternlogd      $236, .LC2(%rip){1to16}, %zmm1, %zmm0   // 11
        vmovdqa64       %zmm0, v(%rip)                          // 10
        vzeroupper                                              // 3
        ret                                                     // 1 = 42 total

2024-06-14  Roger Sayle  

gcc/ChangeLog
* config/i386/i386-expand.cc (ix86_expand_ternlog): Try performing
logic operation in a different vector mode if that enables use of
a 32-bit or 64-bit broadcast addressing mode.

gcc/testsuite/ChangeLog
* gcc.target/i386/pr115407.c: New test case.

Diff:
---
 gcc/config/i386/i386-expand.cc   | 63 
 gcc/testsuite/gcc.target/i386/pr115407.c |  9 +
 2 files changed, 72 insertions(+)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 312329e550b6..a4379b863170 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -26041,6 +26041,69 @@ ix86_expand_ternlog (machine_mode mode, rtx op0, rtx 
op1, rtx op2, int idx,
   tmp2 = ix86_gen_bcst_mem (mode, op2);
   if (!tmp2)
{
+ machine_mode bcst32_mode = mode;
+ machine_mode bcst64_mode = mode;
+ switch (mode)
+   {
+   case V1TImode:
+   case V4SImode:
+   case V4SFmode:
+   case V8HImode:
+   case V

[gcc r15-1502] i386: Allow all register_operand SUBREGs in x86_ternlog_idx.

2024-06-20 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:9a76db24e044c8058497051a652cca4228cbc8e9

commit r15-1502-g9a76db24e044c8058497051a652cca4228cbc8e9
Author: Roger Sayle 
Date:   Thu Jun 20 16:30:15 2024 +0100

i386: Allow all register_operand SUBREGs in x86_ternlog_idx.

This patch tweaks ix86_ternlog_idx to allow any SUBREG that matches
the register_operand predicate, and is split out as an independent
piece of a patch that I have to clean up redundant ternlog patterns
in sse.md.  It turns out that some of these patterns aren't (yet)
sufficiently redundant to be obsolete.  The problem is that the
"new" ternlog pattern has the restriction that it allows SUBREGs,
but only those where the inner and outer modes are the same size,
whereas regular patterns use "register_operand", which allows arbitrary
SUBREGs, including paradoxical ones.

A motivating example is f2 in gcc.target/i386/avx512dq-abs-copysign-1.c

void f2 (float x, float y)
{
  register float a __asm ("xmm16"), b __asm ("xmm17");
  a = x;
  b = y;
  asm volatile ("" : "+v" (a), "+v" (b));
  a = __builtin_copysignf (a, b);
  asm volatile ("" : "+v" (a));
}

for which combine tries:

(set (subreg:V4SF (reg:SF 100 [ _3 ]) 0)
(ior:V4SF (and:V4SF (not:V4SF (reg:V4SF 104))
(subreg:V4SF (reg:SF 110) 0))
(reg:V4SF 106)))

where the SUBREG is paradoxical, with inner mode SF and outer mode V4SF.
This patch allows the recently added ternlog_operand to accept this case.

2024-06-20  Roger Sayle  

gcc/ChangeLog
* config/i386/i386-expand.cc (ix86_ternlog_idx): Allow any SUBREG
that matches register_operand.  Use rtx_equal_p to compare REG
or SUBREG "leaf" operands.

Diff:
---
 gcc/config/i386/i386-expand.cc | 17 -
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 5c29ee1353f7..ac423000ce67 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -25576,27 +25576,32 @@ ix86_ternlog_idx (rtx op, rtx *args)
 
   switch (GET_CODE (op))
 {
+case SUBREG:
+  if (!register_operand (op, GET_MODE (op)))
+   return -1;
+  /* FALLTHRU */
+
 case REG:
   if (!args[0])
{
  args[0] = op;
  return 0xf0;
}
-  if (REGNO (op) == REGNO (args[0]))
+  if (rtx_equal_p (op, args[0]))
return 0xf0;
   if (!args[1])
{
  args[1] = op;
  return 0xcc;
}
-  if (REGNO (op) == REGNO (args[1]))
+  if (rtx_equal_p (op, args[1]))
return 0xcc;
   if (!args[2])
{
  args[2] = op;
  return 0xaa;
}
-  if (REG_P (args[2]) && REGNO (op) == REGNO (args[2]))
+  if (rtx_equal_p (op, args[2]))
return 0xaa;
   return -1;
 
@@ -25634,12 +25639,6 @@ ix86_ternlog_idx (rtx op, rtx *args)
return 0x55;
   return -1;
 
-case SUBREG:
-  if (GET_MODE_SIZE (GET_MODE (SUBREG_REG (op)))
- != GET_MODE_SIZE (GET_MODE (op)))
-   return -1;
-  return ix86_ternlog_idx (SUBREG_REG (op), args);
-
 case NOT:
   idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
   return (idx0 >= 0) ? idx0 ^ 0xff : -1;


[gcc r15-1584] PR tree-optimization/113673: Avoid load merging when potentially trapping.

2024-06-24 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:d8b05aef77443e1d3d8f3f5d2c56ac49a503fee3

commit r15-1584-gd8b05aef77443e1d3d8f3f5d2c56ac49a503fee3
Author: Roger Sayle 
Date:   Mon Jun 24 15:34:03 2024 +0100

PR tree-optimization/113673: Avoid load merging when potentially trapping.

This patch fixes PR tree-optimization/113673, a P2 ice-on-valid regression
caused by load merging of (ptr[0]<<8)+ptr[1] when -ftrapv has been
specified.  When the operator is | or ^ this is safe, but for addition
of signed integer types, a trap may be generated/required, so merging this
idiom into a single non-trapping instruction is inappropriate, confusing
the compiler by transforming a basic block with an exception edge into one
without.
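
As an illustrative sketch (not the committed testcase), both functions
below read a big-endian 16-bit value, but with -ftrapv and
-fnon-call-exceptions the signed addition is treated as potentially
trapping, so only the IOR form remains a load-merging candidate:

int be16_ior (unsigned char *p) { return (p[0] << 8) | p[1]; }
int be16_add (unsigned char *p) { return (p[0] << 8) + p[1]; }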

This revision implements Richard Biener's feedback to add an early check
for stmt_can_throw_internal (cfun, stmt) to prevent transforming in the
presence of any statement that could trap, not just overflow on addition.
The one other tweak included in this patch is to mark the local function
find_bswap_or_nop_load as static ensuring that it isn't called from outside
this file, and guaranteeing that it is dominated by stmt_can_throw_internal
checking.

2024-06-24  Roger Sayle  
Richard Biener  

gcc/ChangeLog
PR tree-optimization/113673
* gimple-ssa-store-merging.cc (find_bswap_or_nop_load): Make static.
(find_bswap_or_nop_1): Avoid transformations (load merging) when
stmt_can_throw_internal indicates that a statement can trap.

gcc/testsuite/ChangeLog
PR tree-optimization/113673
* g++.dg/pr113673.C: New test case.

Diff:
---
 gcc/gimple-ssa-store-merging.cc |  6 --
 gcc/testsuite/g++.dg/pr113673.C | 14 ++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/gcc/gimple-ssa-store-merging.cc b/gcc/gimple-ssa-store-merging.cc
index cb0cb5f42f6..7dba4a7a781 100644
--- a/gcc/gimple-ssa-store-merging.cc
+++ b/gcc/gimple-ssa-store-merging.cc
@@ -363,7 +363,7 @@ init_symbolic_number (struct symbolic_number *n, tree src)
the answer. If so, REF is that memory source and the base of the memory area
accessed and the offset of the access from that base are recorded in N.  */
 
-bool
+static bool
 find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n)
 {
   /* Leaf node is an array or component ref. Memorize its base and
@@ -610,7 +610,9 @@ find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number 
*n, int limit)
   gimple *rhs1_stmt, *rhs2_stmt, *source_stmt1;
   enum gimple_rhs_class rhs_class;
 
-  if (!limit || !is_gimple_assign (stmt))
+  if (!limit
+  || !is_gimple_assign (stmt)
+  || stmt_can_throw_internal (cfun, stmt))
 return NULL;
 
   rhs1 = gimple_assign_rhs1 (stmt);
diff --git a/gcc/testsuite/g++.dg/pr113673.C b/gcc/testsuite/g++.dg/pr113673.C
new file mode 100644
index 00000000000..11489777f5b
--- /dev/null
+++ b/gcc/testsuite/g++.dg/pr113673.C
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-Os -fnon-call-exceptions -ftrapv" } */
+
+struct s { ~s(); };
+void
+h (unsigned char *data, int c)
+{
+  s a1;
+  while (c)
+{
+  int m = *data++ << 8;
+  m += *data++;
+}
+}


[gcc r15-2758] i386: Refactor V2DI arithmetic right shift expansion for STV.

2024-08-06 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:2f759fa9f4dd78ae8d86482ccda72a335aaac404

commit r15-2758-g2f759fa9f4dd78ae8d86482ccda72a335aaac404
Author: Roger Sayle 
Date:   Tue Aug 6 17:19:29 2024 +0100

i386: Refactor V2DI arithmetic right shift expansion for STV.

This patch refactors ashrv2di RTL expansion into a function so that it may
be reused by a pre-reload splitter, such that DImode right shifts may be
considered candidates during the Scalar-To-Vector (STV) pass.  Currently
DImode arithmetic right shifts are not considered potential candidates
during STV, so for the following testcase:

long long m;
typedef long long v2di __attribute__((vector_size (16)));
void foo(v2di x) { m = x[0]>>63; }

We currently see the following warning/error during STV2
>  r101 use in insn 7 isn't convertible

And end up generating scalar code with an interunit move:

foo:    movq    %xmm0, %rax
        sarq    $63, %rax
        movq    %rax, m(%rip)
        ret

With this patch, we can reuse the RTL expansion logic and produce:

foo:    psrad   $31, %xmm0
        pshufd  $245, %xmm0, %xmm0
        movq    %xmm0, m(%rip)
        ret

Or with the addition of -mavx2, the equivalent:

foo:    vpxor   %xmm1, %xmm1, %xmm1
        vpcmpgtq        %xmm0, %xmm1, %xmm0
        vmovq   %xmm0, m(%rip)
        ret

The only design decision of note is the choice to continue lowering V2DI
into vector sequences during RTL expansion, to enable combine to optimize
things if possible.  Using just define_insn_and_split potentially misses
optimizations, such as reusing the zero vector produced by vpxor above.
It may be necessary to tweak STV's compute gain at some point, but this
patch controls what's possible (rather than what's beneficial).

2024-08-06  Roger Sayle  

gcc/ChangeLog
* config/i386/i386-expand.cc (ix86_expand_v2di_ashiftrt): New
function refactored from define_expand ashrv2di3.
* config/i386/i386-features.cc 
(general_scalar_to_vector_candidate_p)
<case ASHIFTRT>: Handle like other shifts and rotates.
* config/i386/i386-protos.h (ix86_expand_v2di_ashiftrt): Prototype.
* config/i386/sse.md (ashrv2di3): Call ix86_expand_v2di_ashiftrt.
(*ashrv2di3): New define_insn_and_split to enable creation by stv2
pass, and splitting during split1 reusing ix86_expand_v2di_ashiftrt.

gcc/testsuite/ChangeLog
* gcc.target/i386/sse2-stv-2.c: New test case.

Diff:
---
 gcc/config/i386/i386-expand.cc | 156 
 gcc/config/i386/i386-features.cc   |   6 +-
 gcc/config/i386/i386-protos.h  |   1 +
 gcc/config/i386/sse.md | 159 +++--
 gcc/testsuite/gcc.target/i386/sse2-stv-2.c |  10 ++
 5 files changed, 180 insertions(+), 152 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index d9ad06264aaf..bdbc14232679 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -7471,6 +7471,162 @@ ix86_expand_v1ti_ashiftrt (rtx operands[])
 }
 }
 
+/* Expand V2DI mode ashiftrt.  */
+void
+ix86_expand_v2di_ashiftrt (rtx operands[])
+{
+  if (operands[2] == const0_rtx)
+{
+  emit_move_insn (operands[0], operands[1]);
+  return;
+}
+
+  if (TARGET_SSE4_2
+  && CONST_INT_P (operands[2])
+  && UINTVAL (operands[2]) >= 63
+  && !optimize_insn_for_size_p ())
+{
+  rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
+  emit_insn (gen_sse4_2_gtv2di3 (operands[0], zero, operands[1]));
+  return;
+}
+
+  if (CONST_INT_P (operands[2])
+  && (!TARGET_XOP || UINTVAL (operands[2]) >= 63))
+{
+  vec_perm_builder sel (4, 4, 1);
+  sel.quick_grow (4);
+  rtx arg0, arg1;
+  rtx op1 = lowpart_subreg (V4SImode,
+   force_reg (V2DImode, operands[1]),
+   V2DImode);
+  rtx target = gen_reg_rtx (V4SImode);
+  if (UINTVAL (operands[2]) >= 63)
+   {
+ arg0 = arg1 = gen_reg_rtx (V4SImode);
+ emit_insn (gen_ashrv4si3 (arg0, op1, GEN_INT (31)));
+ sel[0] = 1;
+ sel[1] = 1;
+ sel[2] = 3;
+ sel[3] = 3;
+   }
+  else if (INTVAL (operands[2]) > 32)
+   {
+ arg0 = gen_reg_rtx (V4SImode);
+ arg1 = gen_reg_rtx (V4SImode);
+ emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
+ emit_insn (gen_ashrv4si3 (arg0, op1,
+   GEN_INT (INTVAL (operands[2]) - 32)));
+ sel[0] = 1;
+ sel[1] = 5;
+ sel[2] = 3;
+ sel[3] = 7;
+   }
+  else if (INTVAL (operands[2]) == 32)
+   {
+ arg0 = op1;
+ arg1 = gen_reg_rtx (V4SImode);
+ emit_insn (gen_ashrv4si3 (arg1, 

[gcc r15-2793] testsuite: Fix recent regression of g++.dg/other/sse2-pr85572-1.C

2024-08-07 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:990a65fb1aa5d1b05a7737df879afb6900e2ce96

commit r15-2793-g990a65fb1aa5d1b05a7737df879afb6900e2ce96
Author: Roger Sayle 
Date:   Wed Aug 7 12:52:26 2024 +0100

testsuite: Fix recent regression of g++.dg/other/sse2-pr85572-1.C

My sincere apologies for not noticing that g++.dg/other/sse2-pr85572-1.C
was FAILing with my recent ashrv2di patch.  I'm not sure how that happened.
Many thanks to Andrew Pinski for alerting me, and confirming that the
changes are harmless/beneficial.  Sorry again for the inconvenience.

2024-08-07  Roger Sayle  

gcc/testsuite/ChangeLog
* g++.dg/other/sse2-pr85572-1.C: Update expected output after
my recent patch for ashrv2di3.  Now with one less instruction.

Diff:
---
 gcc/testsuite/g++.dg/other/sse2-pr85572-1.C | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/g++.dg/other/sse2-pr85572-1.C 
b/gcc/testsuite/g++.dg/other/sse2-pr85572-1.C
index e4c442394243..46edc065c33c 100644
--- a/gcc/testsuite/g++.dg/other/sse2-pr85572-1.C
+++ b/gcc/testsuite/g++.dg/other/sse2-pr85572-1.C
@@ -1,9 +1,10 @@
 // PR target/85572
 // { dg-do compile { target i?86-*-* x86_64-*-* } }
 // { dg-options "-O2 -msse2 -mno-sse3" }
-// { dg-final { scan-assembler-times {\mpxor\M} 2 } }
-// { dg-final { scan-assembler-times {\mpsubq\M} 2 } }
-// { dg-final { scan-assembler-times {\mpsrlq\M} 1 } }
+// { dg-final { scan-assembler-times {\mpsrad\M} 1 } }
+// { dg-final { scan-assembler-times {\mpshufd\M} 1 } }
+// { dg-final { scan-assembler-times {\mpxor\M} 1 } }
+// { dg-final { scan-assembler-times {\mpsubq\M} 1 } }
 
 typedef long long V __attribute__((vector_size (16)));


[gcc r15-2816] i386: Tweak ix86_mode_can_transfer_bits to restore bootstrap on RHEL.

2024-08-08 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:4d44f3fc387815eb232d7757352857993a1d21d9

commit r15-2816-g4d44f3fc387815eb232d7757352857993a1d21d9
Author: Roger Sayle 
Date:   Thu Aug 8 11:16:29 2024 +0100

i386: Tweak ix86_mode_can_transfer_bits to restore bootstrap on RHEL.

This minor patch, very similar to one posted and approved previously at
https://gcc.gnu.org/pipermail/gcc-patches/2024-July/657229.html is
required to restore builds on systems using gcc 4.8 as a host compiler.
Using the enumeration constants E_SFmode and E_DFmode avoids issues with
SFmode and DFmode being "non-literal types in constant expressions".

2024-08-08  Roger Sayle  

gcc/ChangeLog
* config/i386/i386.cc (ix86_mode_can_transfer_bits): Use E_?Fmode
enumeration constants in switch statement.

Diff:
---
 gcc/config/i386/i386.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 8f289b5bc228..02e282904410 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -26113,8 +26113,8 @@ ix86_mode_can_transfer_bits (machine_mode mode)
   || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
 switch (GET_MODE_INNER (mode))
   {
-  case SFmode:
-  case DFmode:
+  case E_SFmode:
+  case E_DFmode:
/* These suffer from normalization upon load when not using SSE.  */
return !(ix86_fpmath & FPMATH_387);
   default:


[gcc r15-2880] PR target/116275: Handle STV of *extenddi2_doubleword_highpart on i386.

2024-08-11 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:7a970bd03f1d8eed7703db8a8db3c753ea68899f

commit r15-2880-g7a970bd03f1d8eed7703db8a8db3c753ea68899f
Author: Roger Sayle 
Date:   Mon Aug 12 06:52:48 2024 +0100

PR target/116275: Handle STV of *extenddi2_doubleword_highpart on i386.

This patch resolves PR target/116275, a recent ICE-on-valid regression on
-m32 caused by my recent change to enable STV of DImode arithmetic right
shift on non-AVX512VL targets.  The oversight is that the i386 backend
contains an *extenddi2_doubleword_highpart instruction (whose pattern
is an arithmetic right shift of a left shift) that optimizes the case where
sign-extension need only update the highpart word of a DImode value when
generating 32-bit code (!TARGET_64BIT).  STV accepts this pattern as a
candidate, as there are patterns to handle this form of extension on SSE
using AVX512VL instructions (and previously ASHIFTRT was only allowed on
AVX512VL).  Now that ASHIFTRT is a candidate on non-AVX512VL targets, we
either need to check that the first operand is a register, or, as done
below, add a define_insn_and_split that provides a non-AVX512VL
implementation of *extendv2di_highpart_stv.
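
As an illustration (a hypothetical snippet, not the committed testcase),
this is the highpart sign-extension idiom that the pattern matches when
compiling for 32-bit targets:

long long ext56 (long long x)
{
  /* Sign-extend from bit 55: only the high 32-bit word changes.  */
  return (x << 8) >> 8;
}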

The new testcase only ICEed with -m32, so this test could be limited to
target ia32, but there's no harm also running this test on -m64 to
provide a little extra test coverage.

2024-08-12  Roger Sayle  

gcc/ChangeLog
PR target/116275
* config/i386/i386.md (*extendv2di2_highpart_stv_noavx512vl): New
define_insn_and_split to handle the STV conversion of the DImode
pattern *extendsi2_doubleword_highpart.

gcc/testsuite/ChangeLog
PR target/116275
* g++.target/i386/pr116275.C: New test case.

Diff:
---
 gcc/config/i386/i386.md  | 18 ++
 gcc/testsuite/g++.target/i386/pr116275.C | 15 +++
 2 files changed, 33 insertions(+)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index db7789c17d2a..1a6188f5161b 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -17393,6 +17393,24 @@
(ashift:V2DI (match_dup 1) (match_dup 2)))
(set (match_dup 0)
(ashiftrt:V2DI (match_dup 0) (match_dup 2)))])
+
+;; Without AVX512VL, split this instruction before reload.
+(define_insn_and_split "*extendv2di2_highpart_stv_noavx512vl"
+  [(set (match_operand:V2DI 0 "register_operand" "=v")
+   (ashiftrt:V2DI
+ (ashift:V2DI (match_operand:V2DI 1 "nonimmediate_operand" "vm")
+  (match_operand:QI 2 "const_int_operand"))
+ (match_operand:QI 3 "const_int_operand")))]
+  "!TARGET_AVX512VL
+   && INTVAL (operands[2]) == INTVAL (operands[3])
+   && UINTVAL (operands[2]) < 32
+   && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (ashift:V2DI (match_dup 1) (match_dup 2)))
+   (set (match_dup 0)
+   (ashiftrt:V2DI (match_dup 0) (match_dup 2)))])
 
 ;; Rotate instructions
 
diff --git a/gcc/testsuite/g++.target/i386/pr116275.C 
b/gcc/testsuite/g++.target/i386/pr116275.C
new file mode 100644
index 000000000000..69c5b5a2ef9f
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr116275.C
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -std=c++11" } */
+
+struct SymbolDesc push_back(SymbolDesc);
+struct SymbolDesc {
+  long long ELFLocalSymIdx;
+};
+struct Expected {
+  long long &operator*();
+};
+void SymbolizableObjectFileaddSymbol() {
+  Expected SymbolAddressOrErr;
+  long long SymbolAddress = *SymbolAddressOrErr << 8 >> 8;
+  push_back({SymbolAddress});
+}


[gcc r15-2940] i386: Improve split of *extendv2di2_highpart_stv_noavx512vl.

2024-08-15 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:b6fb4f7f651d2aa89548c5833fe2679af2638df5

commit r15-2940-gb6fb4f7f651d2aa89548c5833fe2679af2638df5
Author: Roger Sayle 
Date:   Thu Aug 15 22:02:05 2024 +0100

i386: Improve split of *extendv2di2_highpart_stv_noavx512vl.

This patch follows up on the previous patch to fix PR target/116275 by
improving the code STV (ultimately) generates for highpart sign extensions
like (x<<8)>>8.  The arithmetic right shift is able to take advantage of
the available common subexpressions from the preceding left shift.

Hence previously with -O2 -m32 -mavx -mno-avx512vl we'd generate:

vpsllq  $8, %xmm0, %xmm0
vpsrad  $8, %xmm0, %xmm1
vpsrlq  $8, %xmm0, %xmm0
vpblendw $51, %xmm0, %xmm1, %xmm0

But with improved splitting, we now generate three instructions:

vpslld  $8, %xmm1, %xmm0
vpsrad  $8, %xmm0, %xmm0
vpblendw $51, %xmm1, %xmm0, %xmm0

This patch also implements Uros' suggestion that the pre-reload
splitter could introduce a new pseudo to hold the intermediate
to potentially help reload with register allocation, which applies
when not performing the above optimization, i.e. on TARGET_XOP.

2024-08-15  Roger Sayle  
Uros Bizjak  

gcc/ChangeLog
* config/i386/i386.md (*extendv2di2_highpart_stv_noavx512vl): Split
to an improved implementation on !TARGET_XOP.  On TARGET_XOP, use
a new pseudo for the intermediate to simplify register allocation.

gcc/testsuite/ChangeLog
* g++.target/i386/pr116275-2.C: New test case.

Diff:
---
 gcc/config/i386/i386.md| 32 --
 gcc/testsuite/g++.target/i386/pr116275-2.C | 19 ++
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index efbab2f25ec..36108e5c2c9 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -17872,10 +17872,38 @@
&& ix86_pre_reload_split ()"
   "#"
   "&& 1"
-  [(set (match_dup 0)
+  [(set (match_dup 4)
(ashift:V2DI (match_dup 1) (match_dup 2)))
(set (match_dup 0)
-   (ashiftrt:V2DI (match_dup 0) (match_dup 2)))])
+   (ashiftrt:V2DI (match_dup 4) (match_dup 2)))]
+{
+  if (!TARGET_XOP)
+{
+  rtx op0 = operands[0];
+  rtx op2 = operands[2];
+  rtx tmp1 = gen_reg_rtx (V4SImode);
+  rtx tmp2 = gen_reg_rtx (V4SImode);
+  rtx tmp3 = gen_reg_rtx (V4SImode);
+  rtx tmp4 = gen_reg_rtx (V4SImode);
+  emit_move_insn (tmp1, lowpart_subreg (V4SImode, operands[1], V2DImode));
+  emit_insn (gen_ashlv4si3 (tmp2, tmp1, op2));
+  emit_insn (gen_ashrv4si3 (tmp3, tmp2, op2));
+  vec_perm_builder sel (4, 4, 1);
+  sel.quick_grow (4);
+  sel[0] = 0;
+  sel[1] = 5;
+  sel[2] = 2;
+  sel[3] = 7;
+  vec_perm_indices indices(sel, 2, 4);
+  bool ok = targetm.vectorize.vec_perm_const (V4SImode, V4SImode, tmp4,
+ tmp1, tmp3, indices);
+  gcc_assert (ok);
+  emit_move_insn (op0, lowpart_subreg (V2DImode, tmp4, V4SImode));
+  DONE;
+}
+  else
+operands[4] = gen_reg_rtx (V2DImode);
+})
 
 ;; Rotate instructions
 
diff --git a/gcc/testsuite/g++.target/i386/pr116275-2.C 
b/gcc/testsuite/g++.target/i386/pr116275-2.C
new file mode 100644
index 00000000000..98d3c19e59c
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr116275-2.C
@@ -0,0 +1,19 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2 -mavx -mno-avx512vl -std=c++11" } */
+
+struct SymbolDesc push_back(SymbolDesc);
+struct SymbolDesc {
+  long long ELFLocalSymIdx;
+};
+struct Expected {
+  long long &operator*();
+};
+void SymbolizableObjectFileaddSymbol() {
+  Expected SymbolAddressOrErr;
+  long long SymbolAddress = *SymbolAddressOrErr << 8 >> 8;
+  push_back({SymbolAddress});
+}
+
+/* { dg-final { scan-assembler "vpslld" } } */
+/* { dg-final { scan-assembler-not "vpsllq" } } */
+/* { dg-final { scan-assembler-not "vpsrlq" } } */


[gcc r15-222] PR target/106060: Improved SSE vector constant materialization on x86.

2024-05-06 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:79649a5dcd81bc05c0ba591068c9075de43bd417

commit r15-222-g79649a5dcd81bc05c0ba591068c9075de43bd417
Author: Roger Sayle 
Date:   Tue May 7 07:14:40 2024 +0100

PR target/106060: Improved SSE vector constant materialization on x86.

This patch resolves PR target/106060 by providing efficient methods for
materializing/synthesizing special "vector" constants on x86.  Currently
there are three methods of materializing a vector constant; the most
general is to load a vector from the constant pool, secondly "duplicated"
constants can be synthesized by moving an integer between units and
broadcasting (or shuffling) it, and finally the special cases of the
all-zeros vector and all-ones vectors can be loaded via a single SSE
instruction.  This patch handles additional cases that can be synthesized
in two instructions, loading an all-ones vector followed by another SSE
instruction.  Following my recent patch for PR target/112992, there's
conveniently a single place in i386-expand.cc where these special cases
can be handled.

Two examples are given in the original bugzilla PR for 106060.

__m256i should_be_cmpeq_abs ()
{
  return _mm256_set1_epi8 (1);
}

is now generated (with -O3 -march=x86-64-v3) as:

vpcmpeqd %ymm0, %ymm0, %ymm0
vpabsb  %ymm0, %ymm0
ret

and

__m256i should_be_cmpeq_add ()
{
  return _mm256_set1_epi8 (-2);
}

is now generated as:

vpcmpeqd %ymm0, %ymm0, %ymm0
vpaddb  %ymm0, %ymm0, %ymm0
ret

2024-05-07  Roger Sayle  
Hongtao Liu  

gcc/ChangeLog
PR target/106060
* config/i386/i386-expand.cc (enum ix86_vec_bcast_alg): New.
(struct ix86_vec_bcast_map_simode_t): New type for table below.
(ix86_vec_bcast_map_simode): Table of SImode constants that may
be efficiently synthesized by a ix86_vec_bcast_alg method.
(ix86_vec_bcast_map_simode_cmp): New comparator for bsearch.
(ix86_vector_duplicate_simode_const): Efficiently synthesize
V4SImode and V8SImode constants that duplicate special constants.
(ix86_vector_duplicate_value): Attempt to synthesize "special"
vector constants using ix86_vector_duplicate_simode_const.
* config/i386/i386.cc (ix86_rtx_costs) <case ABS>: ABS of a
vector integer mode costs with a single SSE instruction.

gcc/testsuite/ChangeLog
PR target/106060
* gcc.target/i386/auto-init-8.c: Update test case.
* gcc.target/i386/avx512fp16-13.c: Likewise.
* gcc.target/i386/pr100865-9a.c: Likewise.
* gcc.target/i386/pr101796-1.c: Likewise.
* gcc.target/i386/pr106060-1.c: New test case.
* gcc.target/i386/pr106060-2.c: Likewise.
* gcc.target/i386/pr106060-3.c: Likewise.
* gcc.target/i386/pr70314.c: Update test case.
* gcc.target/i386/vect-shiftv4qi.c: Likewise.
* gcc.target/i386/vect-shiftv8qi.c: Likewise.

Diff:
---
 gcc/config/i386/i386-expand.cc | 364 -
 gcc/config/i386/i386.cc|   2 +
 gcc/testsuite/gcc.target/i386/auto-init-8.c|   2 +-
 gcc/testsuite/gcc.target/i386/avx512fp16-13.c  |   3 -
 gcc/testsuite/gcc.target/i386/pr100865-9a.c|   2 +-
 gcc/testsuite/gcc.target/i386/pr101796-1.c |   6 +-
 gcc/testsuite/gcc.target/i386/pr106060-1.c |  12 +
 gcc/testsuite/gcc.target/i386/pr106060-2.c |  13 +
 gcc/testsuite/gcc.target/i386/pr106060-3.c |  14 +
 gcc/testsuite/gcc.target/i386/pr70314.c|   2 +-
 gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c |   2 +-
 gcc/testsuite/gcc.target/i386/vect-shiftv8qi.c |   2 +-
 12 files changed, 411 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 8bb8f21e686..a6132911e6a 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -15696,6 +15696,332 @@ s4fma_expand:
   gcc_unreachable ();
 }
 
+/* See below where shifts are handled for explanation of this enum.  */
+enum ix86_vec_bcast_alg
+{
+  VEC_BCAST_PXOR,
+  VEC_BCAST_PCMPEQ,
+  VEC_BCAST_PABSB,
+  VEC_BCAST_PADDB,
+  VEC_BCAST_PSRLW,
+  VEC_BCAST_PSRLD,
+  VEC_BCAST_PSLLW,
+  VEC_BCAST_PSLLD
+};
+
+struct ix86_vec_bcast_map_simode_t
+{
+  unsigned int key;
+  enum ix86_vec_bcast_alg alg;
+  unsigned int arg;
+};
+
+/* This table must be kept sorted as values are looked-up using bsearch.  */
+static const ix86_vec_bcast_map_simode_t ix86_vec_bcast_map_simode[] = {
+  { 0x00000000, VEC_BCAST_PXOR,    0 },
+  { 0x00000001, VEC_BCAST_PSRLD,  31 },
+  { 0x00000003, VEC_BCAST_PSRLD,  30 },
+  { 0x00000007, VEC_BCAST_PSRLD,  29 },
+  { 0x0000000f, VEC_BCAST_PSRLD,  28 },
+  { 0x0000001f, VE

[gcc r15-352] Constant fold {-1,-1} << 1 in simplify-rtx.cc

2024-05-09 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:f2449b55fb2d32fc4200667ba79847db31f6530d

commit r15-352-gf2449b55fb2d32fc4200667ba79847db31f6530d
Author: Roger Sayle 
Date:   Thu May 9 22:45:54 2024 +0100

Constant fold {-1,-1} << 1 in simplify-rtx.cc

This patch addresses a missed optimization opportunity in the RTL
optimization passes.  The function simplify_const_binary_operation
will constant fold binary operators with two CONST_INT operands,
and those with two CONST_VECTOR operands, but is missing compile-time
evaluation of binary operators with a CONST_VECTOR and a CONST_INT,
such as vector shifts and rotates.

The first version of this patch didn't contain a switch statement to
explicitly check for valid binary opcodes, which bootstrapped and
regression tested fine, but my paranoia has got the better of me,
so this version now checks that VEC_SELECT or some funky (future)
rtx_code doesn't cause problems.
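
As a source-level example (illustrative, not from the commit), the
vector shift below can now be constant folded, with
(ashift (const_vector [-1 -1]) (const_int 1)) simplifying to
(const_vector [-2 -2]) at compile time:

typedef long long v2di __attribute__ ((__vector_size__ (16)));
v2di foo (void) { return (v2di) { -1, -1 } << 1; }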

2024-05-09  Roger Sayle  

gcc/ChangeLog
* simplify-rtx.cc (simplify_const_binary_operation): Constant
fold binary operations where the LHS is CONST_VECTOR and the
RHS is CONST_INT (or CONST_DOUBLE) such as vector shifts.

Diff:
---
 gcc/simplify-rtx.cc | 54 +
 1 file changed, 54 insertions(+)

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index dceaa1ca..53f54d1d3928 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -5021,6 +5021,60 @@ simplify_const_binary_operation (enum rtx_code code, 
machine_mode mode,
   return gen_rtx_CONST_VECTOR (mode, v);
 }
 
+  if (VECTOR_MODE_P (mode)
+  && GET_CODE (op0) == CONST_VECTOR
+  && (CONST_SCALAR_INT_P (op1) || CONST_DOUBLE_AS_FLOAT_P (op1))
+  && (CONST_VECTOR_DUPLICATE_P (op0)
+ || CONST_VECTOR_NUNITS (op0).is_constant ()))
+{
+  switch (code)
+   {
+   case PLUS:
+   case MINUS:
+   case MULT:
+   case DIV:
+   case MOD:
+   case UDIV:
+   case UMOD:
+   case AND:
+   case IOR:
+   case XOR:
+   case SMIN:
+   case SMAX:
+   case UMIN:
+   case UMAX:
+   case LSHIFTRT:
+   case ASHIFTRT:
+   case ASHIFT:
+   case ROTATE:
+   case ROTATERT:
+   case SS_PLUS:
+   case US_PLUS:
+   case SS_MINUS:
+   case US_MINUS:
+   case SS_ASHIFT:
+   case US_ASHIFT:
+   case COPYSIGN:
+ break;
+   default:
+ return NULL_RTX;
+   }
+
+  unsigned int npatterns = (CONST_VECTOR_DUPLICATE_P (op0)
+   ? CONST_VECTOR_NPATTERNS (op0)
+   : CONST_VECTOR_NUNITS (op0).to_constant ());
+  rtx_vector_builder builder (mode, npatterns, 1);
+  for (unsigned i = 0; i < npatterns; i++)
+   {
+ rtx x = simplify_binary_operation (code, GET_MODE_INNER (mode),
+CONST_VECTOR_ELT (op0, i), op1);
+ if (!x || !valid_for_const_vector_p (mode, x))
+   return 0;
+ builder.quick_push (x);
+   }
+  return builder.build ();
+}
+
   if (SCALAR_FLOAT_MODE_P (mode)
   && CONST_DOUBLE_AS_FLOAT_P (op0) 
   && CONST_DOUBLE_AS_FLOAT_P (op1)


[gcc r15-366] i386: Improve V[48]QI shifts on AVX512/SSE4.1

2024-05-10 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:f5a8cdc1ef5d6aa2de60849c23658ac5298df7bb

commit r15-366-gf5a8cdc1ef5d6aa2de60849c23658ac5298df7bb
Author: Roger Sayle 
Date:   Fri May 10 20:26:40 2024 +0100

i386: Improve V[48]QI shifts on AVX512/SSE4.1

The following one line patch improves the code generated for V8QI and V4QI
shifts when AVX512BW and AVX512VL functionality is available.

For the testcase (from gcc.target/i386/vect-shiftv8qi.c):

typedef signed char v8qi __attribute__ ((__vector_size__ (8)));
v8qi foo (v8qi x) { return x >> 5; }

GCC with -O2 -march=cascadelake currently generates:

foo:    movl    $67372036, %eax
        vpsraw  $5, %xmm0, %xmm2
        vpbroadcastd    %eax, %xmm1
        movl    $117901063, %eax
        vpbroadcastd    %eax, %xmm3
        vmovdqa %xmm1, %xmm0
        vmovdqa %xmm3, -24(%rsp)
        vpternlogd      $120, -24(%rsp), %xmm2, %xmm0
        vpsubb  %xmm1, %xmm0, %xmm0
        ret

with this patch we now generate the much improved:

foo:    vpmovsxbw       %xmm0, %xmm0
        vpsraw  $5, %xmm0, %xmm0
        vpmovwb %xmm0, %xmm0
        ret

This patch also fixes the FAILs of gcc.target/i386/vect-shiftv[48]qi.c
when run with the additional -march=cascadelake flag, by splitting these
tests into two; one form testing code generation with -msse2 (and
-mno-avx512vl) as originally intended, and the other testing AVX512
code generation with an explicit -march=cascadelake.

2024-05-10  Roger Sayle  
Hongtao Liu  

gcc/ChangeLog
* config/i386/i386-expand.cc (ix86_expand_vecop_qihi_partial):
Don't attempt ix86_expand_vec_shift_qihi_constant on SSE4.1.

gcc/testsuite/ChangeLog
* gcc.target/i386/vect-shiftv4qi.c: Specify -mno-avx512vl.
* gcc.target/i386/vect-shiftv8qi.c: Likewise.
* gcc.target/i386/vect-shiftv4qi-2.c: New test case.
* gcc.target/i386/vect-shiftv8qi-2.c: Likewise.

Diff:
---
 gcc/config/i386/i386-expand.cc   |  3 ++
 gcc/testsuite/gcc.target/i386/vect-shiftv4qi-2.c | 43 
 gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c   |  2 +-
 gcc/testsuite/gcc.target/i386/vect-shiftv8qi-2.c | 43 
 gcc/testsuite/gcc.target/i386/vect-shiftv8qi.c   |  2 +-
 5 files changed, 91 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 2f27bfb484c2..1ab22fe79736 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -24283,6 +24283,9 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx 
dest, rtx op1, rtx op2)
 
   if (CONST_INT_P (op2)
   && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
+  /* With AVX512 it's cheaper to do vpmovsxbw/op/vpmovwb.
+ Even with SSE4.1 the alternative is better.  */
+  && !TARGET_SSE4_1
   && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
 {
   emit_move_insn (dest, gen_lowpart (qimode, qdest));
diff --git a/gcc/testsuite/gcc.target/i386/vect-shiftv4qi-2.c 
b/gcc/testsuite/gcc.target/i386/vect-shiftv4qi-2.c
new file mode 100644
index ..abc1a276b043
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-shiftv4qi-2.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=cascadelake" } */
+
+#define N 4
+
+typedef unsigned char __vu __attribute__ ((__vector_size__ (N)));
+typedef signed char __vi __attribute__ ((__vector_size__ (N)));
+
+__vu sll (__vu a, int n)
+{
+  return a << n;
+}
+
+__vu sll_c (__vu a)
+{
+  return a << 5;
+}
+
+/* { dg-final { scan-assembler-times "vpsllw" 2 } } */
+
+__vu srl (__vu a, int n)
+{
+  return a >> n;
+}
+
+__vu srl_c (__vu a)
+{
+  return a >> 5;
+}
+
+/* { dg-final { scan-assembler-times "vpsrlw" 2 } } */
+
+__vi sra (__vi a, int n)
+{
+  return a >> n;
+}
+
+__vi sra_c (__vi a)
+{
+  return a >> 5;
+}
+
+/* { dg-final { scan-assembler-times "vpsraw" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c 
b/gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c
index b7e45c2e8799..9b52582d01f8 100644
--- a/gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c
+++ b/gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -msse2" } */
+/* { dg-options "-O2 -msse2 -mno-avx2 -mno-avx512vl" } */
 
 #define N 4
 
diff --git a/gcc/testsuite/gcc.target/i386/vect-shiftv8qi-2.c 
b/gcc/testsuite/gcc.target/i386/vect-shiftv8qi-2.c
new file mode 100644
index 000000000000..52760f5a0607
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-shiftv8qi-2.c
@@ -0,0 +1,43 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=cascadelake" } */
+
+#define N 8
+
+typedef unsigned char __vu __attribute__ ((__vector_size__ (N)));
+typedef signed char __vi __attribute__ ((__vector_size__ (N)));
+

[gcc r15-390] arm: Use uxtb rN, rM, ror #8 to implement zero_extract on armv6.

2024-05-12 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:46077992180d6d86c86544df5e8cb943492d3b01

commit r15-390-g46077992180d6d86c86544df5e8cb943492d3b01
Author: Roger Sayle 
Date:   Sun May 12 16:27:22 2024 +0100

arm: Use uxtb rN, rM, ror #8 to implement zero_extract on armv6.

Examining the code generated for the following C snippet on a
raspberry pi:

int popcount_lut8(unsigned *buf, int n)
{
  int cnt=0;
  unsigned int i;
  do {
i = *buf;
cnt += lut[i&255];
cnt += lut[i>>8&255];
cnt += lut[i>>16&255];
cnt += lut[i>>24];
buf++;
  } while(--n);
  return cnt;
}

I was surprised to see the following instruction sequence generated by the
compiler:

  mov    r5, r2, lsr #8
  uxtb   r5, r5

This sequence can be performed by a single ARM instruction:

  uxtb   r5, r2, ror #8

The attached patch allows GCC's combine pass to take advantage of ARM's
uxtb with rotate functionality to implement the above zero_extract, and
likewise to use the sxtb with rotate to implement sign_extract.  ARM's
uxtb and sxtb can only be used with rotates of 0, 8, 16 and 24, and of
these only the 8 and 16 are useful [ror #0 is a nop, and extends with
ror #24 can be implemented using regular shifts],  so the approach here
is to add the six missing but useful instructions as 6 different
define_insn in arm.md, rather than try to be clever with new predicates.

Later ARM hardware has advanced bit field instructions, and earlier
ARM cores didn't support extend-with-rotate, so this appears to only
benefit armv6 era CPUs (e.g. the raspberry pi).

Patch posted:
https://gcc.gnu.org/legacy-ml/gcc-patches/2018-01/msg01339.html
Approved by Kyrill Tkachov:
https://gcc.gnu.org/legacy-ml/gcc-patches/2018-01/msg01881.html

2024-05-12  Roger Sayle  
Kyrill Tkachov  

* config/arm/arm.md (*arm_zeroextractsi2_8_8, 
*arm_signextractsi2_8_8,
*arm_zeroextractsi2_8_16, *arm_signextractsi2_8_16,
*arm_zeroextractsi2_16_8, *arm_signextractsi2_16_8): New.

2024-05-12  Roger Sayle  
Kyrill Tkachov  

* gcc.target/arm/extend-ror.c: New test.

Diff:
---
 gcc/config/arm/arm.md | 66 +++
 gcc/testsuite/gcc.target/arm/extend-ror.c | 38 ++
 2 files changed, 104 insertions(+)

diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 1fd00146ca9e..f47e036a8034 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -12647,6 +12647,72 @@
 ""
 )
 
+;; Implement zero_extract using uxtb/uxth instruction with 
+;; the ror #N qualifier when applicable.
+
+(define_insn "*arm_zeroextractsi2_8_8"
+  [(set (match_operand:SI 0 "s_register_operand" "=r")
+   (zero_extract:SI (match_operand:SI 1 "s_register_operand" "r")
+(const_int 8) (const_int 8)))]
+  "TARGET_ARM && arm_arch6"
+  "uxtb%?\\t%0, %1, ror #8"
+  [(set_attr "predicable" "yes")
+   (set_attr "type" "extend")]
+)
+
+(define_insn "*arm_zeroextractsi2_8_16"
+  [(set (match_operand:SI 0 "s_register_operand" "=r")
+   (zero_extract:SI (match_operand:SI 1 "s_register_operand" "r")
+(const_int 8) (const_int 16)))]
+  "TARGET_ARM && arm_arch6"
+  "uxtb%?\\t%0, %1, ror #16"
+  [(set_attr "predicable" "yes")
+   (set_attr "type" "extend")]
+)
+
+(define_insn "*arm_zeroextractsi2_16_8"
+  [(set (match_operand:SI 0 "s_register_operand" "=r")
+   (zero_extract:SI (match_operand:SI 1 "s_register_operand" "r")
+(const_int 16) (const_int 8)))]
+  "TARGET_ARM && arm_arch6"
+  "uxth%?\\t%0, %1, ror #8"
+  [(set_attr "predicable" "yes")
+   (set_attr "type" "extend")]
+)
+
+;; Implement sign_extract using sxtb/sxth instruction with 
+;; the ror #N qualifier when applicable.
+
+(define_insn "*arm_signextractsi2_8_8"
+  [(set (match_operand:SI 0 "s_register_operand" "=r")
+   (sign_extract:SI (match_operand:SI 1 "s_register_operand" "r")
+(const_int 8) (const_int 8)))]
+  "TARGET_ARM && arm_arch6"
+  "sxtb%?\\t%0, %1, ror #8"
+  [(set_attr "predicable" "yes")
+   (set_attr "type" "extend")]
+)
+
+(define_insn "*arm_signextractsi2_8_16"
+  [(set (match_operand:SI 0 "s_register_operand" "=r")
+   (sign_extract:SI (match_operand:SI 1 "s_register_operand" "r")
+(const_int 8) (const_int 16)))]
+  "TARGET_ARM && arm_arch6"
+  "sxtb%?\\t%0, %1, ror #16"
+  [(set_attr "predicable" "yes")
+   (set_attr "type" "extend")]
+)
+
+(define_insn "*arm_signextractsi2_16_8"
+  [(set (match_operand:SI 0 "s_register_operand" "=r")
+   (sign_extract:SI (match_operand:SI 1 "s_register_operand" "r")
+(const_int 16) (const_int 8)))]
+  "TARGET_ARM && arm_arch6"
+  "sxth%?\\t%0, %1, ror #8"
+  [(set_attr "predicable" "yes")
+   (set_attr "type" 

[gcc r15-648] nvptx: Correct pattern for popcountdi2 insn in nvptx.md.

2024-05-19 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:1676ef6e91b902f592270e4bcf10b4fc342e200d

commit r15-648-g1676ef6e91b902f592270e4bcf10b4fc342e200d
Author: Roger Sayle 
Date:   Sun May 19 09:49:45 2024 +0100

nvptx: Correct pattern for popcountdi2 insn in nvptx.md.

The result of a POPCOUNT operation in RTL should have the same mode
as its operand.  This corrects the specification of popcount in
the nvptx backend, splitting the current generic define_insn into
two, one for popcountsi2 and the other for popcountdi2 (the latter
with an explicit truncate).
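
For example (an illustrative snippet, not from the commit), the 64-bit
builtin below produces an int result, so its DImode popcount requires
the explicit truncate:SI in RTL:

int pc64 (unsigned long long x)
{
  return __builtin_popcountll (x);   /* popc.b64 on nvptx */
}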

2024-05-19  Roger Sayle  

gcc/ChangeLog
* config/nvptx/nvptx.md (popcount<mode>2): Split into...
(popcountsi2): define_insn handling SImode popcount.
(popcountdi2): define_insn handling DImode popcount, with an
explicit truncate:SI to produce an SImode result.

Diff:
---
 gcc/config/nvptx/nvptx.md | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index 96e6c9116080..ef7e3fb00fac 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -655,11 +655,18 @@
   DONE;
 })
 
-(define_insn "popcount<mode>2"
+(define_insn "popcountsi2"
   [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
-   (popcount:SI (match_operand:SDIM 1 "nvptx_register_operand" "R")))]
+   (popcount:SI (match_operand:SI 1 "nvptx_register_operand" "R")))]
   ""
-  "%.\\tpopc.b%T1\\t%0, %1;")
+  "%.\\tpopc.b32\\t%0, %1;")
+
+(define_insn "popcountdi2"
+  [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
+   (truncate:SI
+   (popcount:DI (match_operand:DI 1 "nvptx_register_operand" "R"))))]
+  ""
+  "%.\\tpopc.b64\\t%0, %1;")
 
 ;; Multiplication variants


[gcc r15-3162] i386: Update STV's gains for TImode arithmetic right shifts on AVX2.

2024-08-25 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:07d62a1711f3e3bbdd2146ab5914d3bc5e246509

commit r15-3162-g07d62a1711f3e3bbdd2146ab5914d3bc5e246509
Author: Roger Sayle 
Date:   Sun Aug 25 09:14:34 2024 -0600

i386: Update STV's gains for TImode arithmetic right shifts on AVX2.

This patch tweaks timode_scalar_chain::compute_convert_gain to better
reflect the expansion of V1TImode arithmetic right shifts by the i386
backend.  The comment "see ix86_expand_v1ti_ashiftrt" appears after
"case ASHIFTRT" in compute_convert_gain, and the changes below attempt
to better match the logic used there.

The original motivating example is:

__int128 m1;
void foo()
{
  m1 = (m1 << 8) >> 8;
}

which with -O2 -mavx2 we fail to convert to vector form due to the
inappropriate cost of the arithmetic right shift.

  Instruction gain -16 for 7: {r103:TI=r101:TI>>0x8;clobber flags:CC;}
  Total gain: -3
  Chain #1 conversion is not profitable

This is reporting that the ASHIFTRT is four instructions worse using
vectors than in scalar form, which is incorrect as the AVX2 expansion
of this shift only requires three instructions (and the scalar form
requires two).

With more accurate costs in timode_scalar_chain::compute_convert_gain
we now see (with -O2 -mavx2):

  Instruction gain -4 for 7: {r103:TI=r101:TI>>0x8;clobber flags:CC;}
  Total gain: 9
  Converting chain #1...

which results in:

foo:    vmovdqa m1(%rip), %xmm0
        vpslldq $1, %xmm0, %xmm0
        vpsrad  $8, %xmm0, %xmm1
        vpsrldq $1, %xmm0, %xmm0
        vpblendd        $7, %xmm0, %xmm1, %xmm0
        vmovdqa %xmm0, m1(%rip)
        ret

2024-08-25  Roger Sayle  
Uros Bizjak  

gcc/ChangeLog
* config/i386/i386-features.cc (compute_convert_gain)
<case ASHIFTRT>: Update to match ix86_expand_v1ti_ashiftrt.

Diff:
---
 gcc/config/i386/i386-features.cc | 21 +
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 7e80e7b0103f..ca902ecf0de5 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -1650,23 +1650,28 @@ timode_scalar_chain::compute_convert_gain ()
  else if (op1val == 64)
vcost = COSTS_N_INSNS (3);
  else if (op1val == 96)
-   vcost = COSTS_N_INSNS (4);
- else if (op1val >= 111)
vcost = COSTS_N_INSNS (3);
- else if (TARGET_AVX2 && op1val == 32)
+ else if (op1val >= 111)
vcost = COSTS_N_INSNS (3);
  else if (TARGET_SSE4_1 && op1val == 32)
-   vcost = COSTS_N_INSNS (4);
+   vcost = COSTS_N_INSNS (3);
+ else if (TARGET_SSE4_1
+  && (op1val == 8 || op1val == 16 || op1val == 24))
+   vcost = COSTS_N_INSNS (3);
  else if (op1val >= 96)
-   vcost = COSTS_N_INSNS (5);
+   vcost = COSTS_N_INSNS (4);
+ else if (TARGET_SSE4_1 && (op1val == 28 || op1val == 80))
+   vcost = COSTS_N_INSNS (4);
  else if ((op1val & 7) == 0)
-   vcost = COSTS_N_INSNS (6);
+   vcost = COSTS_N_INSNS (5);
  else if (TARGET_AVX2 && op1val < 32)
vcost = COSTS_N_INSNS (6);
+ else if (TARGET_SSE4_1 && op1val < 15)
+   vcost = COSTS_N_INSNS (6);
  else if (op1val == 1 || op1val >= 64)
-   vcost = COSTS_N_INSNS (9);
+   vcost = COSTS_N_INSNS (8);
  else
-   vcost = COSTS_N_INSNS (10);
+   vcost = COSTS_N_INSNS (9);
}
  igain = scost - vcost;
  break;


[gcc r15-3281] i386: Support wide immediate constants in STV.

2024-08-28 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:3cb92be94e6581697369eeafdb67057c8cfba73f

commit r15-3281-g3cb92be94e6581697369eeafdb67057c8cfba73f
Author: Roger Sayle 
Date:   Wed Aug 28 21:19:28 2024 -0600

i386: Support wide immediate constants in STV.

This patch provides more accurate costs/gains for (wide) immediate
constants in STV, suitably adjusting the costs/gains when the highpart
and lowpart words are the same.
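
For instance (a hypothetical example, not the committed test), the
constant below has equal high and low 64-bit words, so when converted
to V1TImode it can be materialized with a single movabsq plus a vector
splat rather than two movabsq instructions:

__int128 m;
void foo (void)
{
  m ^= ((__int128) 0x0123456789abcdefULL << 64) | 0x0123456789abcdefULL;
}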

2024-08-28  Roger Sayle  

gcc/ChangeLog
* config/i386/i386-features.cc (timode_immed_const_gain): New
function to determine the gain/cost on a CONST_WIDE_INT.
(timode_scalar_chain::compute_convert_gain): Fix whitespace.
: Provide more accurate estimates using
timode_immed_const_gain.
: Handle CONSTANT_SCALAR_INT_P (src).

Diff:
---
 gcc/config/i386/i386-features.cc | 28 +++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index ca902ecf0de5..c09a5c73a8e3 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -1503,6 +1503,23 @@ general_scalar_chain::convert_insn (rtx_insn *insn)
   df_insn_rescan (insn);
 }
 
+/* Helper function to compute gain for loading an immediate constant.
+   Typically, two movabsq for TImode vs. vmovdqa for V1TImode, but
+   with numerous special cases.  */
+
+static int
+timode_immed_const_gain (rtx cst)
+{
+  /* movabsq vs. movabsq+vmovq+vunpacklqdq.  */
+  if (CONST_WIDE_INT_P (cst)
+  && CONST_WIDE_INT_NUNITS (cst) == 2
+  && CONST_WIDE_INT_ELT (cst, 0) == CONST_WIDE_INT_ELT (cst, 1))
+return optimize_insn_for_size_p () ? -COSTS_N_BYTES (9)
+  : -COSTS_N_INSNS (2);
+  /* 2x movabsq ~ vmovdqa.  */
+  return 0;
+}
+
 /* Compute a gain for chain conversion.  */
 
 int
@@ -1549,7 +1566,14 @@ timode_scalar_chain::compute_convert_gain ()
case CONST_INT:
  if (MEM_P (dst)
  && standard_sse_constant_p (src, V1TImode))
-   igain = optimize_insn_for_size_p() ? COSTS_N_BYTES (11) : 1;
+   igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (11) : 1;
+ break;
+
+   case CONST_WIDE_INT:
+ /* 2 x mov vs. vmovdqa.  */
+ if (MEM_P (dst))
+   igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (3)
+   : COSTS_N_INSNS (1);
  break;
 
case NOT:
@@ -1562,6 +1586,8 @@ timode_scalar_chain::compute_convert_gain ()
case IOR:
  if (!MEM_P (dst))
igain = COSTS_N_INSNS (1);
+ if (CONST_SCALAR_INT_P (XEXP (src, 1)))
+   igain += timode_immed_const_gain (XEXP (src, 1));
  break;
 
case ASHIFT:


[gcc r15-3342] i386: Support read-modify-write memory operands in STV.

2024-08-31 Thread Roger Sayle via Gcc-cvs
https://gcc.gnu.org/g:bac00c34226bac3a95979b21dc2d668a96b14f6e

commit r15-3342-gbac00c34226bac3a95979b21dc2d668a96b14f6e
Author: Roger Sayle 
Date:   Sat Aug 31 14:17:18 2024 -0600

i386: Support read-modify-write memory operands in STV.

This patch enables STV when the first operand of a TImode binary
logic operation (AND, IOR or XOR) is a memory operand, which is commonly
the case with read-modify-write instructions.

A different motivating example from the one given previously is:

__int128 m, p, q;
void foo() {
m ^= (p & q);
}

Currently with -O2 -mavx the RMW instructions are rejected by STV,
resulting in scalar code:

foo:    movq    p(%rip), %rax
        movq    p+8(%rip), %rdx
        andq    q(%rip), %rax
        andq    q+8(%rip), %rdx
        xorq    %rax, m(%rip)
        xorq    %rdx, m+8(%rip)
        ret

With this patch they become scalar-to-vector candidates:

foo:    vmovdqa p(%rip), %xmm0
        vpand   q(%rip), %xmm0, %xmm0
        vpxor   m(%rip), %xmm0, %xmm0
        vmovdqa %xmm0, m(%rip)
        ret

2024-08-31  Roger Sayle  

gcc/ChangeLog
* config/i386/i386-features.cc 
(timode_scalar_to_vector_candidate_p):
Support the first operand of AND, IOR and XOR being MEM_P, i.e. a
read-modify-write insn.

gcc/testsuite/ChangeLog
* gcc.target/i386/movti-2.c: Change dg-options to -Os.
* gcc.target/i386/movti-4.c: Expected output of original movti-2.c.

Diff:
---
 gcc/config/i386/i386-features.cc|  6 --
 gcc/testsuite/gcc.target/i386/movti-2.c |  2 +-
 gcc/testsuite/gcc.target/i386/movti-4.c | 11 +++
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index c09a5c73a8e3..3434d0069439 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -2330,14 +2330,16 @@ timode_scalar_to_vector_candidate_p (rtx_insn *insn)
  || CONST_SCALAR_INT_P (XEXP (src, 1))
  || timode_mem_p (XEXP (src, 1
return true;
-  return REG_P (XEXP (src, 0))
+  return (REG_P (XEXP (src, 0))
+ || timode_mem_p (XEXP (src, 0)))
 && (REG_P (XEXP (src, 1))
 || CONST_SCALAR_INT_P (XEXP (src, 1))
 || timode_mem_p (XEXP (src, 1)));
 
 case IOR:
 case XOR:
-  return REG_P (XEXP (src, 0))
+  return (REG_P (XEXP (src, 0))
+ || timode_mem_p (XEXP (src, 0)))
 && (REG_P (XEXP (src, 1))
 || CONST_SCALAR_INT_P (XEXP (src, 1))
 || timode_mem_p (XEXP (src, 1)));
diff --git a/gcc/testsuite/gcc.target/i386/movti-2.c 
b/gcc/testsuite/gcc.target/i386/movti-2.c
index 73f69d290cbd..c3a6ae3c51de 100644
--- a/gcc/testsuite/gcc.target/i386/movti-2.c
+++ b/gcc/testsuite/gcc.target/i386/movti-2.c
@@ -1,5 +1,5 @@
 /* { dg-do compile { target int128 } } */
-/* { dg-options "-O2 -mavx" } */
+/* { dg-options "-Os -mavx" } */
 __int128 m;
 
 void foo()
diff --git a/gcc/testsuite/gcc.target/i386/movti-4.c 
b/gcc/testsuite/gcc.target/i386/movti-4.c
new file mode 100644
index 000000000000..eac66fcbf3d1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/movti-4.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -mavx" } */
+__int128 m;
+
+void foo()
+{
+m &= ((__int128)0x0123456789abcdefULL<<64) | 0x0123456789abcdefULL;
+}
+
+/* { dg-final { scan-assembler-times "movabsq" 1 } } */
+/* { dg-final { scan-assembler-times "vpand" 1 } } */