[gcc r15-1191] Fix ICE in rtl check due to CONST_WIDE_INT in CONST_VECTOR_DUPLICATE_P

2024-06-11 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:1d496d2cd1d5d8751a1637abca89339d6f9ddd3b

commit r15-1191-g1d496d2cd1d5d8751a1637abca89339d6f9ddd3b
Author: liuhongt 
Date:   Tue Jun 11 10:23:27 2024 +0800

Fix ICE in rtl check due to CONST_WIDE_INT in CONST_VECTOR_DUPLICATE_P

The patch add extra check to make sure the component of CONST_VECTOR
is CONST_INT_P.

gcc/ChangeLog:

PR target/115384
* simplify-rtx.cc (simplify_context::simplify_binary_operation_1):
Only do the simplification of (AND (ASHIFTRT A imm) mask)
to (LSHIFTRT A imm) when the component of const_vector is
CONST_INT_P.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115384.c: New test.

Diff:
---
 gcc/simplify-rtx.cc  |  6 --
 gcc/testsuite/gcc.target/i386/pr115384.c | 12 
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 9bc3ef9ad9fd..3ee95f74d3db 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -4072,9 +4072,11 @@ simplify_context::simplify_binary_operation_1 (rtx_code 
code,
   if (VECTOR_MODE_P (mode) && GET_CODE (op0) == ASHIFTRT
  && (CONST_INT_P (XEXP (op0, 1))
  || (GET_CODE (XEXP (op0, 1)) == CONST_VECTOR
- && CONST_VECTOR_DUPLICATE_P (XEXP (op0, 1
+ && CONST_VECTOR_DUPLICATE_P (XEXP (op0, 1))
+ && CONST_INT_P (XVECEXP (XEXP (op0, 1), 0, 0
  && GET_CODE (op1) == CONST_VECTOR
- && CONST_VECTOR_DUPLICATE_P (op1))
+ && CONST_VECTOR_DUPLICATE_P (op1)
+ && CONST_INT_P (XVECEXP (op1, 0, 0)))
{
  unsigned HOST_WIDE_INT shift_count
= (CONST_INT_P (XEXP (op0, 1))
diff --git a/gcc/testsuite/gcc.target/i386/pr115384.c 
b/gcc/testsuite/gcc.target/i386/pr115384.c
new file mode 100644
index ..31dd6f4eb18a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115384.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O" } */
+
+typedef __attribute__((__vector_size__(sizeof(__int128 __int128 W;
+
+W w;
+
+void
+foo()
+{
+  w = w >> 4 & 18446744073709551600llu;
+}


[gcc r12-10497] Disable FMADD in chains for Zen4 and generic

2024-06-07 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:5d52558a531130675329d72ca5c4713abf5bf885

commit r12-10497-g5d52558a531130675329d72ca5c4713abf5bf885
Author: Jan Hubicka 
Date:   Fri Dec 29 23:51:03 2023 +0100

Disable FMADD in chains for Zen4 and generic

this patch disables use of FMA in matrix multiplication loop for generic 
(for
x86-64-v3) and zen4.  I tested this on zen4 and Xenon Gold Gold 6212U.

For Intel this is neutral both on the matrix multiplication microbenchmark
(attached) and spec2k17 where the difference was within noise for Core.

On core the micro-benchmark runs as follows:

With FMA:

   578,500,241  cycles:u #3.645 GHz
( +-  0.12% )
   753,318,477  instructions:u   #1.30  insn per
cycle  ( +-  0.00% )
   125,417,701  branches:u   #  790.227 M/sec
( +-  0.00% )
  0.159146 +- 0.000363 seconds time elapsed  ( +-  0.23% )

No FMA:

   577,573,960  cycles:u #3.514 GHz
( +-  0.15% )
   878,318,479  instructions:u   #1.52  insn per
cycle  ( +-  0.00% )
   125,417,702  branches:u   #  763.035 M/sec
( +-  0.00% )
  0.164734 +- 0.000321 seconds time elapsed  ( +-  0.19% )

So the cycle count is unchanged and discrete multiply+add takes same time as
FMA.

While on zen:

With FMA:
 484875179  cycles:u #3.599 GHz
 ( +-  0.05% )  (82.11%)
 752031517  instructions:u   #1.55  insn per
cycle
 125106525  branches:u   #  928.712 M/sec
 ( +-  0.03% )  (85.09%)
128356  branch-misses:u  #0.10% of all
branches  ( +-  0.06% )  (83.58%)

No FMA:
 375875209  cycles:u #3.592 GHz
 ( +-  0.08% )  (80.74%)
 875725341  instructions:u   #2.33  insn per
cycle
 124903825  branches:u   #1.194 G/sec
 ( +-  0.04% )  (84.59%)
  0.105203 +- 0.000188 seconds time elapsed  ( +-  0.18% )

The diffrerence is that Cores understand the fact that fmadd does not need
all three parameters to start computation, while Zen cores doesn't.

Since this seems noticeable win on zen and not loss on Core it seems like 
good
default for generic.

float a[SIZE][SIZE];
float b[SIZE][SIZE];
float c[SIZE][SIZE];

void init(void)
{
   int i, j, k;
   for(i=0; i

[gcc r13-8825] Disable FMADD in chains for Zen4 and generic

2024-06-07 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:e4f85ea6271a10e13c6874709a05e04ab0508fbf

commit r13-8825-ge4f85ea6271a10e13c6874709a05e04ab0508fbf
Author: Jan Hubicka 
Date:   Fri Dec 29 23:51:03 2023 +0100

Disable FMADD in chains for Zen4 and generic

this patch disables use of FMA in matrix multiplication loop for generic 
(for
x86-64-v3) and zen4.  I tested this on zen4 and Xenon Gold Gold 6212U.

For Intel this is neutral both on the matrix multiplication microbenchmark
(attached) and spec2k17 where the difference was within noise for Core.

On core the micro-benchmark runs as follows:

With FMA:

   578,500,241  cycles:u #3.645 GHz
( +-  0.12% )
   753,318,477  instructions:u   #1.30  insn per
cycle  ( +-  0.00% )
   125,417,701  branches:u   #  790.227 M/sec
( +-  0.00% )
  0.159146 +- 0.000363 seconds time elapsed  ( +-  0.23% )

No FMA:

   577,573,960  cycles:u #3.514 GHz
( +-  0.15% )
   878,318,479  instructions:u   #1.52  insn per
cycle  ( +-  0.00% )
   125,417,702  branches:u   #  763.035 M/sec
( +-  0.00% )
  0.164734 +- 0.000321 seconds time elapsed  ( +-  0.19% )

So the cycle count is unchanged and discrete multiply+add takes same time as
FMA.

While on zen:

With FMA:
 484875179  cycles:u #3.599 GHz
 ( +-  0.05% )  (82.11%)
 752031517  instructions:u   #1.55  insn per
cycle
 125106525  branches:u   #  928.712 M/sec
 ( +-  0.03% )  (85.09%)
128356  branch-misses:u  #0.10% of all
branches  ( +-  0.06% )  (83.58%)

No FMA:
 375875209  cycles:u #3.592 GHz
 ( +-  0.08% )  (80.74%)
 875725341  instructions:u   #2.33  insn per
cycle
 124903825  branches:u   #1.194 G/sec
 ( +-  0.04% )  (84.59%)
  0.105203 +- 0.000188 seconds time elapsed  ( +-  0.18% )

The diffrerence is that Cores understand the fact that fmadd does not need
all three parameters to start computation, while Zen cores doesn't.

Since this seems noticeable win on zen and not loss on Core it seems like 
good
default for generic.

float a[SIZE][SIZE];
float b[SIZE][SIZE];
float c[SIZE][SIZE];

void init(void)
{
   int i, j, k;
   for(i=0; i

[gcc r15-1088] Add additional option --param max-completely-peeled-insns=200 for power64*-*-*

2024-06-06 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:b24f2954dbc13d85e9fb62e05a88e9df21e4d4f4

commit r15-1088-gb24f2954dbc13d85e9fb62e05a88e9df21e4d4f4
Author: liuhongt 
Date:   Fri Jun 7 09:29:24 2024 +0800

Add additional option --param max-completely-peeled-insns=200 for 
power64*-*-*

gcc/testsuite/ChangeLog:

* gcc.dg/vect/pr112325.c:Add additional option --param
max-completely-peeled-insns=200 for power64*-*-*.

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr112325.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gcc/testsuite/gcc.dg/vect/pr112325.c 
b/gcc/testsuite/gcc.dg/vect/pr112325.c
index dea6cca3b86..143903beab2 100644
--- a/gcc/testsuite/gcc.dg/vect/pr112325.c
+++ b/gcc/testsuite/gcc.dg/vect/pr112325.c
@@ -3,6 +3,7 @@
 /* { dg-require-effective-target vect_int } */
 /* { dg-require-effective-target vect_shift } */
 /* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */
+/* { dg-additional-options "--param max-completely-peeled-insns=200" { target 
powerpc64*-*-* } } */
 
 typedef unsigned short ggml_fp16_t;
 static float table_f32_f16[1 << 16];


[gcc r15-1050] Refine testcase for power10.

2024-06-05 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:fcfce55c85f842ed843cbc4aabe744c6a004dead

commit r15-1050-gfcfce55c85f842ed843cbc4aabe744c6a004dead
Author: liuhongt 
Date:   Thu Jun 6 11:27:53 2024 +0800

Refine testcase for power10.

For power10, there're extra 3 REG_EQUIV notes with (fix:SI. to avoid
the failure. Check (fix:SI is from the pattern not NOTE.

gcc/testsuite/ChangeLog:

PR target/115365
* gcc.dg/pr100927.c: Don't scan fix:SI from the note.

Diff:
---
 gcc/testsuite/gcc.dg/pr100927.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/pr100927.c b/gcc/testsuite/gcc.dg/pr100927.c
index ea0e627befa..8a7d69c3831 100644
--- a/gcc/testsuite/gcc.dg/pr100927.c
+++ b/gcc/testsuite/gcc.dg/pr100927.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -ftrapping-math -fdump-tree-optimized -fdump-rtl-final" } 
*/
 /* { dg-final { scan-tree-dump-times {(?n)= \(int\)} 3 "optimized" } }  */
-/* { dg-final { scan-rtl-dump-times {(?n)\(fix:SI} 3 "final" } }  */
+/* { dg-final { scan-rtl-dump-times {(?n)^[ \t]*\(fix:SI} 3 "final" } }  */
 
 int
 foo_ofr ()


[gcc r15-1048] Adjust rtx_cost for MEM to enable more simplication

2024-06-05 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:961dd0d635217c703a38c48903981e0d60962546

commit r15-1048-g961dd0d635217c703a38c48903981e0d60962546
Author: liuhongt 
Date:   Fri Apr 19 10:39:53 2024 +0800

Adjust rtx_cost for MEM to enable more simplication

For CONST_VECTOR_DUPLICATE_P in constant_pool, it is just broadcast or
variants in ix86_vector_duplicate_simode_const.
Adjust the cost to COSTS_N_INSNS (2) + speed which should be a little
bit larger than broadcast.

gcc/ChangeLog:
PR target/114428
* config/i386/i386.cc (ix86_rtx_costs): Adjust cost for
CONST_VECTOR_DUPLICATE_P in constant_pool.
* config/i386/i386-expand.cc (ix86_broadcast_from_constant):
Remove static.
* config/i386/i386-protos.h (ix86_broadcast_from_constant):
Declare.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114428.c: New test.

Diff:
---
 gcc/config/i386/i386-expand.cc   |  2 +-
 gcc/config/i386/i386-protos.h|  1 +
 gcc/config/i386/i386.cc  | 13 +
 gcc/testsuite/gcc.target/i386/pr114428.c | 18 ++
 4 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 215a998fc26..56d29c15f9a 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -588,7 +588,7 @@ ix86_expand_move (machine_mode mode, rtx operands[])
 
 /* OP is a memref of CONST_VECTOR, return scalar constant mem
if CONST_VECTOR is a vec_duplicate, else return NULL.  */
-static rtx
+rtx
 ix86_broadcast_from_constant (machine_mode mode, rtx op)
 {
   int nunits = GET_MODE_NUNITS (mode);
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index dbc861fb1ea..90712769200 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -107,6 +107,7 @@ extern void ix86_expand_clear (rtx);
 extern void ix86_expand_move (machine_mode, rtx[]);
 extern void ix86_expand_vector_move (machine_mode, rtx[]);
 extern void ix86_expand_vector_move_misalign (machine_mode, rtx[]);
+extern rtx ix86_broadcast_from_constant (machine_mode, rtx);
 extern rtx ix86_fixup_binary_operands (enum rtx_code, machine_mode,
   rtx[], bool = false);
 extern void ix86_fixup_binary_operands_no_copy (enum rtx_code, machine_mode,
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 271da127a89..a9d62c84c52 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22191,6 +22191,19 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
   return true;
 
 case MEM:
+  /* CONST_VECTOR_DUPLICATE_P in constant_pool is just broadcast.
+or variants in ix86_vector_duplicate_simode_const.  */
+
+  if (GET_MODE_SIZE (mode) >= 16
+ && VECTOR_MODE_P (mode)
+ && SYMBOL_REF_P (XEXP (x, 0))
+ && CONSTANT_POOL_ADDRESS_P (XEXP (x, 0))
+ && ix86_broadcast_from_constant (mode, x))
+   {
+ *total = COSTS_N_INSNS (2) + speed;
+ return true;
+   }
+
   /* An insn that accesses memory is slightly more expensive
  than one that does not.  */
   if (speed)
diff --git a/gcc/testsuite/gcc.target/i386/pr114428.c 
b/gcc/testsuite/gcc.target/i386/pr114428.c
new file mode 100644
index 000..bbbc5a080f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114428.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -mno-avx512f -O2" } */
+/* { dg-final { scan-assembler-not "vpsra[dw]" } } */
+
+void
+foo2 (char* __restrict a, short* b)
+{
+  for (int i = 0; i != 32; i++)
+a[i] = b[i] >> (short)8;
+}
+
+void
+foo3 (char* __restrict a, short* b)
+{
+  for (int i = 0; i != 16; i++)
+a[i] = b[i] >> (short)8;
+}
+


[gcc r15-1047] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode.

2024-06-05 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:7876cde25cbd2f026a0ae488e5263e72f8e9bfa0

commit r15-1047-g7876cde25cbd2f026a0ae488e5263e72f8e9bfa0
Author: liuhongt 
Date:   Fri Apr 19 10:29:34 2024 +0800

Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode.

When mask is (1 << (prec - imm) - 1) which is used to clear upper bits
of A, then it can be simplified to LSHIFTRT.

i.e Simplify
(and:v8hi
  (ashifrt:v8hi A 8)
  (const_vector 0xff x8))
to
(lshifrt:v8hi A 8)

gcc/ChangeLog:

PR target/114428
* simplify-rtx.cc
(simplify_context::simplify_binary_operation_1):
Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for
specific mask.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114428-1.c: New test.

Diff:
---
 gcc/simplify-rtx.cc| 25 +++
 gcc/testsuite/gcc.target/i386/pr114428-1.c | 39 ++
 2 files changed, 64 insertions(+)

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index f6b4d73b593..9bc3ef9ad9f 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -4065,6 +4065,31 @@ simplify_context::simplify_binary_operation_1 (rtx_code 
code,
return tem;
}
 
+  /* (and:v4si
+  (ashiftrt:v4si A 16)
+  (const_vector: 0x x4))
+is just (lshiftrt:v4si A 16).  */
+  if (VECTOR_MODE_P (mode) && GET_CODE (op0) == ASHIFTRT
+ && (CONST_INT_P (XEXP (op0, 1))
+ || (GET_CODE (XEXP (op0, 1)) == CONST_VECTOR
+ && CONST_VECTOR_DUPLICATE_P (XEXP (op0, 1
+ && GET_CODE (op1) == CONST_VECTOR
+ && CONST_VECTOR_DUPLICATE_P (op1))
+   {
+ unsigned HOST_WIDE_INT shift_count
+   = (CONST_INT_P (XEXP (op0, 1))
+  ? UINTVAL (XEXP (op0, 1))
+  : UINTVAL (XVECEXP (XEXP (op0, 1), 0, 0)));
+ unsigned HOST_WIDE_INT inner_prec
+   = GET_MODE_PRECISION (GET_MODE_INNER (mode));
+
+ /* Avoid UD shift count.  */
+ if (shift_count < inner_prec
+ && (UINTVAL (XVECEXP (op1, 0, 0))
+ == (HOST_WIDE_INT_1U << (inner_prec - shift_count)) - 1))
+   return simplify_gen_binary (LSHIFTRT, mode, XEXP (op0, 0), XEXP 
(op0, 1));
+   }
+
   tem = simplify_byte_swapping_operation (code, mode, op0, op1);
   if (tem)
return tem;
diff --git a/gcc/testsuite/gcc.target/i386/pr114428-1.c 
b/gcc/testsuite/gcc.target/i386/pr114428-1.c
new file mode 100644
index 000..927476f2269
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114428-1.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-final { scan-assembler-times "psrlw" 1 } } */
+/* { dg-final { scan-assembler-times "psrld" 1 } } */
+/* { dg-final { scan-assembler-times "psrlq" 1 { target { ! ia32 } } } } */
+
+
+#define SHIFTC 12
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef long long v2di __attribute__((vector_size(16)));
+
+v8hi
+foo1 (v8hi a)
+{
+  return
+(a >> (16 - SHIFTC)) & (__extension__(v8hi){(1<> (32 - SHIFTC)) & (__extension__(v4si){(1<> (long long)(64 - SHIFTC)) & (__extension__(v2di){(1ULL<

[gcc r15-1022] Don't simplify NAN/INF or out-of-range constant for FIX/UNSIGNED_FIX.

2024-06-04 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:b05288d1f1e4b632eddf8830b4369d4659f6c2ff

commit r15-1022-gb05288d1f1e4b632eddf8830b4369d4659f6c2ff
Author: liuhongt 
Date:   Tue May 21 16:57:17 2024 +0800

Don't simplify NAN/INF or out-of-range constant for FIX/UNSIGNED_FIX.

According to IEEE standard, for conversions from floating point to
integer. When a NaN or infinite operand cannot be represented in the
destination format and this cannot otherwise be indicated, the invalid
operation exception shall be signaled. When a numeric operand would
convert to an integer outside the range of the destination format, the
invalid operation exception shall be signaled if this situation cannot
otherwise be indicated.

The patch prevent simplication of the conversion from floating point
to integer for NAN/INF/out-of-range constant when flag_trapping_math.

gcc/ChangeLog:

PR rtl-optimization/100927
PR rtl-optimization/115161
PR rtl-optimization/115115
* simplify-rtx.cc (simplify_const_unary_operation): Prevent
simplication of FIX/UNSIGNED_FIX for NAN/INF/out-of-range
constant when flag_trapping_math.
* fold-const.cc (fold_convert_const_int_from_real): Don't fold
for overflow value when_trapping_math.

gcc/testsuite/ChangeLog:

* gcc.dg/pr100927.c: New test.
* c-c++-common/Wconversion-1.c: Add -fno-trapping-math.
* c-c++-common/dfp/convert-int-saturate.c: Ditto.
* g++.dg/ubsan/pr63956.C: Ditto.
* g++.dg/warn/Wconversion-real-integer.C: Ditto.
* gcc.c-torture/execute/20031003-1.c: Ditto.
* gcc.dg/Wconversion-complex-c99.c: Ditto.
* gcc.dg/Wconversion-real-integer.c: Ditto.
* gcc.dg/c90-const-expr-11.c: Ditto.
* gcc.dg/overflow-warn-8.c: Ditto.

Diff:
---
 gcc/fold-const.cc  | 13 -
 gcc/simplify-rtx.cc| 23 +---
 gcc/testsuite/c-c++-common/Wconversion-1.c |  2 +-
 .../c-c++-common/dfp/convert-int-saturate.c|  1 +
 gcc/testsuite/g++.dg/ubsan/pr63956.C   |  7 -
 .../g++.dg/warn/Wconversion-real-integer.C |  2 +-
 gcc/testsuite/gcc.c-torture/execute/20031003-1.c   |  2 ++
 gcc/testsuite/gcc.dg/Wconversion-complex-c99.c |  2 +-
 gcc/testsuite/gcc.dg/Wconversion-real-integer.c|  2 +-
 gcc/testsuite/gcc.dg/c90-const-expr-11.c   |  2 +-
 gcc/testsuite/gcc.dg/overflow-warn-8.c |  1 +
 gcc/testsuite/gcc.dg/pr100927.c| 31 ++
 12 files changed, 77 insertions(+), 11 deletions(-)

diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index 92b048c307e..710d697c021 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -2246,7 +2246,18 @@ fold_convert_const_int_from_real (enum tree_code code, 
tree type, const_tree arg
   if (! overflow)
 val = real_to_integer (, , TYPE_PRECISION (type));
 
-  t = force_fit_type (type, val, -1, overflow | TREE_OVERFLOW (arg1));
+  /* According to IEEE standard, for conversions from floating point to
+ integer. When a NaN or infinite operand cannot be represented in the
+ destination format and this cannot otherwise be indicated, the invalid
+ operation exception shall be signaled. When a numeric operand would
+ convert to an integer outside the range of the destination format, the
+ invalid operation exception shall be signaled if this situation cannot
+ otherwise be indicated.  */
+  if (!flag_trapping_math || !overflow)
+t = force_fit_type (type, val, -1, overflow | TREE_OVERFLOW (arg1));
+  else
+t = NULL_TREE;
+
   return t;
 }
 
diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 5caf1dfd957..f6b4d73b593 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -2256,14 +2256,25 @@ simplify_const_unary_operation (enum rtx_code code, 
machine_mode mode,
   switch (code)
{
case FIX:
+ /* According to IEEE standard, for conversions from floating point to
+integer. When a NaN or infinite operand cannot be represented in
+the destination format and this cannot otherwise be indicated, the
+invalid operation exception shall be signaled. When a numeric
+operand would convert to an integer outside the range of the
+destination format, the invalid operation exception shall be
+signaled if this situation cannot otherwise be indicated.  */
  if (REAL_VALUE_ISNAN (*x))
-   return const0_rtx;
+   return flag_trapping_math ? NULL_RTX : const0_rtx;
+
+ if (REAL_VALUE_ISINF (*x) && flag_trapping_math)
+   return NULL_RTX;
 
  /* Test against the signed upper bound.  */
  wmax = wi::max_value (width, SIGNED);
  real_from_integer (, VOIDmode, wmax, SIGNED);
  

[gcc r15-1003] Adjust testcase for -march=cascadelake

2024-06-03 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:4d207044195b97ecb27c72a7dc987eb8b86644a0

commit r15-1003-g4d207044195b97ecb27c72a7dc987eb8b86644a0
Author: liuhongt 
Date:   Tue Jun 4 10:13:09 2024 +0800

Adjust testcase for -march=cascadelake

gcc/testsuite/ChangeLog:

PR target/115299
* gcc.target/i386/pr86722.c: Also scan for blendvpd.

Diff:
---
 gcc/testsuite/gcc.target/i386/pr86722.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr86722.c 
b/gcc/testsuite/gcc.target/i386/pr86722.c
index e266a1e56c2..95ddbd8ddb9 100644
--- a/gcc/testsuite/gcc.target/i386/pr86722.c
+++ b/gcc/testsuite/gcc.target/i386/pr86722.c
@@ -6,5 +6,5 @@ void f(double*d,double*e){
 *d=(*d<.5)?.7:0;
 }
 
-/* { dg-final { scan-assembler-times {(?n)(?:andnpd|andpd)} 1 } } */
+/* { dg-final { scan-assembler-times {(?n)(?:andnpd|andpd|blendvpd)} 1 } } */
 /* { dg-final { scan-assembler-not "orpd" } } */


[gcc r15-984] Add some preference for floating point rtl ifcvt when sse4.1 is not available

2024-06-03 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:ac306de7d5100d3682eae2270995a9abbe19db38

commit r15-984-gac306de7d5100d3682eae2270995a9abbe19db38
Author: liuhongt 
Date:   Fri May 31 14:38:07 2024 +0800

Add some preference for floating point rtl ifcvt when sse4.1 is not 
available

W/o TARGET_SSE4_1, it takes 3 instructions (pand, pandn and por) for
movdfcc/movsfcc, and could possibly fail cost comparison. Increase
branch cost could hurt performance for other modes, so specially add
some preference for floating point ifcvt.

gcc/ChangeLog:

PR target/115299
* config/i386/i386.cc (ix86_noce_conversion_profitable_p): Add
some preference for floating point ifcvt when SSE4.1 is not
available.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115299.c: New test.
* gcc.target/i386/pr86722.c: Adjust testcase.

Diff:
---
 gcc/config/i386/i386.cc  | 17 +
 gcc/testsuite/gcc.target/i386/pr115299.c | 10 ++
 gcc/testsuite/gcc.target/i386/pr86722.c  |  2 +-
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 1a0206ab573..271da127a89 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24879,6 +24879,23 @@ ix86_noce_conversion_profitable_p (rtx_insn *seq, 
struct noce_if_info *if_info)
return false;
}
 }
+
+  /* W/o TARGET_SSE4_1, it takes 3 instructions (pand, pandn and por)
+ for movdfcc/movsfcc, and could possibly fail cost comparison.
+ Increase branch cost will hurt performance for other modes, so
+ specially add some preference for floating point ifcvt.  */
+  if (!TARGET_SSE4_1 && if_info->x
+  && GET_MODE_CLASS (GET_MODE (if_info->x)) == MODE_FLOAT
+  && if_info->speed_p)
+{
+  unsigned cost = seq_cost (seq, true);
+
+  if (cost <= if_info->original_cost)
+   return true;
+
+  return cost <= (if_info->max_seq_cost + COSTS_N_INSNS (2));
+}
+
   return default_noce_conversion_profitable_p (seq, if_info);
 }
 
diff --git a/gcc/testsuite/gcc.target/i386/pr115299.c 
b/gcc/testsuite/gcc.target/i386/pr115299.c
new file mode 100644
index 000..53c5899136a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115299.c
@@ -0,0 +1,10 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-sse4.1 -msse2" } */
+
+void f(double*d,double*e){
+  for(;d

[gcc r15-932] Rename double_u with __double_u to avoid pulluting the namespace.

2024-05-30 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:3a873c0a7bc8183de95a6103b507101a25eed413

commit r15-932-g3a873c0a7bc8183de95a6103b507101a25eed413
Author: liuhongt 
Date:   Thu May 30 14:15:48 2024 +0800

Rename double_u with __double_u to avoid pulluting the namespace.

gcc/ChangeLog:

* config/i386/emmintrin.h (__double_u): Rename from double_u.
(_mm_load_sd): Replace double_u with __double_u.
(_mm_store_sd): Ditto.
(_mm_loadh_pd): Ditto.
(_mm_loadl_pd): Ditto.
* config/i386/xmmintrin.h (__float_u): Rename from float_u.
(_mm_load_ss): Ditto.
(_mm_store_ss): Ditto.

Diff:
---
 gcc/config/i386/emmintrin.h | 10 +-
 gcc/config/i386/xmmintrin.h |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index fa301103daf..356ca218fcb 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -56,7 +56,7 @@ typedef double __m128d __attribute__ ((__vector_size__ (16), 
__may_alias__));
 /* Unaligned version of the same types.  */
 typedef long long __m128i_u __attribute__ ((__vector_size__ (16), 
__may_alias__, __aligned__ (1)));
 typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, 
__aligned__ (1)));
-typedef double double_u __attribute__ ((__may_alias__, __aligned__ (1)));
+typedef double __double_u __attribute__ ((__may_alias__, __aligned__ (1)));
 
 /* Create a selector for use with the SHUFPD instruction.  */
 #define _MM_SHUFFLE2(fp1,fp0) \
@@ -146,7 +146,7 @@ _mm_load1_pd (double const *__P)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_load_sd (double const *__P)
 {
-  return __extension__ (__m128d) { *(double_u *)__P, 0.0 };
+  return __extension__ (__m128d) { *(__double_u *)__P, 0.0 };
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
@@ -181,7 +181,7 @@ _mm_storeu_pd (double *__P, __m128d __A)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_store_sd (double *__P, __m128d __A)
 {
-  *(double_u *)__P = ((__v2df)__A)[0] ;
+  *(__double_u *)__P = ((__v2df)__A)[0] ;
 }
 
 extern __inline double __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
@@ -974,13 +974,13 @@ _mm_unpacklo_pd (__m128d __A, __m128d __B)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_loadh_pd (__m128d __A, double const *__B)
 {
-  return __extension__ (__m128d) { ((__v2df)__A)[0], *(double_u*)__B };
+  return __extension__ (__m128d) { ((__v2df)__A)[0], *(__double_u*)__B };
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_loadl_pd (__m128d __A, double const *__B)
 {
-  return __extension__ (__m128d) { *(double_u*)__B, ((__v2df)__A)[1] };
+  return __extension__ (__m128d) { *(__double_u*)__B, ((__v2df)__A)[1] };
 }
 
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index 87515ecb218..c90fc71331a 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -72,7 +72,7 @@ typedef float __m128 __attribute__ ((__vector_size__ (16), 
__may_alias__));
 
 /* Unaligned version of the same type.  */
 typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, 
__aligned__ (1)));
-typedef float float_u __attribute__ ((__may_alias__, __aligned__ (1)));
+typedef float __float_u __attribute__ ((__may_alias__, __aligned__ (1)));
 
 /* Internal data types for implementing the intrinsics.  */
 typedef float __v4sf __attribute__ ((__vector_size__ (16)));
@@ -910,7 +910,7 @@ _mm_set_ps1 (float __F)
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_load_ss (float const *__P)
 {
-  return __extension__ (__m128) (__v4sf){ *(float_u *)__P, 0.0f, 0.0f, 0.0f };
+  return __extension__ (__m128) (__v4sf){ *(__float_u *)__P, 0.0f, 0.0f, 0.0f 
};
 }
 
 /* Create a vector with all four elements equal to *P.  */
@@ -966,7 +966,7 @@ _mm_setr_ps (float __Z, float __Y, float __X, float __W)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_store_ss (float *__P, __m128 __A)
 {
-  *(float_u *)__P = ((__v4sf)__A)[0];
+  *(__float_u *)__P = ((__v4sf)__A)[0];
 }
 
 extern __inline float __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))


[gcc r15-920] Support vcond_mask_qiqi and friends.

2024-05-30 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:b6c6d5abf0d31c936f50f8f9073c5e335b9e24b7

commit r15-920-gb6c6d5abf0d31c936f50f8f9073c5e335b9e24b7
Author: liuhongt 
Date:   Wed Feb 28 11:17:10 2024 +0800

Support vcond_mask_qiqi and friends.

gcc/ChangeLog:

* config/i386/sse.md (vcond_mask_): New expander.

gcc/testsuite/ChangeLog:
* gcc.target/i386/pr114125.c: New test.

Diff:
---
 gcc/config/i386/sse.md   | 20 
 gcc/testsuite/gcc.target/i386/pr114125.c | 10 ++
 2 files changed, 30 insertions(+)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 0f4fbcb2c5d..7cd912eeeb1 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -4807,6 +4807,26 @@
   DONE;
 })
 
+(define_expand "vcond_mask_"
+  [(match_operand:SWI1248_AVX512BW 0 "register_operand")
+   (match_operand:SWI1248_AVX512BW 1 "register_operand")
+   (match_operand:SWI1248_AVX512BW 2 "register_operand")
+   (match_operand:SWI1248_AVX512BW 3 "register_operand")]
+  "TARGET_AVX512F"
+{
+  /* (operand[1] & operand[3]) | (operand[2] & ~operand[3])  */
+  rtx op1 = gen_reg_rtx (mode);
+  rtx op2 = gen_reg_rtx (mode);
+  rtx op3 = gen_reg_rtx (mode);
+
+  emit_insn (gen_and3 (op1, operands[1], operands[3]));
+  emit_insn (gen_one_cmpl2 (op3, operands[3]));
+  emit_insn (gen_and3 (op2, operands[2], op3));
+  emit_insn (gen_ior3 (operands[0], op1, op2));
+
+  DONE;
+})
+
 ;
 ;;
 ;; Parallel floating point logical operations
diff --git a/gcc/testsuite/gcc.target/i386/pr114125.c 
b/gcc/testsuite/gcc.target/i386/pr114125.c
new file mode 100644
index 000..e63fbffe965
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114125.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -fdump-tree-forwprop3-raw " } */
+
+typedef long vec __attribute__((vector_size(16)));
+vec f(vec x){
+  vec y = x < 10;
+  return y & (y == 0);
+}
+
+/* { dg-final { scan-tree-dump-not "_expr" "forwprop3" } } */


[gcc r15-919] Don't reduce estimated unrolled size for innermost loop.

2024-05-29 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:ef27b91b62c3aa8841c02665dffa8914c742fd37

commit r15-919-gef27b91b62c3aa8841c02665dffa8914c742fd37
Author: liuhongt 
Date:   Tue Feb 27 15:34:57 2024 +0800

Don't reduce estimated unrolled size for innermost loop.

For the innermost loop, after completely loop unroll, it will most likely
not be able to reduce the body size to 2/3. The current 2/3 reduction
will make some of the larger loops completely unrolled during
cunrolli, which will then result in them not being able to be
vectorized. It also increases the register pressure.

The patch move the 2/3 reduction from estimated_unrolled_size to
tree_unroll_loops_completely.

gcc/ChangeLog:

PR tree-optimization/112325
* tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Move the
2 / 3 loop body size reduction to ..
(try_unroll_loop_completely): .. here, add it for the check of
body size shrink, and the check of comparison against
param_max_completely_peeled_insns when
(!cunrolli ||loop->inner).
(canonicalize_loop_induction_variables): Add new parameter
cunrolli and pass down.
(tree_unroll_loops_completely_1): Ditto.
(canonicalize_induction_variables): Pass cunrolli as false to
canonicalize_loop_induction_variables.
(tree_unroll_loops_completely): Set cunrolli to true at
beginning and set it to false after CHANGED is true.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/pr112325.c: New test.

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr112325.c | 59 
 gcc/tree-ssa-loop-ivcanon.cc | 49 --
 2 files changed, 86 insertions(+), 22 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr112325.c 
b/gcc/testsuite/gcc.dg/vect/pr112325.c
new file mode 100644
index 000..71cf4099253
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr112325.c
@@ -0,0 +1,59 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fdump-tree-vect-details" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */
+
+typedef unsigned short ggml_fp16_t;
+static float table_f32_f16[1 << 16];
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+unsigned short s;
+__builtin_memcpy(, , sizeof(unsigned short));
+return table_f32_f16[s];
+}
+
+typedef struct {
+ggml_fp16_t d;
+ggml_fp16_t m;
+unsigned char qh[4];
+unsigned char qs[32 / 2];
+} block_q5_1;
+
+typedef struct {
+float d;
+float s;
+char qs[32];
+} block_q8_1;
+
+void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * 
restrict vx, const void * restrict vy) {
+const int qk = 32;
+const int nb = n / qk;
+
+const block_q5_1 * restrict x = vx;
+const block_q8_1 * restrict y = vy;
+
+float sumf = 0.0;
+
+for (int i = 0; i < nb; i++) {
+unsigned qh;
+__builtin_memcpy(, x[i].qh, sizeof(qh));
+
+int sumi = 0;
+
+for (int j = 0; j < qk/2; ++j) {
+const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
+
+const int x0 = (x[i].qs[j] & 0xF) | xh_0;
+const int x1 = (x[i].qs[j] >> 4) | xh_1;
+
+sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
+}
+
+sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + 
ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
+}
+
+*s = sumf;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index bf017137260..5ef24a91917 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -437,11 +437,7 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge 
edge_to_cancel,
It is (NUNROLL + 1) * size of loop body with taking into account
the fact that in last copy everything after exit conditional
is dead and that some instructions will be eliminated after
-   peeling.
-
-   Loop body is likely going to simplify further, this is difficult
-   to guess, we just decrease the result by 1/3.  */
-
+   peeling.  */
 static unsigned HOST_WIDE_INT
 estimated_unrolled_size (struct loop_size *size,
 unsigned HOST_WIDE_INT nunroll)
@@ -453,10 +449,6 @@ estimated_unrolled_size (struct loop_size *size,
 unr_insns = 0;
   unr_insns += size->last_iteration - 
size->last_iteration_eliminated_by_peeling;
 
-  unr_insns = unr_insns * 2 / 3;
-  if (unr_insns <= 0)
-unr_insns = 1;
-
   return unr_insns;
 }
 
@@ -734,7 +726,8 @@ try_unroll_loop_completely (class loop *loop,
edge exit, tree niter, bool may_be_zero,
enum unroll_level ul,
 

[gcc r15-882] Reduce cost of MEM (A + imm).

2024-05-28 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:1d6199e5f8c1c08083eeb0279f71333234fe14ad

commit r15-882-g1d6199e5f8c1c08083eeb0279f71333234fe14ad
Author: liuhongt 
Date:   Mon Feb 19 13:57:24 2024 +0800

Reduce cost of MEM (A + imm).

For MEM, rtx_cost iterates each subrtx, and adds up the costs,
so for MEM (reg) and MEM (reg + 4), the former costs 5,
the latter costs 9, it is not accurate for x86. Ideally
address_cost should be used, but it reduce cost too much.
So current solution is make constant disp as cheap as possible.

gcc/ChangeLog:

PR target/67325
* config/i386/i386.cc (ix86_rtx_costs): Reduce cost of MEM (A
+ imm) to "cost of MEM (A)" + 1.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr67325.c: New test.

Diff:
---
 gcc/config/i386/i386.cc | 18 +-
 gcc/testsuite/gcc.target/i386/pr67325.c |  7 +++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 3e2a3a194f1..85d87b9f778 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22194,7 +22194,23 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
   /* An insn that accesses memory is slightly more expensive
  than one that does not.  */
   if (speed)
-*total += 1;
+   {
+ *total += 1;
+ rtx addr = XEXP (x, 0);
+ /* For MEM, rtx_cost iterates each subrtx, and adds up the costs,
+so for MEM (reg) and MEM (reg + 4), the former costs 5,
+the latter costs 9, it is not accurate for x86. Ideally
+address_cost should be used, but it reduce cost too much.
+So current solution is make constant disp as cheap as possible.  */
+ if (GET_CODE (addr) == PLUS
+ && x86_64_immediate_operand (XEXP (addr, 1), Pmode))
+   {
+ *total += 1;
+ *total += rtx_cost (XEXP (addr, 0), Pmode, PLUS, 0, speed);
+ return true;
+   }
+   }
+
   return false;
 
 case ZERO_EXTRACT:
diff --git a/gcc/testsuite/gcc.target/i386/pr67325.c 
b/gcc/testsuite/gcc.target/i386/pr67325.c
new file mode 100644
index 000..c3c1e4c5b4d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr67325.c
@@ -0,0 +1,7 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-not "(?:sar|shr)" } } */
+
+int f(long*l){
+  return *l>>32;
+}


[gcc r15-857] Fix predicate mismatch between vfcmaddcph's define_insn and define_expand.

2024-05-27 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:c65002347e595cda8b15e59e734d209283faf2b6

commit r15-857-gc65002347e595cda8b15e59e734d209283faf2b6
Author: liuhongt 
Date:   Tue May 28 10:32:12 2024 +0800

Fix predicate mismatch between vfcmaddcph's define_insn and define_expand.

When I applied Roger's patch [1], there's ICE due to it.
The patch fix the latent bug.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2024-May/651365.html

gcc/ChangeLog:

* config/i386/sse.md
(___mask): Align
operands' predicate with corresponding expander.
(__):
Ditto.

Diff:
---
 gcc/config/i386/sse.md | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index b59c988fc31..0f4fbcb2c5d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -6867,9 +6867,9 @@
   [(set (match_operand:VHF_AVX512VL 0 "register_operand" "=")
(vec_merge:VHF_AVX512VL
  (unspec:VHF_AVX512VL
-   [(match_operand:VHF_AVX512VL 1 "nonimmediate_operand" "v")
-(match_operand:VHF_AVX512VL 2 "nonimmediate_operand" 
"")
-(match_operand:VHF_AVX512VL 3 "register_operand" "0")]
+   [(match_operand:VHF_AVX512VL 1 "" 
"v")
+(match_operand:VHF_AVX512VL 2 "" 
"")
+(match_operand:VHF_AVX512VL 3 "" "0")]
 UNSPEC_COMPLEX_F_C_MA)
  (match_dup 1)
  (unspec:
@@ -6892,8 +6892,8 @@
 (define_insn "__"
   [(set (match_operand:VHF_AVX512VL 0 "register_operand" "=")
  (unspec:VHF_AVX512VL
-   [(match_operand:VHF_AVX512VL 1 "nonimmediate_operand" "v")
-(match_operand:VHF_AVX512VL 2 "nonimmediate_operand" 
"")]
+   [(match_operand:VHF_AVX512VL 1 "" 
"v")
+(match_operand:VHF_AVX512VL 2 "" 
"")]
 UNSPEC_COMPLEX_F_C_MUL))]
   "TARGET_AVX512FP16 && "
 {


[gcc r15-814] Fix typo in the testcase.

2024-05-24 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:51f4b47c4f4f61fe31a7bd1fa80e08c2438d76a8

commit r15-814-g51f4b47c4f4f61fe31a7bd1fa80e08c2438d76a8
Author: liuhongt 
Date:   Fri May 24 09:49:08 2024 +0800

Fix typo in the testcase.

gcc/testsuite/ChangeLog:

PR target/114148
* gcc.target/i386/pr106010-7b.c: Refine testcase.

Diff:
---
 gcc/testsuite/gcc.target/i386/pr106010-7b.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7b.c 
b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
index 26482cc10f5..917e56e45f7 100644
--- a/gcc/testsuite/gcc.target/i386/pr106010-7b.c
+++ b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
@@ -34,11 +34,11 @@ avx_test (void)
 p_init[i] = i % 2 + 3;
 
   memcpy (pd_src, p_init, 2 * N * sizeof (double));
-  memcpy (ps_dst, p_init, 2 * N * sizeof (float));
-  memcpy (epi64_dst, p_init, 2 * N * sizeof (long long));
-  memcpy (epi32_dst, p_init, 2 * N * sizeof (int));
-  memcpy (epi16_dst, p_init, 2 * N * sizeof (short));
-  memcpy (epi8_dst, p_init, 2 * N * sizeof (char));
+  memcpy (ps_src, p_init, 2 * N * sizeof (float));
+  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
+  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
+  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
+  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
 
   foo_pd (pd_dst, pd_src[0]);
   foo_ps (ps_dst, ps_src[0]);


[gcc r15-717] Use pblendw instead of pand to clear upper 16 bits.

2024-05-20 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:0ebaffccb294d90184ad78367de66b6307de3ac0

commit r15-717-g0ebaffccb294d90184ad78367de66b6307de3ac0
Author: liuhongt 
Date:   Fri Mar 22 14:40:00 2024 +0800

Use pblendw instead of pand to clear upper 16 bits.

For vec_pack_truncv8si/v4si w/o AVX512,
(const_vector:v4si (const_int 0x) x4) is used as mask to clear
upper 16 bits, but vpblendw with zero_vector can also be used, and
zero vector is cheaper than (const_vector:v4si (const_int 0x) x4).

gcc/ChangeLog:
PR target/114427
* config/i386/i386-expand.cc (expand_vec_perm_even_odd_pack):
Use pblendw instead of pand to clear upper bits.

gcc/testsuite/ChangeLog:
* gcc.target/i386/pr114427.c: New test.

Diff:
---
 gcc/config/i386/i386-expand.cc   | 34 
 gcc/testsuite/gcc.target/i386/pr114427.c | 18 +
 2 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 100fb2afb3a..7142c0a9d77 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22587,6 +22587,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d 
*d)
 {
   rtx op, dop0, dop1, t;
   unsigned i, odd, c, s, nelt = d->nelt;
+  int pblendw_i = 0;
   bool end_perm = false;
   machine_mode half_mode;
   rtx (*gen_and) (rtx, rtx, rtx);
@@ -22608,6 +22609,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d 
*d)
   gen_and = gen_andv2si3;
   gen_pack = gen_mmx_packusdw;
   gen_shift = gen_lshrv2si3;
+  pblendw_i = 0x5;
   break;
 case E_V8HImode:
   /* Required for "pack".  */
@@ -22619,6 +22621,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d 
*d)
   gen_and = gen_andv4si3;
   gen_pack = gen_sse4_1_packusdw;
   gen_shift = gen_lshrv4si3;
+  pblendw_i = 0x55;
   break;
 case E_V8QImode:
   /* No check as all instructions are SSE2.  */
@@ -22647,6 +22650,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d 
*d)
   gen_and = gen_andv8si3;
   gen_pack = gen_avx2_packusdw;
   gen_shift = gen_lshrv8si3;
+  pblendw_i = 0x;
   end_perm = true;
   break;
 case E_V32QImode:
@@ -22682,10 +22686,32 @@ expand_vec_perm_even_odd_pack (struct 
expand_vec_perm_d *d)
   dop1 = gen_reg_rtx (half_mode);
   if (odd == 0)
 {
-  t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
-  t = force_reg (half_mode, t);
-  emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
-  emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
+  /* Use pblendw since const_vector 0 should be cheaper than
+const_vector 0x.  */
+  if (d->vmode == V4HImode
+ || d->vmode == E_V8HImode
+ || d->vmode == E_V16HImode)
+   {
+ rtx dop0_t = gen_reg_rtx (d->vmode);
+ rtx dop1_t = gen_reg_rtx (d->vmode);
+ t = gen_reg_rtx (d->vmode);
+ emit_move_insn (t, CONST0_RTX (d->vmode));
+
+ emit_move_insn (dop0_t, gen_rtx_VEC_MERGE (d->vmode, d->op0, t,
+GEN_INT (pblendw_i)));
+ emit_move_insn (dop1_t, gen_rtx_VEC_MERGE (d->vmode, d->op1, t,
+GEN_INT (pblendw_i)));
+
+ emit_move_insn (dop0, gen_lowpart (half_mode, dop0_t));
+ emit_move_insn (dop1, gen_lowpart (half_mode, dop1_t));
+   }
+  else
+   {
+ t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
+ t = force_reg (half_mode, t);
+ emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
+ emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
+   }
 }
   else
 {
diff --git a/gcc/testsuite/gcc.target/i386/pr114427.c 
b/gcc/testsuite/gcc.target/i386/pr114427.c
new file mode 100644
index 000..58b66db7fff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114427.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -O2 -mno-avx512f" } */
+/* { dg-final { scan-assembler-not "vpand" } } */
+/* { dg-final { scan-assembler-not "65535" } } */
+
+void
+foo (int* a, short* __restrict b, int* c)
+{
+for (int i = 0; i != 16; i++)
+  b[i] = c[i] + a[i];
+}
+
+void
+foo1 (int* a, short* __restrict b, int* c)
+{
+for (int i = 0; i != 8; i++)
+  b[i] = c[i] + a[i];
+}


[gcc r15-530] Set d.one_operand_p to true when TARGET_SSSE3 in ix86_expand_vecop_qihi_partial.

2024-05-15 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:090714e6cf8029f4ff8883dce687200024adbaeb

commit r15-530-g090714e6cf8029f4ff8883dce687200024adbaeb
Author: liuhongt 
Date:   Wed May 15 10:56:24 2024 +0800

Set d.one_operand_p to true when TARGET_SSSE3 in 
ix86_expand_vecop_qihi_partial.

pshufb is available under TARGET_SSSE3, so
ix86_expand_vec_perm_const_1 must return true when TARGET_SSSE3.

With the patch under -march=x86-64-v2

v8qi
foo (v8qi a)
{
  return a >> 5;
}

<   pmovsxbw%xmm0, %xmm0
<   psraw   $5, %xmm0
<   pshufb  .LC0(%rip), %xmm0

vs.

>   movdqa  %xmm0, %xmm1
>   pcmpeqd %xmm0, %xmm0
>   pmovsxbw%xmm1, %xmm1
>   psrlw   $8, %xmm0
>   psraw   $5, %xmm1
>   pand%xmm1, %xmm0
>   packuswb%xmm0, %xmm0

Although there's a memory load from constant pool, but it should be
better when it's inside a loop. The load from constant pool can be
hoist out. it's 1 instruction vs 4 instructions.

<   pshufb  .LC0(%rip), %xmm0

vs.

>   pcmpeqd %xmm0, %xmm0
>   psrlw   $8, %xmm0
>   pand%xmm1, %xmm0
>   packuswb%xmm0, %xmm0

gcc/ChangeLog:

PR target/114514
* config/i386/i386-expand.cc (ix86_expand_vecop_qihi_partial):
Set d.one_operand_p to true when TARGET_SSSE3.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114514-shufb.c: New test.

Diff:
---
 gcc/config/i386/i386-expand.cc |  2 +-
 gcc/testsuite/gcc.target/i386/pr114514-shufb.c | 35 ++
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 4c47cfe468ef..4e16aedc5c13 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -24458,7 +24458,7 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx 
dest, rtx op1, rtx op2)
   d.op0 = d.op1 = qres;
   d.vmode = V16QImode;
   d.nelt = 16;
-  d.one_operand_p = false;
+  d.one_operand_p = TARGET_SSSE3;
   d.testing_p = false;
 
   for (i = 0; i < d.nelt; ++i)
diff --git a/gcc/testsuite/gcc.target/i386/pr114514-shufb.c 
b/gcc/testsuite/gcc.target/i386/pr114514-shufb.c
new file mode 100644
index ..71fdc9d8daf1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114514-shufb.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-msse4.1 -O2 -mno-avx512f" } */
+/* { dg-final { scan-assembler-not "packuswb" } }  */
+/* { dg-final { scan-assembler-times "pshufb" 4 { target { ! ia32 } } } }  */
+/* { dg-final { scan-assembler-times "pshufb" 6 { target  ia32 } } }  */
+
+typedef unsigned char v8uqi __attribute__((vector_size(8)));
+typedef  char v8qi __attribute__((vector_size(8)));
+typedef unsigned char v4uqi __attribute__((vector_size(4)));
+typedef  char v4qi __attribute__((vector_size(4)));
+
+v8qi
+foo (v8qi a)
+{
+  return a >> 5;
+}
+
+v8uqi
+foo1 (v8uqi a)
+{
+  return a >> 5;
+}
+
+v4qi
+foo2 (v4qi a)
+{
+  return a >> 5;
+}
+
+v4uqi
+foo3 (v4uqi a)
+{
+  return a >> 5;
+}
+


[gcc r15-529] Optimize ashift >> 7 to vpcmpgtb for vector int8.

2024-05-15 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:0cc0956b3bb8bcbc9196075b9073a227d799e042

commit r15-529-g0cc0956b3bb8bcbc9196075b9073a227d799e042
Author: liuhongt 
Date:   Tue May 14 18:39:54 2024 +0800

Optimize ashift >> 7 to vpcmpgtb for vector int8.

Since there is no corresponding instruction, the shift operation for
vector int8 is implemented using the instructions for vector int16,
but for some special shift counts, it can be transformed into vpcmpgtb.

gcc/ChangeLog:

PR target/114514
* config/i386/i386-expand.cc
(ix86_expand_vec_shift_qihi_constant): Optimize ashift >> 7 to
vpcmpgtb.
(ix86_expand_vecop_qihi_partial): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114514-shift.c: New test.

Diff:
---
 gcc/config/i386/i386-expand.cc | 32 +
 gcc/testsuite/gcc.target/i386/pr114514-shift.c | 49 ++
 2 files changed, 81 insertions(+)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index e846a946de07..4c47cfe468ef 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -24246,6 +24246,28 @@ ix86_expand_vec_shift_qihi_constant (enum rtx_code 
code,
 return false;
 
   gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+
+
+  if (shift_amount == 7
+  && code == ASHIFTRT)
+{
+  if (qimode == V16QImode
+ || qimode == V32QImode)
+   {
+ rtx zero = gen_reg_rtx (qimode);
+ emit_move_insn (zero, CONST0_RTX (qimode));
+ emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1));
+   }
+  else
+   {
+ gcc_assert (qimode == V64QImode);
+ rtx kmask = gen_reg_rtx (DImode);
+ emit_insn (gen_avx512bw_cvtb2maskv64qi (kmask, op1));
+ emit_insn (gen_avx512bw_cvtmask2bv64qi (dest, kmask));
+   }
+  return true;
+}
+
   /* Record sign bit.  */
   xor_constant = 1 << (8 - shift_amount - 1);
 
@@ -24356,6 +24378,16 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, 
rtx dest, rtx op1, rtx op2)
   return;
 }
 
+  if (CONST_INT_P (op2)
+  && code == ASHIFTRT
+  && INTVAL (op2) == 7)
+{
+  rtx zero = gen_reg_rtx (qimode);
+  emit_move_insn (zero, CONST0_RTX (qimode));
+  emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1));
+  return;
+}
+
   switch (code)
 {
 case MULT:
diff --git a/gcc/testsuite/gcc.target/i386/pr114514-shift.c 
b/gcc/testsuite/gcc.target/i386/pr114514-shift.c
new file mode 100644
index ..cf8b32b3b1d2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114514-shift.c
@@ -0,0 +1,49 @@
+/* { dg-do compile  } */
+/* { dg-options "-mavx512vl -mavx512bw -O2" } */
+/* { dg-final { scan-assembler-times "vpxor" 4 } } */
+/* { dg-final { scan-assembler-times "vpcmpgtb" 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpcmpgtb" 5 { target  ia32 } } } */
+/* { dg-final { scan-assembler-times "vpmovb2m" 1 } } */
+/* { dg-final { scan-assembler-times "vpmovm2b" 1 } } */
+
+
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef char v64qi __attribute__((vector_size(64)));
+typedef char v8qi __attribute__((vector_size(8)));
+typedef char v4qi __attribute__((vector_size(4)));
+
+v4qi
+__attribute__((noipa))
+foo1 (v4qi a)
+{
+  return a >> 7;
+}
+
+v8qi
+__attribute__((noipa))
+foo2 (v8qi a)
+{
+  return a >> 7;
+}
+
+v16qi
+__attribute__((noipa))
+foo3 (v16qi a)
+{
+  return a >> 7;
+}
+
+v32qi
+__attribute__((noipa))
+foo4 (v32qi a)
+{
+  return a >> 7;
+}
+
+v64qi
+__attribute__((noipa))
+foo5 (v64qi a)
+{
+  return a >> 7;
+}


[gcc r15-499] x86: Add 3-instruction subroutine vector shift for V16QI in ix86_expand_vec_perm_const_1 [PR107563]

2024-05-14 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:a71f90c5a7ae2942083921033cb23dcd63e70525

commit r15-499-ga71f90c5a7ae2942083921033cb23dcd63e70525
Author: Levy Hsu 
Date:   Thu May 9 16:50:56 2024 +0800

x86: Add 3-instruction subroutine vector shift for V16QI in 
ix86_expand_vec_perm_const_1 [PR107563]

Hi All

We've introduced a new subroutine in ix86_expand_vec_perm_const_1
to optimize vector shifting for the V16QI type on x86.
This patch uses a three-instruction sequence psrlw, psllw, and por
to handle specific vector shuffle operations more efficiently.
The change aims to improve assembly code generation for configurations
supporting SSE2.

Bootstrapped and tested on x86_64-linux-gnu, OK for trunk?

Best
Levy

gcc/ChangeLog:

PR target/107563
* config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New
subroutine.
(ix86_expand_vec_perm_const_1): Call 
expand_vec_perm_psrlw_psllw_por.

gcc/testsuite/ChangeLog:

PR target/107563
* g++.target/i386/pr107563-a.C: New test.
* g++.target/i386/pr107563-b.C: New test.

Diff:
---
 gcc/config/i386/i386-expand.cc | 64 ++
 gcc/testsuite/g++.target/i386/pr107563-a.C | 13 ++
 gcc/testsuite/g++.target/i386/pr107563-b.C | 12 ++
 3 files changed, 89 insertions(+)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 1ab22fe79736..e846a946de07 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22362,6 +22362,67 @@ expand_vec_perm_2perm_pblendv (struct 
expand_vec_perm_d *d, bool two_insn)
   return true;
 }
 
+/* A subroutine of ix86_expand_vec_perm_const_1.
+   Implement a permutation with psrlw, psllw and por.
+   It handles case:
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6); */
+
+static bool
+expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
+{
+  unsigned i;
+  rtx (*gen_shr) (rtx, rtx, rtx);
+  rtx (*gen_shl) (rtx, rtx, rtx);
+  rtx (*gen_or) (rtx, rtx, rtx);
+  machine_mode mode = VOIDmode;
+
+  if (!TARGET_SSE2 || !d->one_operand_p)
+return false;
+
+  switch (d->vmode)
+{
+case E_V8QImode:
+  if (!TARGET_MMX_WITH_SSE)
+   return false;
+  mode = V4HImode;
+  gen_shr = gen_ashrv4hi3;
+  gen_shl = gen_ashlv4hi3;
+  gen_or = gen_iorv4hi3;
+  break;
+case E_V16QImode:
+  mode = V8HImode;
+  gen_shr = gen_vlshrv8hi3;
+  gen_shl = gen_vashlv8hi3;
+  gen_or = gen_iorv8hi3;
+  break;
+default: return false;
+}
+
+  if (!rtx_equal_p (d->op0, d->op1))
+return false;
+
+  for (i = 0; i < d->nelt; i += 2)
+if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
+  return false;
+
+  if (d->testing_p)
+return true;
+
+  rtx tmp1 = gen_reg_rtx (mode);
+  rtx tmp2 = gen_reg_rtx (mode);
+  rtx op0 = force_reg (d->vmode, d->op0);
+
+  emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
+  emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
+  emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
+  emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
+  emit_insn (gen_or (tmp1, tmp1, tmp2));
+  emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
permutation using two vperm2f128, followed by a vshufpd insn blending
the two vectors together.  */
@@ -23782,6 +23843,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d 
*d)
   if (expand_vec_perm_2perm_pblendv (d, false))
 return true;
 
+  if (expand_vec_perm_psrlw_psllw_por (d))
+return true;
+
   /* Try sequences of four instructions.  */
 
   if (expand_vec_perm_even_odd_trunc (d))
diff --git a/gcc/testsuite/g++.target/i386/pr107563-a.C 
b/gcc/testsuite/g++.target/i386/pr107563-a.C
new file mode 100755
index ..605c1bdf814b
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563-a.C
@@ -0,0 +1,13 @@
+/* PR target/107563.C */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-times "psllw" 1 } } */
+/* { dg-final { scan-assembler-times "psraw" 1 } } */
+/* { dg-final { scan-assembler-times "por" 1 } } */
+
+using temp_vec_type2 [[__gnu__::__vector_size__(8)]] = char;
+
+void foo2(temp_vec_type2& v) noexcept
+{
+  v = __builtin_shufflevector(v, v, 1, 0, 3, 2, 5, 4, 7, 6);
+}
diff --git a/gcc/testsuite/g++.target/i386/pr107563-b.C 
b/gcc/testsuite/g++.target/i386/pr107563-b.C
new file mode 100755
index ..0ce3e8263bb5
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563-b.C
@@ -0,0 +1,12 @@
+/* PR target/107563.C */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-times "psllw" 1 } } */
+/* { dg-final { scan-assembler-times "psrlw" 1 } } */
+/* 

[gcc r15-234] Optimize 64-bit vector permutation with punpcklqdq + 128-bit vector pshuf.

2024-05-07 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:a9f642783853b60bb0a59562b8ab3ed10ec01641

commit r15-234-ga9f642783853b60bb0a59562b8ab3ed10ec01641
Author: liuhongt 
Date:   Wed Dec 20 11:54:43 2023 +0800

Optimize 64-bit vector permutation with punpcklqdq + 128-bit vector pshuf.

gcc/ChangeLog:

PR target/113090
* config/i386/i386-expand.cc
(expand_vec_perm_punpckldq_pshuf): New function.
(ix86_expand_vec_perm_const_1): Try
expand_vec_perm_punpckldq_pshuf for sequence of 2
instructions.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr113090.c: New test.

Diff:
---
 gcc/config/i386/i386-expand.cc   | 71 
 gcc/testsuite/gcc.target/i386/pr113090.c | 25 +++
 2 files changed, 96 insertions(+)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index a6132911e6a..2f27bfb484c 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -21173,6 +21173,74 @@ expand_vec_perm_pshuflw_pshufhw (struct 
expand_vec_perm_d *d)
   return true;
 }
 
+/* Try to permute 2 64-bit vectors by punpckldq + 128-bit vector shuffle.  */
+static bool
+expand_vec_perm_punpckldq_pshuf (struct expand_vec_perm_d *d)
+{
+  if (GET_MODE_BITSIZE (d->vmode) != 64
+  || !TARGET_MMX_WITH_SSE
+  || d->one_operand_p)
+return false;
+
+  machine_mode widen_vmode;
+  switch (d->vmode)
+{
+/* pshufd.  */
+case E_V2SImode:
+  widen_vmode = V4SImode;
+  break;
+
+/* pshufd.  */
+case E_V2SFmode:
+  widen_vmode = V4SFmode;
+  break;
+
+case E_V4HImode:
+  widen_vmode = V8HImode;
+  /* pshufb.  */
+  if (!TARGET_SSSE3)
+   return false;
+  break;
+
+case E_V8QImode:
+  /* pshufb.  */
+  widen_vmode = V16QImode;
+  if (!TARGET_SSSE3)
+   return false;
+  break;
+
+default:
+  return false;
+}
+
+  if (d->testing_p)
+return true;
+
+  struct expand_vec_perm_d dperm;
+  dperm.target = gen_reg_rtx (widen_vmode);
+  rtx op0 = gen_reg_rtx (widen_vmode);
+  emit_move_insn (op0, gen_rtx_VEC_CONCAT (widen_vmode, d->op0, d->op1));
+  dperm.op0 = op0;
+  dperm.op1 = op0;
+  dperm.vmode = widen_vmode;
+  unsigned nelt = GET_MODE_NUNITS (widen_vmode);
+  dperm.nelt = nelt;
+  dperm.one_operand_p = true;
+  dperm.testing_p = false;
+
+  for (unsigned i = 0; i != nelt / 2; i++)
+{
+  dperm.perm[i] = d->perm[i];
+  dperm.perm[i + nelt / 2] = d->perm[i];
+}
+
+  gcc_assert (expand_vec_perm_1 ());
+  emit_move_insn (d->target, lowpart_subreg (d->vmode,
+dperm.target,
+dperm.vmode));
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
the permutation using the SSSE3 palignr instruction.  This succeeds
when all of the elements in PERM fit within one vector and we merely
@@ -23685,6 +23753,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d 
*d)
   if (expand_vec_perm_shufps_shufps (d))
 return true;
 
+  if (expand_vec_perm_punpckldq_pshuf (d))
+return true;
+
   /* Try sequences of three instructions.  */
 
   if (expand_vec_perm_even_odd_pack (d))
diff --git a/gcc/testsuite/gcc.target/i386/pr113090.c 
b/gcc/testsuite/gcc.target/i386/pr113090.c
new file mode 100644
index 000..0f0b7cc0084
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr113090.c
@@ -0,0 +1,25 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -msse4.1" } */
+/* { dg-final { scan-assembler-times "pshufd" 3 } } */
+
+typedef int v2si __attribute__((vector_size(8)));
+typedef short v4hi __attribute__((vector_size(8)));
+typedef char v8qi __attribute__((vector_size(8)));
+
+v2si
+foo (v2si a, v2si b)
+{
+return __builtin_shufflevector (a, b, 1, 2);
+}
+
+v4hi
+foo1 (v4hi a, v4hi b)
+{
+  return __builtin_shufflevector (a, b, 2, 3, 4, 5);
+}
+
+v8qi
+foo2 (v8qi a, v8qi b)
+{
+  return __builtin_shufflevector (a, b, 4, 5, 6, 7, 8, 9, 10, 11);
+}


[gcc r15-236] Extend usdot_prodv*qi with vpmaddwd when AVXVNNI/AVX512VNNI is not available.

2024-05-07 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:8b974f54393ab2d2d16a0051a68c155455a92aad

commit r15-236-g8b974f54393ab2d2d16a0051a68c155455a92aad
Author: liuhongt 
Date:   Mon Jan 8 15:13:41 2024 +0800

Extend usdot_prodv*qi with vpmaddwd when AVXVNNI/AVX512VNNI is not 
available.

gcc/ChangeLog:

* config/i386/sse.md (usdot_prodv*qi): Extend to VI1_AVX512
with vpmaddwd when avxvnni/avx512vnni is not available.

Diff:
---
 gcc/config/i386/sse.md | 55 +-
 1 file changed, 41 insertions(+), 14 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 1bf50726e83..f57f36ae380 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -29955,21 +29955,48 @@
 
 (define_expand "usdot_prod"
   [(match_operand: 0 "register_operand")
-   (match_operand:VI1_AVX512VNNI 1 "register_operand")
-   (match_operand:VI1_AVX512VNNI 2 "register_operand")
+   (match_operand:VI1_AVX512 1 "register_operand")
+   (match_operand:VI1_AVX512 2 "register_operand")
(match_operand: 3 "register_operand")]
-  "(( == 64 && TARGET_EVEX512)
-|| ((TARGET_AVX512VNNI && TARGET_AVX512VL)
-   || TARGET_AVXVNNI))"
-{
-  operands[1] = lowpart_subreg (mode,
-   force_reg (mode, operands[1]),
-   mode);
-  operands[2] = lowpart_subreg (mode,
-   force_reg (mode, operands[2]),
-   mode);
-  emit_insn (gen_vpdpbusd_ (operands[0], operands[3],
- operands[1], operands[2]));
+  "TARGET_SSE2"
+{
+  if ( == 64
+ ? TARGET_AVX512VNNI
+ : ((TARGET_AVX512VNNI && TARGET_AVX512VL) || TARGET_AVXVNNI))
+{
+  operands[1] = lowpart_subreg (mode,
+   force_reg (mode, operands[1]),
+   mode);
+  operands[2] = lowpart_subreg (mode,
+   force_reg (mode, operands[2]),
+   mode);
+  emit_insn (gen_vpdpbusd_ (operands[0], operands[3],
+ operands[1], operands[2]));
+}
+  else
+{
+  /* Emulate with vpdpwssd.  */
+  rtx op1_lo = gen_reg_rtx (mode);
+  rtx op1_hi = gen_reg_rtx (mode);
+  rtx op2_lo = gen_reg_rtx (mode);
+  rtx op2_hi = gen_reg_rtx (mode);
+
+  emit_insn (gen_vec_unpacku_lo_ (op1_lo, operands[1]));
+  emit_insn (gen_vec_unpacks_lo_ (op2_lo, operands[2]));
+  emit_insn (gen_vec_unpacku_hi_ (op1_hi, operands[1]));
+  emit_insn (gen_vec_unpacks_hi_ (op2_hi, operands[2]));
+
+  rtx res1 = gen_reg_rtx (mode);
+  rtx res2 = gen_reg_rtx (mode);
+  rtx sum = gen_reg_rtx (mode);
+
+  emit_move_insn (sum, CONST0_RTX (mode));
+  emit_insn (gen_sdot_prod (res1, op1_lo,
+   op2_lo, sum));
+  emit_insn (gen_sdot_prod (res2, op1_hi,
+   op2_hi, operands[3]));
+  emit_insn (gen_add3 (operands[0], res1, res2));
+}
   DONE;
 })


[gcc r15-235] Support dot_prod optabs for 64-bit vector.

2024-05-07 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:fa911365490a7ca308878517a4af6189ffba7ed6

commit r15-235-gfa911365490a7ca308878517a4af6189ffba7ed6
Author: liuhongt 
Date:   Wed Dec 20 11:43:25 2023 +0800

Support dot_prod optabs for 64-bit vector.

gcc/ChangeLog:

PR target/113079
* config/i386/mmx.md (usdot_prodv8qi): New expander.
(sdot_prodv8qi): Ditto.
(udot_prodv8qi): Ditto.
(usdot_prodv4hi): Ditto.
(udot_prodv4hi): Ditto.
(sdot_prodv4hi): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr113079.c: New test.
* gcc.target/i386/pr113079-2.c: New test.
* gcc.target/i386/sse4-pr113079-2.c: New test.

Diff:
---
 gcc/config/i386/mmx.md  | 195 
 gcc/testsuite/gcc.target/i386/pr113079-2.c  | 161 +++
 gcc/testsuite/gcc.target/i386/pr113079.c|  57 +++
 gcc/testsuite/gcc.target/i386/sse4-pr113079-2.c | 158 +++
 4 files changed, 571 insertions(+)

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 9a8d6030d8b..5f342497885 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -6342,6 +6342,201 @@
   DONE;
 })
 
+(define_expand "usdot_prodv8qi"
+  [(match_operand:V2SI 0 "register_operand")
+   (match_operand:V8QI 1 "register_operand")
+   (match_operand:V8QI 2 "register_operand")
+   (match_operand:V2SI 3 "register_operand")]
+  "TARGET_MMX_WITH_SSE && TARGET_SSE4_1"
+{
+  operands[1] = force_reg (V8QImode, operands[1]);
+  operands[2] = force_reg (V8QImode, operands[2]);
+  operands[3] = force_reg (V2SImode, operands[3]);
+
+  if ((TARGET_AVX512VNNI && TARGET_AVX512VL)
+ || TARGET_AVXVNNI)
+{
+  rtx op1 = lowpart_subreg (V16QImode, operands[1], V8QImode);
+  rtx op2 = lowpart_subreg (V16QImode, operands[2], V8QImode);
+  rtx op3 = lowpart_subreg (V4SImode, operands[3], V2SImode);
+  rtx op0 = gen_reg_rtx (V4SImode);
+
+  emit_insn (gen_usdot_prodv16qi (op0, op1, op2, op3));
+  emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, V4SImode));
+ }
+   else
+ {
+  rtx op1 = gen_reg_rtx (V8HImode);
+  rtx op2 = gen_reg_rtx (V8HImode);
+  rtx op3 = gen_reg_rtx (V4SImode);
+  rtx op0 = gen_reg_rtx (V4SImode);
+  rtx op0_1 = gen_reg_rtx (V4SImode);
+
+  emit_move_insn (op3, CONST0_RTX (V4SImode));
+  emit_insn (gen_zero_extendv8qiv8hi2 (op1, operands[1]));
+  emit_insn (gen_extendv8qiv8hi2 (op2, operands[2]));
+  emit_insn (gen_sdot_prodv8hi (op0, op1, op2, op3));
+
+  /* vec_perm (op0, 2, 3, 0, 1);  */
+  emit_insn (gen_sse2_pshufd (op0_1, op0, GEN_INT (78)));
+  emit_insn (gen_addv4si3 (op0, op0, op0_1));
+  emit_insn (gen_addv2si3 (operands[0], operands[3],
+  lowpart_subreg (V2SImode, op0, V4SImode)));
+ }
+DONE;
+})
+
+(define_expand "sdot_prodv8qi"
+  [(match_operand:V2SI 0 "register_operand")
+   (match_operand:V8QI 1 "register_operand")
+   (match_operand:V8QI 2 "register_operand")
+   (match_operand:V2SI 3 "register_operand")]
+  "TARGET_MMX_WITH_SSE && TARGET_SSE4_1"
+{
+  operands[1] = force_reg (V8QImode, operands[1]);
+  operands[2] = force_reg (V8QImode, operands[2]);
+  operands[3] = force_reg (V2SImode, operands[3]);
+
+  if (TARGET_AVXVNNIINT8)
+{
+  rtx op1 = lowpart_subreg (V16QImode, operands[1], V8QImode);
+  rtx op2 = lowpart_subreg (V16QImode, operands[2], V8QImode);
+  rtx op3 = lowpart_subreg (V4SImode, operands[3], V2SImode);
+  rtx op0 = gen_reg_rtx (V4SImode);
+
+  emit_insn (gen_sdot_prodv16qi (op0, op1, op2, op3));
+  emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, V4SImode));
+}
+  else
+{
+  rtx op1 = gen_reg_rtx (V8HImode);
+  rtx op2 = gen_reg_rtx (V8HImode);
+  rtx op3 = gen_reg_rtx (V4SImode);
+  rtx op0 = gen_reg_rtx (V4SImode);
+  rtx op0_1 = gen_reg_rtx (V4SImode);
+
+  emit_move_insn (op3, CONST0_RTX (V4SImode));
+  emit_insn (gen_extendv8qiv8hi2 (op1, operands[1]));
+  emit_insn (gen_extendv8qiv8hi2 (op2, operands[2]));
+  emit_insn (gen_sdot_prodv8hi (op0, op1, op2, op3));
+
+  /* vec_perm (op0, 2, 3, 0, 1);  */
+  emit_insn (gen_sse2_pshufd (op0_1, op0, GEN_INT (78)));
+  emit_insn (gen_addv4si3 (op0, op0, op0_1));
+  emit_insn (gen_addv2si3 (operands[0], operands[3],
+  lowpart_subreg (V2SImode, op0, V4SImode)));
+}
+  DONE;
+
+})
+
+(define_expand "udot_prodv8qi"
+  [(match_operand:V2SI 0 "register_operand")
+   (match_operand:V8QI 1 "register_operand")
+   (match_operand:V8QI 2 "register_operand")
+   (match_operand:V2SI 3 "register_operand")]
+  "TARGET_MMX_WITH_SSE && TARGET_SSE4_1"
+{
+  operands[1] = force_reg (V8QImode, operands[1]);
+  operands[2] = force_reg (V8QImode, operands[2]);
+  operands[3] = force_reg (V2SImode, operands[3]);
+
+  if 

[gcc r15-167] Update libbid according to the latest Intel Decimal Floating-Point Math Library.

2024-05-05 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:affd77d3fe7bfb525b3fb23316d164e847ed02d1

commit r15-167-gaffd77d3fe7bfb525b3fb23316d164e847ed02d1
Author: liuhongt 
Date:   Wed Mar 27 08:20:13 2024 +0800

Update libbid according to the latest Intel Decimal Floating-Point Math 
Library.

The Intel Decimal Floating-Point Math Library is available as open-source 
on Netlib[1].

[1] https://www.netlib.org/misc/intel/.

libgcc/config/libbid/ChangeLog:

* bid128_fma.c (add_and_round): Fix bug: the result
of (+5E+368)*(+10E-34)+(-10E+369) was returning
-99E+336 instead of expected
result -10E+337.
(bid128_ext_fma): Ditto.
(bid64qqq_fma): Ditto.
* bid128_noncomp.c: Change return type of bid128_class from
int to class_t.
* bid128_round_integral.c: Add default case to avoid compiler
warning.
* bid128_string.c (bid128_to_string): Replace 0x30 with '0'
for zero digit.
(bid128_from_string): Ditto.
* bid32_to_bid128.c (bid128_to_bid32): Fix Bug. In addition
to the INEXACT flag, the UNDERFLOW flag needs to be set (and
was not) when converting an input such as
+6931674235302037148946035460357709E+1857 to +100E-101
* bid32_to_bid64.c (bid64_to_bid32): fix Bug, In addition to
the INEXACT flag, the UNDERFLOW flag needs to be set (and was
not) when converting an input such as +9991E-111
to +100E-101. Furthermore, significant bits of NaNs are
set correctly now. For example,  0x7c3b9aca was
returning 0x7c02 instead of 0x 7c000100.
* bid64_noncomp.c: Change return type of bid64_class from int
to class_t.
* bid64_round_integral.c (bid64_round_integral_exact): Add
default case to avoid compiler warning.
* bid64_string.c (bid64_from_string): Fix bug for rounding
up. The input string "1" was returning
+1001E+1 instead of +1000E+1.
* bid64_to_bid128.c (bid128_to_bid64): Fix bug, in addition to
the INEXACT flag, the UNDERFLOW flag needs to be set (and was
not) when converting an input such as
+99E-417 to
+1000E-398.
* bid_binarydecimal.c (bid32_to_binary64): Fix bug for
conversion between binary and bid types. For example,
0x7c0F4240 was returning 0x7FFFA120 instead of
expected double precision 0x7FF8.
(binary64_to_bid32): Ditto.
(binary80_to_bid32): Ditto.
(binary128_to_bid32): Ditto.
(binary80_to_bid64): Ditto.
(binary128_to_bid64): Ditto.
* bid_conf.h (BID_HIGH_128W): New macro.
(BID_LOW_128W): Ditto.
* bid_functions.h (__ENABLE_BINARY80__): Ditto.
(ALIGN): Ditto.
* bid_inline_add.h (get_add128): Add default case to avoid compiler
warning.
* bid_internal.h (get_BID64): Ditto.
(fast_get_BID64_check_OF): Ditto.
(ALIGN): New macro.

Co-authored-by: Anderson, Cristina S 
Co-authored-by: Akkas, Ahmet 
Co-authored-by: Cornea, Marius 

Diff:
---
 libgcc/config/libbid/bid128_fma.c| 188 ++-
 libgcc/config/libbid/bid128_noncomp.c|   2 +-
 libgcc/config/libbid/bid128_round_integral.c |   2 +
 libgcc/config/libbid/bid128_string.c |   7 +-
 libgcc/config/libbid/bid32_to_bid128.c   |   3 -
 libgcc/config/libbid/bid32_to_bid64.c|  11 +-
 libgcc/config/libbid/bid64_noncomp.c |   2 +-
 libgcc/config/libbid/bid64_round_integral.c  |   2 +
 libgcc/config/libbid/bid64_string.c  |  21 ++-
 libgcc/config/libbid/bid64_to_bid128.c   |   3 -
 libgcc/config/libbid/bid_binarydecimal.c | 167 
 libgcc/config/libbid/bid_conf.h  |   8 ++
 libgcc/config/libbid/bid_functions.h |  23 +++-
 libgcc/config/libbid/bid_inline_add.h|   2 +
 libgcc/config/libbid/bid_internal.h  |  17 +--
 15 files changed, 220 insertions(+), 238 deletions(-)

diff --git a/libgcc/config/libbid/bid128_fma.c 
b/libgcc/config/libbid/bid128_fma.c
index 67233193a42..cbcf225546f 100644
--- a/libgcc/config/libbid/bid128_fma.c
+++ b/libgcc/config/libbid/bid128_fma.c
@@ -417,13 +417,12 @@ add_and_round (int q3,
   R128.w[1] = R256.w[1];
   R128.w[0] = R256.w[0];
 }
+if (e4 + x0 < expmin) { // for all rounding modes
+  is_tiny = 1;
+}
 // the rounded result has p34 = 34 digits
 e4 = e4 + x0 + incr_exp;
-if (rnd_mode == ROUNDING_TO_NEAREST) {
-  if 

[gcc r15-22] Adjust alternative *k to ?k for avx512 mask in zero_extend patterns

2024-04-28 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:c19a674d03847b900919b97d0957c8ae5164f8f1

commit r15-22-gc19a674d03847b900919b97d0957c8ae5164f8f1
Author: liuhongt 
Date:   Tue Apr 16 08:37:22 2024 +0800

Adjust alternative *k to ?k for avx512 mask in zero_extend patterns

So when both source operand and dest operand require avx512 MASK_REGS, RA
can allocate MASK_REGS register instead of GPR to avoid reload it from
GPR to MASK_REGS.

gcc/ChangeLog:

* config/i386/i386.md: (zero_extendsidi2): Adjust
alternative *k to ?k.
(zero_extenddi2): Ditto.
(*zero_extendsi2): Ditto.
(*zero_extendqihi2): Ditto.

Diff:
---
 gcc/config/i386/i386.md  | 16 -
 gcc/testsuite/gcc.target/i386/zero_extendkmask.c | 43 
 2 files changed, 51 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 80e64c603eb..764bfe20ff2 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -4569,10 +4569,10 @@
 
 (define_insn "*zero_extendsidi2"
   [(set (match_operand:DI 0 "nonimmediate_operand"
-   "=r,?r,?o,r   ,o,?*y,?!*y,$r,$v,$x,*x,*v,*r,*k")
+   "=r,?r,?o,r   ,o,?*y,?!*y,$r,$v,$x,*x,*v,?r,?k")
(zero_extend:DI
 (match_operand:SI 1 "x86_64_zext_operand"
-   "0 ,rm,r ,rmWz,0,r  ,m   ,v ,r ,m ,*x,*v,*k,*km")))]
+   "0 ,rm,r ,rmWz,0,r  ,m   ,v ,r ,m ,*x,*v,?k,?km")))]
   ""
 {
   switch (get_attr_type (insn))
@@ -4705,9 +4705,9 @@
   [(QI "avx512dq") (HI "avx512f") (SI "avx512bw") (DI "avx512bw")])
 
 (define_insn "zero_extenddi2"
-  [(set (match_operand:DI 0 "register_operand" "=r,*r,*k")
+  [(set (match_operand:DI 0 "register_operand" "=r,?r,?k")
(zero_extend:DI
-(match_operand:SWI12 1 "nonimmediate_operand" "m,*k,*km")))]
+(match_operand:SWI12 1 "nonimmediate_operand" "m,?k,?km")))]
   "TARGET_64BIT"
   "@
movz{l|x}\t{%1, %k0|%k0, %1}
@@ -4760,9 +4760,9 @@
(set_attr "mode" "SI")])
 
 (define_insn "*zero_extendsi2"
-  [(set (match_operand:SI 0 "register_operand" "=r,*r,*k")
+  [(set (match_operand:SI 0 "register_operand" "=r,?r,?k")
(zero_extend:SI
- (match_operand:SWI12 1 "nonimmediate_operand" "m,*k,*km")))]
+ (match_operand:SWI12 1 "nonimmediate_operand" "m,?k,?km")))]
   "!(TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun))"
   "@
movz{l|x}\t{%1, %0|%0, %1}
@@ -4815,8 +4815,8 @@
 
 ; zero extend to SImode to avoid partial register stalls
 (define_insn "*zero_extendqihi2"
-  [(set (match_operand:HI 0 "register_operand" "=r,*r,*k")
-   (zero_extend:HI (match_operand:QI 1 "nonimmediate_operand" 
"qm,*k,*km")))]
+  [(set (match_operand:HI 0 "register_operand" "=r,?r,?k")
+   (zero_extend:HI (match_operand:QI 1 "nonimmediate_operand" 
"qm,?k,?km")))]
   "!(TARGET_ZERO_EXTEND_WITH_AND && optimize_function_for_speed_p (cfun))"
   "@
movz{bl|x}\t{%1, %k0|%k0, %1}
diff --git a/gcc/testsuite/gcc.target/i386/zero_extendkmask.c 
b/gcc/testsuite/gcc.target/i386/zero_extendkmask.c
new file mode 100644
index 000..6b18980bbd1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/zero_extendkmask.c
@@ -0,0 +1,43 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-not {(?n)shr[bwl]} } } */
+/* { dg-final { scan-assembler-not {(?n)movz[bw]} } } */
+
+#include
+
+__m512
+foo (__m512d a, __m512d b, __m512 c, __m512 d)
+{
+  return _mm512_mask_mov_ps (c, (__mmask16) (_mm512_cmpeq_pd_mask (a, b) >> 
1), d);
+}
+
+
+__m512i
+foo1 (__m512d a, __m512d b, __m512i c, __m512i d)
+{
+  return _mm512_mask_mov_epi16 (c, (__mmask32) (_mm512_cmpeq_pd_mask (a, b) >> 
1), d);
+}
+
+__m512i
+foo2 (__m512d a, __m512d b, __m512i c, __m512i d)
+{
+  return _mm512_mask_mov_epi8 (c, (__mmask64) (_mm512_cmpeq_pd_mask (a, b) >> 
1), d);
+}
+
+__m512i
+foo3 (__m512 a, __m512 b, __m512i c, __m512i d)
+{
+  return _mm512_mask_mov_epi16 (c, (__mmask32) (_mm512_cmpeq_ps_mask (a, b) >> 
1), d);
+}
+
+__m512i
+foo4 (__m512 a, __m512 b, __m512i c, __m512i d)
+{
+  return _mm512_mask_mov_epi8 (c, (__mmask64) (_mm512_cmpeq_ps_mask (a, b) >> 
1), d);
+}
+
+__m512i
+foo5 (__m512i a, __m512i b, __m512i c, __m512i d)
+{
+  return _mm512_mask_mov_epi8 (c, (__mmask64) (_mm512_cmp_epi16_mask (a, b, 5) 
>> 1), d);
+}


[gcc r13-8488] Move pr114396.c from gcc.target/i386 to gcc.c-torture/execute.

2024-03-21 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:e6a3d1f5bcfd954b614155d96c97bde8ac230e2e

commit r13-8488-ge6a3d1f5bcfd954b614155d96c97bde8ac230e2e
Author: liuhongt 
Date:   Fri Mar 22 10:09:43 2024 +0800

Move pr114396.c from gcc.target/i386 to gcc.c-torture/execute.

Also fixed a typo in the testcase.

gcc/testsuite/ChangeLog:

PR tree-optimization/114396
* gcc.target/i386/pr114396.c: Move to...
* gcc.c-torture/execute/pr114396.c: ...here.

(cherry picked from commit 9a6c7aa1b011b77fcd9b19f7b8d7ff0fc823cdb2)

Diff:
---
 gcc/testsuite/{gcc.target/i386 => gcc.c-torture/execute}/pr114396.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr114396.c 
b/gcc/testsuite/gcc.c-torture/execute/pr114396.c
similarity index 92%
rename from gcc/testsuite/gcc.target/i386/pr114396.c
rename to gcc/testsuite/gcc.c-torture/execute/pr114396.c
index 4c4015f871f..baf90eafabf 100644
--- a/gcc/testsuite/gcc.target/i386/pr114396.c
+++ b/gcc/testsuite/gcc.c-torture/execute/pr114396.c
@@ -1,5 +1,5 @@
-/* { dg-do run } */
-/* { dg-options "-O1 -fwrapv -fno-vect-cost-model" } */
+/* PR tree-optimization/114396 */
+/* { dg-additional-options "-fwrapv -fno-vect-cost-model" } */
 
 short a = 0xF;
 short b[16];
@@ -88,7 +88,7 @@ int main() {
 
   exp = foo1 (a);
   res = foo1_o3 (a);
-  if (uexp != ures)
+  if (exp != res)
 __builtin_abort ();
 
   uexp = foou (a);


[gcc r14-9603] Move pr114396.c from gcc.target/i386 to gcc.c-torture/execute.

2024-03-21 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:9a6c7aa1b011b77fcd9b19f7b8d7ff0fc823cdb2

commit r14-9603-g9a6c7aa1b011b77fcd9b19f7b8d7ff0fc823cdb2
Author: liuhongt 
Date:   Fri Mar 22 10:09:43 2024 +0800

Move pr114396.c from gcc.target/i386 to gcc.c-torture/execute.

Also fixed a typo in the testcase.

gcc/testsuite/ChangeLog:

PR tree-optimization/114396
* gcc.target/i386/pr114396.c: Move to...
* gcc.c-torture/execute/pr114396.c: ...here.

Diff:
---
 gcc/testsuite/{gcc.target/i386 => gcc.c-torture/execute}/pr114396.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr114396.c 
b/gcc/testsuite/gcc.c-torture/execute/pr114396.c
similarity index 92%
rename from gcc/testsuite/gcc.target/i386/pr114396.c
rename to gcc/testsuite/gcc.c-torture/execute/pr114396.c
index 4c4015f871f..baf90eafabf 100644
--- a/gcc/testsuite/gcc.target/i386/pr114396.c
+++ b/gcc/testsuite/gcc.c-torture/execute/pr114396.c
@@ -1,5 +1,5 @@
-/* { dg-do run } */
-/* { dg-options "-O1 -fwrapv -fno-vect-cost-model" } */
+/* PR tree-optimization/114396 */
+/* { dg-additional-options "-fwrapv -fno-vect-cost-model" } */
 
 short a = 0xF;
 short b[16];
@@ -88,7 +88,7 @@ int main() {
 
   exp = foo1 (a);
   res = foo1_o3 (a);
-  if (uexp != ures)
+  if (exp != res)
 __builtin_abort ();
 
   uexp = foou (a);


[gcc r13-8475] Fix runtime error for nonlinear iv vectorization(step_mult).

2024-03-21 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:199b021a38f30b681e0dbecd2d0296beabd50b13

commit r13-8475-g199b021a38f30b681e0dbecd2d0296beabd50b13
Author: liuhongt 
Date:   Thu Mar 21 13:15:23 2024 +0800

Fix runtime error for nonlinear iv vectorization(step_mult).

wi::from_mpz doesn't take a sign argument, we want it to be wrapped
instead of saturation, so pass utype and true to it, and it fixes the
bug.

gcc/ChangeLog:

PR tree-optimization/114396
* tree-vect-loop.cc (vect_peel_nonlinear_iv_init): Pass utype
and true to wi::from_mpz.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114396.c: New test.

(cherry picked from commit ac2f8c2a367151fc0410f904339c475a953cffc8)

Diff:
---
 gcc/testsuite/gcc.target/i386/pr114396.c | 105 +++
 gcc/tree-vect-loop.cc|   2 +-
 2 files changed, 106 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr114396.c 
b/gcc/testsuite/gcc.target/i386/pr114396.c
new file mode 100644
index 000..4c4015f871f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114396.c
@@ -0,0 +1,105 @@
+/* { dg-do run } */
+/* { dg-options "-O1 -fwrapv -fno-vect-cost-model" } */
+
+short a = 0xF;
+short b[16];
+unsigned short ua = 0xF;
+unsigned short ub[16];
+
+short
+__attribute__((noipa))
+foo (short a)
+{
+  for (int e = 0; e < 9; e += 1)
+b[e] = a *= 5;
+  return a;
+}
+
+short
+__attribute__((noipa))
+foo1 (short a)
+{
+  for (int e = 0; e < 9; e += 1)
+b[e] = a *= -5;
+  return a;
+}
+
+unsigned short
+__attribute__((noipa))
+foou (unsigned short a)
+{
+  for (int e = 0; e < 9; e += 1)
+ub[e] = a *= -5;
+  return a;
+}
+
+unsigned short
+__attribute__((noipa))
+foou1 (unsigned short a)
+{
+  for (int e = 0; e < 9; e += 1)
+ub[e] = a *= 5;
+  return a;
+}
+
+short
+__attribute__((noipa,optimize("O3")))
+foo_o3 (short a)
+{
+  for (int e = 0; e < 9; e += 1)
+b[e] = a *= 5;
+  return a;
+}
+
+short
+__attribute__((noipa,optimize("O3")))
+foo1_o3 (short a)
+{
+  for (int e = 0; e < 9; e += 1)
+b[e] = a *= -5;
+  return a;
+}
+
+unsigned short
+__attribute__((noipa,optimize("O3")))
+foou_o3 (unsigned short a)
+{
+  for (int e = 0; e < 9; e += 1)
+ub[e] = a *= -5;
+  return a;
+}
+
+unsigned short
+__attribute__((noipa,optimize("O3")))
+foou1_o3 (unsigned short a)
+{
+  for (int e = 0; e < 9; e += 1)
+ub[e] = a *= 5;
+  return a;
+}
+
+int main() {
+  unsigned short uexp, ures;
+  short exp, res;
+  exp = foo (a);
+  res = foo_o3 (a);
+  if (exp != res)
+__builtin_abort ();
+
+  exp = foo1 (a);
+  res = foo1_o3 (a);
+  if (uexp != ures)
+__builtin_abort ();
+
+  uexp = foou (a);
+  ures = foou_o3 (a);
+  if (uexp != ures)
+__builtin_abort ();
+
+  uexp = foou1 (a);
+  ures = foou1_o3 (a);
+  if (uexp != ures)
+__builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index d08d4996771..9615161ad37 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -8730,7 +8730,7 @@ vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree 
init_expr,
wi::to_mpz (skipn, exp, UNSIGNED);
mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
mpz_powm (res, base, exp, mod);
-   begin = wi::from_mpz (type, res, TYPE_SIGN (type));
+   begin = wi::from_mpz (utype, res, true);
tree mult_expr = wide_int_to_tree (utype, begin);
init_expr = gimple_build (stmts, MULT_EXPR, utype,
  init_expr, mult_expr);


[gcc r14-9591] Fix runtime error for nonlinear iv vectorization(step_mult).

2024-03-21 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:ac2f8c2a367151fc0410f904339c475a953cffc8

commit r14-9591-gac2f8c2a367151fc0410f904339c475a953cffc8
Author: liuhongt 
Date:   Thu Mar 21 13:15:23 2024 +0800

Fix runtime error for nonlinear iv vectorization(step_mult).

wi::from_mpz doesn't take a sign argument, we want it to be wrapped
instead of saturation, so pass utype and true to it, and it fixes the
bug.

gcc/ChangeLog:

PR tree-optimization/114396
* tree-vect-loop.cc (vect_peel_nonlinear_iv_init): Pass utype
and true to wi::from_mpz.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114396.c: New test.

Diff:
---
 gcc/testsuite/gcc.target/i386/pr114396.c | 105 +++
 gcc/tree-vect-loop.cc|   2 +-
 2 files changed, 106 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr114396.c 
b/gcc/testsuite/gcc.target/i386/pr114396.c
new file mode 100644
index 000..4c4015f871f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114396.c
@@ -0,0 +1,105 @@
+/* { dg-do run } */
+/* { dg-options "-O1 -fwrapv -fno-vect-cost-model" } */
+
+short a = 0xF;
+short b[16];
+unsigned short ua = 0xF;
+unsigned short ub[16];
+
+short
+__attribute__((noipa))
+foo (short a)
+{
+  for (int e = 0; e < 9; e += 1)
+b[e] = a *= 5;
+  return a;
+}
+
+short
+__attribute__((noipa))
+foo1 (short a)
+{
+  for (int e = 0; e < 9; e += 1)
+b[e] = a *= -5;
+  return a;
+}
+
+unsigned short
+__attribute__((noipa))
+foou (unsigned short a)
+{
+  for (int e = 0; e < 9; e += 1)
+ub[e] = a *= -5;
+  return a;
+}
+
+unsigned short
+__attribute__((noipa))
+foou1 (unsigned short a)
+{
+  for (int e = 0; e < 9; e += 1)
+ub[e] = a *= 5;
+  return a;
+}
+
+short
+__attribute__((noipa,optimize("O3")))
+foo_o3 (short a)
+{
+  for (int e = 0; e < 9; e += 1)
+b[e] = a *= 5;
+  return a;
+}
+
+short
+__attribute__((noipa,optimize("O3")))
+foo1_o3 (short a)
+{
+  for (int e = 0; e < 9; e += 1)
+b[e] = a *= -5;
+  return a;
+}
+
+unsigned short
+__attribute__((noipa,optimize("O3")))
+foou_o3 (unsigned short a)
+{
+  for (int e = 0; e < 9; e += 1)
+ub[e] = a *= -5;
+  return a;
+}
+
+unsigned short
+__attribute__((noipa,optimize("O3")))
+foou1_o3 (unsigned short a)
+{
+  for (int e = 0; e < 9; e += 1)
+ub[e] = a *= 5;
+  return a;
+}
+
+int main() {
+  unsigned short uexp, ures;
+  short exp, res;
+  exp = foo (a);
+  res = foo_o3 (a);
+  if (exp != res)
+__builtin_abort ();
+
+  exp = foo1 (a);
+  res = foo1_o3 (a);
+  if (uexp != ures)
+__builtin_abort ();
+
+  uexp = foou (a);
+  ures = foou_o3 (a);
+  if (uexp != ures)
+__builtin_abort ();
+
+  uexp = foou1 (a);
+  ures = foou1_o3 (a);
+  if (uexp != ures)
+__builtin_abort ();
+
+  return 0;
+}
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 4375ebdcb49..2921a9e6aa1 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -9454,7 +9454,7 @@ vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree 
init_expr,
wi::to_mpz (skipn, exp, UNSIGNED);
mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
mpz_powm (res, base, exp, mod);
-   begin = wi::from_mpz (type, res, TYPE_SIGN (type));
+   begin = wi::from_mpz (utype, res, true);
tree mult_expr = wide_int_to_tree (utype, begin);
init_expr = gimple_build (stmts, MULT_EXPR, utype,
  init_expr, mult_expr);


[gcc r14-9588] Document -fexcess-precision=16.

2024-03-20 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:415091f09096a0ebba1fdcd4af8c2fda24cfd411

commit r14-9588-g415091f09096a0ebba1fdcd4af8c2fda24cfd411
Author: liuhongt 
Date:   Mon Mar 18 18:53:59 2024 +0800

Document -fexcess-precision=16.

gcc/ChangeLog:

PR middle-end/114347
* doc/invoke.texi: Document -fexcess-precision=16.

Diff:
---
 gcc/doc/invoke.texi | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index b446b2905c7..e0950ca5dc2 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -14931,6 +14931,9 @@ assignments).  This option is enabled by default for C 
or C++ if a strict
 conformance option such as @option{-std=c99} or @option{-std=c++17} is used.
 @option{-ffast-math} enables @option{-fexcess-precision=fast} by default
 regardless of whether a strict conformance option is used.
+If @option{-fexcess-precision=16} is specified, constants and the
+results of expressions with types @code{_Float16} and @code{__bf16}
+are computed without excess precision.
 
 @opindex mfpmath
 @option{-fexcess-precision=standard} is not implemented for languages


[gcc r14-9512] Add missing hf/bf patterns.

2024-03-17 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:942d470a5a4fb1baeff943127a81b441dffaa543

commit r14-9512-g942d470a5a4fb1baeff943127a81b441dffaa543
Author: liuhongt 
Date:   Fri Mar 15 10:59:10 2024 +0800

Add missing hf/bf patterns.

It will be used by copysignm3/xorsignm3/lroundmn2 expanders.

gcc/ChangeLog:

PR target/114334
* config/i386/i386.md (mode): Add new number V8BF,V16BF,V32BF.
(MODEF248): New mode iterator.
(ssevecmodesuffix): Hanlde BF and HF.
* config/i386/sse.md (andnot3): Extend to HF/BF.
(3): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114334.c: New test.

Diff:
---
 gcc/config/i386/i386.md  | 13 +
 gcc/config/i386/sse.md   | 22 +++---
 gcc/testsuite/gcc.target/i386/pr114334.c |  8 
 3 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index df97a2d6270..11fdc6af3fa 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -543,8 +543,9 @@
 
 ;; Main data type used by the insn
 (define_attr "mode"
-  "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,BF,SF,DF,XF,TF,V32HF,V16HF,V8HF,
-   V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF,V4HF,V4BF,V2HF,V2BF"
+  "unknown,none,QI,HI,SI,DI,TI,OI,XI,HF,BF,SF,DF,XF,TF,
+   V32HF,V16HF,V8HF,V4HF,V2HF,V32BF,V16BF,V8BF,V4BF,V2BF,
+   V16SF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF,V8DF"
   (const_string "unknown"))
 
 ;; The CPU unit operations uses.
@@ -1323,6 +1324,8 @@
 ;; SSE and x87 SFmode and DFmode floating point modes
 (define_mode_iterator MODEF [SF DF])
 
+(define_mode_iterator MODEF248 [BF HF SF (DF "TARGET_SSE2")])
+
 ;; SSE floating point modes
 (define_mode_iterator MODEFH [(HF "TARGET_AVX512FP16") SF DF])
 
@@ -1347,7 +1350,8 @@
(V64QI "b") (V32HI "w") (V16SI "d") (V8DI "q")])
 
 ;; SSE vector suffix for floating point modes
-(define_mode_attr ssevecmodesuffix [(SF "ps") (DF "pd")])
+;; BF HF use same suffix as SF for logic operations.
+(define_mode_attr ssevecmodesuffix [(BF "ps") (HF "ps") (SF "ps") (DF "pd")])
 
 ;; SSE vector mode corresponding to a scalar mode
 (define_mode_attr ssevecmode
@@ -1357,7 +1361,8 @@
 
 ;; AVX512F vector mode corresponding to a scalar mode
 (define_mode_attr avx512fvecmode
-  [(QI "V64QI") (HI "V32HI") (SI "V16SI") (DI "V8DI") (SF "V16SF") (DF 
"V8DF")])
+  [(QI "V64QI") (HI "V32HI") (SI "V16SI") (DI "V8DI")
+   (HF "V32HF") (BF "V32BF") (SF "V16SF") (DF "V8DF")])
 
 ;; Instruction suffix for REX 64bit operators.
 (define_mode_attr rex64suffix [(SI "{l}") (DI "{q}")])
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 1bc614ab702..3286d3a4fac 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -5125,12 +5125,12 @@
 ;; because the native instructions read the full 128-bits.
 
 (define_insn "*andnot3"
-  [(set (match_operand:MODEF 0 "register_operand" "=x,x,v,v")
-   (and:MODEF
- (not:MODEF
-   (match_operand:MODEF 1 "register_operand" "0,x,v,v"))
-   (match_operand:MODEF 2 "register_operand" "x,x,v,v")))]
-  "SSE_FLOAT_MODE_P (mode)"
+  [(set (match_operand:MODEF248 0 "register_operand" "=x,x,v,v")
+   (and:MODEF248
+ (not:MODEF248
+   (match_operand:MODEF248 1 "register_operand" "0,x,v,v"))
+   (match_operand:MODEF248 2 "register_operand" "x,x,v,v")))]
+  "TARGET_SSE"
 {
   char buf[128];
   const char *ops;
@@ -5257,11 +5257,11 @@
  (const_string "TI")))])
 
 (define_insn "3"
-  [(set (match_operand:MODEF 0 "register_operand" "=x,x,v,v")
-   (any_logic:MODEF
- (match_operand:MODEF 1 "register_operand" "%0,x,v,v")
- (match_operand:MODEF 2 "register_operand" "x,x,v,v")))]
-  "SSE_FLOAT_MODE_P (mode)"
+  [(set (match_operand:MODEF248 0 "register_operand" "=x,x,v,v")
+   (any_logic:MODEF248
+ (match_operand:MODEF248 1 "register_operand" "%0,x,v,v")
+ (match_operand:MODEF248 2 "register_operand" "x,x,v,v")))]
+  "TARGET_SSE"
 {
   char buf[128];
   const char *ops;
diff --git a/gcc/testsuite/gcc.target/i386/pr114334.c 
b/gcc/testsuite/gcc.target/i386/pr114334.c
new file mode 100644
index 000..8e38e24cd16
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114334.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mavx512fp16" } */
+
+long
+foo(_Float16 f)
+{
+  return __builtin_lroundf16(f);
+}


[gcc r12-10214] i386[stv]: Handle REG_EH_REGION note

2024-03-14 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:a861f940efffae2782c559cd04df2d2740cd28bd

commit r12-10214-ga861f940efffae2782c559cd04df2d2740cd28bd
Author: liuhongt 
Date:   Wed Mar 13 10:40:01 2024 +0800

i386[stv]: Handle REG_EH_REGION note

When we split
(insn 37 36 38 10 (set (reg:DI 104 [ _18 ])
(mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) [6 MEM[(struct 
SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32])) "test.C":22:42 84 
{*movdi_internal}
 (expr_list:REG_EH_REGION (const_int -11 [0xfff5])

into

(insn 104 36 37 10 (set (subreg:V2DI (reg:DI 124) 0)
(vec_concat:V2DI (mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) 
[6 MEM[(struct SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32])
(const_int 0 [0]))) "test.C":22:42 -1
(nil)))
(insn 37 104 105 10 (set (subreg:V2DI (reg:DI 104 [ _18 ]) 0)
(subreg:V2DI (reg:DI 124) 0)) "test.C":22:42 2024 {movv2di_internal}
 (expr_list:REG_EH_REGION (const_int -11 [0xfff5])
(nil)))

we must copy the REG_EH_REGION note to the first insn and split the block
after the newly added insn.  The REG_EH_REGION on the second insn will be
removed later since it no longer traps.

gcc/ChangeLog:

* config/i386/i386-features.cc
(general_scalar_chain::convert_op): Handle REG_EH_REGION note.
(convert_scalars_to_vector): Ditto.
* config/i386/i386-features.h (class scalar_chain): New
memeber control_flow_insns.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr111822.C: New test.

(cherry picked from commit 618e34d56cc38e9c3ae95a413228068e53ed76bb)

Diff:
---
 gcc/config/i386/i386-features.cc | 50 
 gcc/config/i386/i386-features.h  |  1 +
 gcc/testsuite/g++.target/i386/pr111822.C | 45 
 3 files changed, 91 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 6a2444eb6b6..37f22ba3733 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -871,20 +871,36 @@ general_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
 }
   else if (MEM_P (*op))
 {
+  rtx_insn* eh_insn, *movabs = NULL;
   rtx tmp = gen_reg_rtx (GET_MODE (*op));
 
-  /* Handle movabs.  */
+  /* Emit MOVABS to load from a 64-bit absolute address to a GPR.  */
   if (!memory_operand (*op, GET_MODE (*op)))
{
  rtx tmp2 = gen_reg_rtx (GET_MODE (*op));
+ movabs = emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
 
- emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
  *op = tmp2;
}
 
-  emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
-gen_gpr_to_xmm_move_src (vmode, *op)),
-   insn);
+  eh_insn
+   = emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
+gen_gpr_to_xmm_move_src (vmode, *op)),
+   insn);
+
+  if (cfun->can_throw_non_call_exceptions)
+   {
+ /* Handle REG_EH_REGION note.  */
+ rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
+ if (note)
+   {
+ if (movabs)
+   eh_insn = movabs;
+ control_flow_insns.safe_push (eh_insn);
+ add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0));
+   }
+   }
+
   *op = gen_rtx_SUBREG (vmode, tmp, 0);
 
   if (dump_file)
@@ -1681,6 +1697,7 @@ convert_scalars_to_vector (bool timode_p)
 {
   basic_block bb;
   int converted_insns = 0;
+  auto_vec control_flow_insns;
 
   bitmap_obstack_initialize (NULL);
   const machine_mode cand_mode[3] = { SImode, DImode, TImode };
@@ -1759,6 +1776,11 @@ convert_scalars_to_vector (bool timode_p)
fprintf (dump_file, "Chain #%d conversion is not profitable\n",
 chain->chain_id);
 
+   rtx_insn* iter_insn;
+   unsigned int ii;
+   FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn)
+ control_flow_insns.safe_push (iter_insn);
+
delete chain;
   }
 
@@ -1826,6 +1848,24 @@ convert_scalars_to_vector (bool timode_p)
  DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
  }
  }
+
+  if (!control_flow_insns.is_empty ())
+   {
+ free_dominance_info (CDI_DOMINATORS);
+
+ unsigned int i;
+ rtx_insn* insn;
+ FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
+   if (control_flow_insn_p (insn))
+ {
+   /* Split the block after insn.  There will be a fallthru
+  edge, which is OK so we keep it.  We have to create
+  the exception edges ourselves.  */
+   bb = BLOCK_FOR_INSN 

[gcc r13-8438] i386[stv]: Handle REG_EH_REGION note

2024-03-14 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:bdbcfbfcf591381f0faf95c881e3772b56d0a404

commit r13-8438-gbdbcfbfcf591381f0faf95c881e3772b56d0a404
Author: liuhongt 
Date:   Wed Mar 13 10:40:01 2024 +0800

i386[stv]: Handle REG_EH_REGION note

When we split
(insn 37 36 38 10 (set (reg:DI 104 [ _18 ])
(mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) [6 MEM[(struct 
SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32])) "test.C":22:42 84 
{*movdi_internal}
 (expr_list:REG_EH_REGION (const_int -11 [0xfff5])

into

(insn 104 36 37 10 (set (subreg:V2DI (reg:DI 124) 0)
(vec_concat:V2DI (mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) 
[6 MEM[(struct SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32])
(const_int 0 [0]))) "test.C":22:42 -1
(nil)))
(insn 37 104 105 10 (set (subreg:V2DI (reg:DI 104 [ _18 ]) 0)
(subreg:V2DI (reg:DI 124) 0)) "test.C":22:42 2024 {movv2di_internal}
 (expr_list:REG_EH_REGION (const_int -11 [0xfff5])
(nil)))

we must copy the REG_EH_REGION note to the first insn and split the block
after the newly added insn.  The REG_EH_REGION on the second insn will be
removed later since it no longer traps.

gcc/ChangeLog:

* config/i386/i386-features.cc
(general_scalar_chain::convert_op): Handle REG_EH_REGION note.
(convert_scalars_to_vector): Ditto.
* config/i386/i386-features.h (class scalar_chain): New
memeber control_flow_insns.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr111822.C: New test.

(cherry picked from commit 618e34d56cc38e9c3ae95a413228068e53ed76bb)

Diff:
---
 gcc/config/i386/i386-features.cc | 50 
 gcc/config/i386/i386-features.h  |  1 +
 gcc/testsuite/g++.target/i386/pr111822.C | 45 
 3 files changed, 91 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 74ee14a584a..ed3055b43f8 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -913,20 +913,36 @@ general_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
 }
   else if (MEM_P (*op))
 {
+  rtx_insn* eh_insn, *movabs = NULL;
   rtx tmp = gen_reg_rtx (GET_MODE (*op));
 
-  /* Handle movabs.  */
+  /* Emit MOVABS to load from a 64-bit absolute address to a GPR.  */
   if (!memory_operand (*op, GET_MODE (*op)))
{
  rtx tmp2 = gen_reg_rtx (GET_MODE (*op));
+ movabs = emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
 
- emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
  *op = tmp2;
}
 
-  emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
-gen_gpr_to_xmm_move_src (vmode, *op)),
-   insn);
+  eh_insn
+   = emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
+gen_gpr_to_xmm_move_src (vmode, *op)),
+   insn);
+
+  if (cfun->can_throw_non_call_exceptions)
+   {
+ /* Handle REG_EH_REGION note.  */
+ rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
+ if (note)
+   {
+ if (movabs)
+   eh_insn = movabs;
+ control_flow_insns.safe_push (eh_insn);
+ add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0));
+   }
+   }
+
   *op = gen_rtx_SUBREG (vmode, tmp, 0);
 
   if (dump_file)
@@ -2215,6 +2231,7 @@ convert_scalars_to_vector (bool timode_p)
 {
   basic_block bb;
   int converted_insns = 0;
+  auto_vec control_flow_insns;
 
   bitmap_obstack_initialize (NULL);
   const machine_mode cand_mode[3] = { SImode, DImode, TImode };
@@ -2296,6 +2313,11 @@ convert_scalars_to_vector (bool timode_p)
 chain->chain_id);
}
 
+ rtx_insn* iter_insn;
+ unsigned int ii;
+ FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn)
+   control_flow_insns.safe_push (iter_insn);
+
  delete chain;
}
 }
@@ -2364,6 +2386,24 @@ convert_scalars_to_vector (bool timode_p)
  DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
  }
  }
+
+  if (!control_flow_insns.is_empty ())
+   {
+ free_dominance_info (CDI_DOMINATORS);
+
+ unsigned int i;
+ rtx_insn* insn;
+ FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
+   if (control_flow_insn_p (insn))
+ {
+   /* Split the block after insn.  There will be a fallthru
+  edge, which is OK so we keep it.  We have to create
+  the exception edges ourselves.  */
+   bb = BLOCK_FOR_INSN (insn);
+   split_block (bb, 

[gcc r14-9459] i386[stv]: Handle REG_EH_REGION note

2024-03-14 Thread hongtao Liu via Gcc-cvs
https://gcc.gnu.org/g:618e34d56cc38e9c3ae95a413228068e53ed76bb

commit r14-9459-g618e34d56cc38e9c3ae95a413228068e53ed76bb
Author: liuhongt 
Date:   Wed Mar 13 10:40:01 2024 +0800

i386[stv]: Handle REG_EH_REGION note

When we split
(insn 37 36 38 10 (set (reg:DI 104 [ _18 ])
(mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) [6 MEM[(struct 
SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32])) "test.C":22:42 84 
{*movdi_internal}
 (expr_list:REG_EH_REGION (const_int -11 [0xfff5])

into

(insn 104 36 37 10 (set (subreg:V2DI (reg:DI 124) 0)
(vec_concat:V2DI (mem:DI (reg/f:SI 98 [ CallNative_nclosure.0_1 ]) 
[6 MEM[(struct SQRefCounted *)CallNative_nclosure.0_1]._uiRef+0 S8 A32])
(const_int 0 [0]))) "test.C":22:42 -1
(nil)))
(insn 37 104 105 10 (set (subreg:V2DI (reg:DI 104 [ _18 ]) 0)
(subreg:V2DI (reg:DI 124) 0)) "test.C":22:42 2024 {movv2di_internal}
 (expr_list:REG_EH_REGION (const_int -11 [0xfff5])
(nil)))

we must copy the REG_EH_REGION note to the first insn and split the block
after the newly added insn.  The REG_EH_REGION on the second insn will be
removed later since it no longer traps.

gcc/ChangeLog:

* config/i386/i386-features.cc
(general_scalar_chain::convert_op): Handle REG_EH_REGION note.
(convert_scalars_to_vector): Ditto.
* config/i386/i386-features.h (class scalar_chain): New
memeber control_flow_insns.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr111822.C: New test.

Diff:
---
 gcc/config/i386/i386-features.cc | 50 
 gcc/config/i386/i386-features.h  |  1 +
 gcc/testsuite/g++.target/i386/pr111822.C | 45 
 3 files changed, 91 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 1de2a07ed75..c7d7a965901 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -998,20 +998,36 @@ general_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
 }
   else if (MEM_P (*op))
 {
+  rtx_insn* eh_insn, *movabs = NULL;
   rtx tmp = gen_reg_rtx (GET_MODE (*op));
 
-  /* Handle movabs.  */
+  /* Emit MOVABS to load from a 64-bit absolute address to a GPR.  */
   if (!memory_operand (*op, GET_MODE (*op)))
{
  rtx tmp2 = gen_reg_rtx (GET_MODE (*op));
+ movabs = emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
 
- emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
  *op = tmp2;
}
 
-  emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
-gen_gpr_to_xmm_move_src (vmode, *op)),
-   insn);
+  eh_insn
+   = emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
+gen_gpr_to_xmm_move_src (vmode, *op)),
+   insn);
+
+  if (cfun->can_throw_non_call_exceptions)
+   {
+ /* Handle REG_EH_REGION note.  */
+ rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
+ if (note)
+   {
+ if (movabs)
+   eh_insn = movabs;
+ control_flow_insns.safe_push (eh_insn);
+ add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0));
+   }
+   }
+
   *op = gen_rtx_SUBREG (vmode, tmp, 0);
 
   if (dump_file)
@@ -2494,6 +2510,7 @@ convert_scalars_to_vector (bool timode_p)
 {
   basic_block bb;
   int converted_insns = 0;
+  auto_vec control_flow_insns;
 
   bitmap_obstack_initialize (NULL);
   const machine_mode cand_mode[3] = { SImode, DImode, TImode };
@@ -2575,6 +2592,11 @@ convert_scalars_to_vector (bool timode_p)
 chain->chain_id);
}
 
+ rtx_insn* iter_insn;
+ unsigned int ii;
+ FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn)
+   control_flow_insns.safe_push (iter_insn);
+
  delete chain;
}
 }
@@ -2643,6 +2665,24 @@ convert_scalars_to_vector (bool timode_p)
  DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
  }
  }
+
+  if (!control_flow_insns.is_empty ())
+   {
+ free_dominance_info (CDI_DOMINATORS);
+
+ unsigned int i;
+ rtx_insn* insn;
+ FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
+   if (control_flow_insn_p (insn))
+ {
+   /* Split the block after insn.  There will be a fallthru
+  edge, which is OK so we keep it.  We have to create
+  the exception edges ourselves.  */
+   bb = BLOCK_FOR_INSN (insn);
+   split_block (bb, insn);
+   rtl_make_eh_edge (NULL, bb, BB_END (bb));
+ }
+