from:"liuhongt via Gcc\-patches"

[PATCH] Remove constraint modifier % for fcmaddcph/fmaddcph/fcmulcph since they're not commutative.

2023-09-10 Thread liuhongt via Gcc-patches

Here's the patch I've committed.
The patch also removes % for vfmaddcph.

gcc/ChangeLog:

PR target/111306
PR target/111335
* config/i386/sse.md (int_comm): New int_attr.
(fma__):
Remove % for Complex conjugate operations since they're not
commutative.
(fma___pair): Ditto.
(___mask): Ditto.
(cmul3): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr111306.c: New test.
---
 gcc/config/i386/sse.md   | 16 ---
 gcc/testsuite/gcc.target/i386/pr111306.c | 36 
 2 files changed, 48 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr111306.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 6d3ae8dea0c..14615999394 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -6480,6 +6480,14 @@ (define_int_attr complexpairopname
[(UNSPEC_COMPLEX_FMA_PAIR "fmaddc")
 (UNSPEC_COMPLEX_FCMA_PAIR "fcmaddc")])
 
+(define_int_attr int_comm
+   [(UNSPEC_COMPLEX_FMA "")
+(UNSPEC_COMPLEX_FMA_PAIR "")
+(UNSPEC_COMPLEX_FCMA "")
+(UNSPEC_COMPLEX_FCMA_PAIR "")
+(UNSPEC_COMPLEX_FMUL "%")
+(UNSPEC_COMPLEX_FCMUL "")])
+
 (define_int_attr conj_op
[(UNSPEC_COMPLEX_FMA "")
 (UNSPEC_COMPLEX_FCMA "_conj")
@@ -6593,7 +6601,7 @@ (define_expand "cmla4"
 (define_insn "fma__"
   [(set (match_operand:VHF_AVX512VL 0 "register_operand" "=")
(unspec:VHF_AVX512VL
- [(match_operand:VHF_AVX512VL 1 "" "%v")
+ [(match_operand:VHF_AVX512VL 1 "" "v")
   (match_operand:VHF_AVX512VL 2 "" 
"")
   (match_operand:VHF_AVX512VL 3 "" "0")]
   UNSPEC_COMPLEX_F_C_MA))]
@@ -6658,7 +,7 @@ (define_insn_and_split 
"fma___fma_zero"
 (define_insn "fma___pair"
  [(set (match_operand:VF1_AVX512VL 0 "register_operand" "=")
(unspec:VF1_AVX512VL
-[(match_operand:VF1_AVX512VL 1 "vector_operand" "%v")
+[(match_operand:VF1_AVX512VL 1 "vector_operand" "v")
  (match_operand:VF1_AVX512VL 2 "bcst_vector_operand" "vmBr")
  (match_operand:VF1_AVX512VL 3 "vector_operand" "0")]
  UNSPEC_COMPLEX_F_C_MA_PAIR))]
@@ -6727,7 +6735,7 @@ (define_insn 
"___mask"
   [(set (match_operand:VHF_AVX512VL 0 "register_operand" "=")
(vec_merge:VHF_AVX512VL
  (unspec:VHF_AVX512VL
-   [(match_operand:VHF_AVX512VL 1 "nonimmediate_operand" "%v")
+   [(match_operand:VHF_AVX512VL 1 "nonimmediate_operand" "v")
 (match_operand:VHF_AVX512VL 2 "nonimmediate_operand" 
"")
 (match_operand:VHF_AVX512VL 3 "register_operand" "0")]
 UNSPEC_COMPLEX_F_C_MA)
@@ -6752,7 +6760,7 @@ (define_expand "cmul3"
 (define_insn "__"
   [(set (match_operand:VHF_AVX512VL 0 "register_operand" "=")
  (unspec:VHF_AVX512VL
-   [(match_operand:VHF_AVX512VL 1 "nonimmediate_operand" "%v")
+   [(match_operand:VHF_AVX512VL 1 "nonimmediate_operand" "v")
 (match_operand:VHF_AVX512VL 2 "nonimmediate_operand" 
"")]
 UNSPEC_COMPLEX_F_C_MUL))]
   "TARGET_AVX512FP16 && "
diff --git a/gcc/testsuite/gcc.target/i386/pr111306.c 
b/gcc/testsuite/gcc.target/i386/pr111306.c
new file mode 100644
index 000..541725ebdad
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr111306.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+/* { dg-require-effective-target avx512fp16 } */
+
+#define AVX512FP16
+#include "avx512f-helper.h"
+
+__attribute__((optimize("O2"),noipa))
+void func1(_Float16 *a, _Float16 *b, int n, _Float16 *c) {
+  __m512h rA = _mm512_loadu_ph(a);
+  for (int i = 0; i < n; i += 32) {
+__m512h rB = _mm512_loadu_ph(b + i);
+_mm512_storeu_ph(c + i, _mm512_fcmul_pch(rB, rA));
+  }
+}
+
+void
+test_512 (void)
+{
+  int n = 32;
+  _Float16 a[n], b[n], c[n];
+  _Float16 exp[n];
+  for (int i = 1; i <= n; i++) {
+a[i - 1] = i & 1 ? -i : i;
+b[i - 1] = i;
+  }
+
+  func1(a, b, n, c);
+  for (int i = 0; i < n / 32; i += 2) {
+if (c[i] != a[i] * b[i] + a[i+1] * b[i+1]
+   || c[i+1] != a[i] * b[i+1] - a[i+1]*b[i])
+  __builtin_abort ();
+}
+}
+
+
-- 
2.31.1

[PATCH] Remove constraint modifier % for fcmaddcph/fcmulcph since they're not commutative.

2023-09-07 Thread liuhongt via Gcc-patches

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} on SPR.
Ready push to trunk and backport to GCC13/GCC12.

gcc/ChangeLog:

PR target/111306
* config/i386/sse.md (int_comm): New int_attr.
(fma__):
Remove % for Complex conjugate operations since they're not
commutative.
(fma___pair): Ditto.
(___mask): Ditto.
(cmul3): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr111306.c: New test.
---
 gcc/config/i386/sse.md   | 16 ---
 gcc/testsuite/gcc.target/i386/pr111306.c | 36 
 2 files changed, 48 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr111306.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 6d3ae8dea0c..833546c5228 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -6480,6 +6480,14 @@ (define_int_attr complexpairopname
[(UNSPEC_COMPLEX_FMA_PAIR "fmaddc")
 (UNSPEC_COMPLEX_FCMA_PAIR "fcmaddc")])
 
+(define_int_attr int_comm
+   [(UNSPEC_COMPLEX_FMA "%")
+(UNSPEC_COMPLEX_FMA_PAIR "%")
+(UNSPEC_COMPLEX_FCMA "")
+(UNSPEC_COMPLEX_FCMA_PAIR "")
+(UNSPEC_COMPLEX_FMUL "%")
+(UNSPEC_COMPLEX_FCMUL "")])
+
 (define_int_attr conj_op
[(UNSPEC_COMPLEX_FMA "")
 (UNSPEC_COMPLEX_FCMA "_conj")
@@ -6593,7 +6601,7 @@ (define_expand "cmla4"
 (define_insn "fma__"
   [(set (match_operand:VHF_AVX512VL 0 "register_operand" "=")
(unspec:VHF_AVX512VL
- [(match_operand:VHF_AVX512VL 1 "" "%v")
+ [(match_operand:VHF_AVX512VL 1 "" "v")
   (match_operand:VHF_AVX512VL 2 "" 
"")
   (match_operand:VHF_AVX512VL 3 "" "0")]
   UNSPEC_COMPLEX_F_C_MA))]
@@ -6658,7 +,7 @@ (define_insn_and_split 
"fma___fma_zero"
 (define_insn "fma___pair"
  [(set (match_operand:VF1_AVX512VL 0 "register_operand" "=")
(unspec:VF1_AVX512VL
-[(match_operand:VF1_AVX512VL 1 "vector_operand" "%v")
+[(match_operand:VF1_AVX512VL 1 "vector_operand" "v")
  (match_operand:VF1_AVX512VL 2 "bcst_vector_operand" "vmBr")
  (match_operand:VF1_AVX512VL 3 "vector_operand" "0")]
  UNSPEC_COMPLEX_F_C_MA_PAIR))]
@@ -6727,7 +6735,7 @@ (define_insn 
"___mask"
   [(set (match_operand:VHF_AVX512VL 0 "register_operand" "=")
(vec_merge:VHF_AVX512VL
  (unspec:VHF_AVX512VL
-   [(match_operand:VHF_AVX512VL 1 "nonimmediate_operand" "%v")
+   [(match_operand:VHF_AVX512VL 1 "nonimmediate_operand" "v")
 (match_operand:VHF_AVX512VL 2 "nonimmediate_operand" 
"")
 (match_operand:VHF_AVX512VL 3 "register_operand" "0")]
 UNSPEC_COMPLEX_F_C_MA)
@@ -6752,7 +6760,7 @@ (define_expand "cmul3"
 (define_insn "__"
   [(set (match_operand:VHF_AVX512VL 0 "register_operand" "=")
  (unspec:VHF_AVX512VL
-   [(match_operand:VHF_AVX512VL 1 "nonimmediate_operand" "%v")
+   [(match_operand:VHF_AVX512VL 1 "nonimmediate_operand" "v")
 (match_operand:VHF_AVX512VL 2 "nonimmediate_operand" 
"")]
 UNSPEC_COMPLEX_F_C_MUL))]
   "TARGET_AVX512FP16 && "
diff --git a/gcc/testsuite/gcc.target/i386/pr111306.c 
b/gcc/testsuite/gcc.target/i386/pr111306.c
new file mode 100644
index 000..541725ebdad
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr111306.c
@@ -0,0 +1,36 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512fp16 -mavx512vl" } */
+/* { dg-require-effective-target avx512fp16 } */
+
+#define AVX512FP16
+#include "avx512f-helper.h"
+
+__attribute__((optimize("O2"),noipa))
+void func1(_Float16 *a, _Float16 *b, int n, _Float16 *c) {
+  __m512h rA = _mm512_loadu_ph(a);
+  for (int i = 0; i < n; i += 32) {
+__m512h rB = _mm512_loadu_ph(b + i);
+_mm512_storeu_ph(c + i, _mm512_fcmul_pch(rB, rA));
+  }
+}
+
+void
+test_512 (void)
+{
+  int n = 32;
+  _Float16 a[n], b[n], c[n];
+  _Float16 exp[n];
+  for (int i = 1; i <= n; i++) {
+a[i - 1] = i & 1 ? -i : i;
+b[i - 1] = i;
+  }
+
+  func1(a, b, n, c);
+  for (int i = 0; i < n / 32; i += 2) {
+if (c[i] != a[i] * b[i] + a[i+1] * b[i+1]
+   || c[i+1] != a[i] * b[i+1] - a[i+1]*b[i])
+  __builtin_abort ();
+}
+}
+
+
-- 
2.31.1

[PATCH] Support vpermw/vpermi2w/vpermt2w instructions for vector HF/BFmodes.

2023-09-06 Thread liuhongt via Gcc-patches

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

* config/i386/sse.md
(_vpermt2var3): New define_insn.
(VHFBF_AVX512VL): New mode iterator.
(VI2HFBF_AVX512VL): New mode iterator.
---
 gcc/config/i386/sse.md | 32 
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 6d3ae8dea0c..12fe97951ee 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -466,6 +466,10 @@ (define_mode_iterator VHFBF_128 [V8HF V8BF])
 (define_mode_iterator VHF_AVX512VL
   [V32HF (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL")])
 
+(define_mode_iterator VHFBF_AVX512VL
+  [V32HF (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL")
+   V32BF (V16BF "TARGET_AVX512VL") (V8BF "TARGET_AVX512VL")])
+
 ;; All vector integer modes
 (define_mode_iterator VI
   [(V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
@@ -565,6 +569,11 @@ (define_mode_iterator VI48_AVX512F_AVX512VL
 (define_mode_iterator VI2_AVX512VL
   [(V8HI "TARGET_AVX512VL") (V16HI "TARGET_AVX512VL") V32HI])
 
+(define_mode_iterator VI2HFBF_AVX512VL
+  [(V8HI "TARGET_AVX512VL") (V16HI "TARGET_AVX512VL") V32HI
+   (V8HF "TARGET_AVX512VL") (V16HF "TARGET_AVX512VL") V32HF
+   (V8BF "TARGET_AVX512VL") (V16BF "TARGET_AVX512VL") V32BF])
+
 (define_mode_iterator VI2H_AVX512VL
   [(V8HI "TARGET_AVX512VL") (V16HI "TARGET_AVX512VL") V32HI
(V8SI "TARGET_AVX512VL") V16SI
@@ -26110,13 +26119,13 @@ (define_insn "_permvar"
(set_attr "mode" "")])
 
 (define_insn "_permvar"
-  [(set (match_operand:VI2_AVX512VL 0 "register_operand" "=v")
-   (unspec:VI2_AVX512VL
- [(match_operand:VI2_AVX512VL 1 "nonimmediate_operand" "vm")
+  [(set (match_operand:VI2HFBF_AVX512VL 0 "register_operand" "=v")
+   (unspec:VI2HFBF_AVX512VL
+ [(match_operand:VI2HFBF_AVX512VL 1 "nonimmediate_operand" "vm")
   (match_operand: 2 "register_operand" "v")]
  UNSPEC_VPERMVAR))]
   "TARGET_AVX512BW && "
-  "vperm\t{%1, %2, %0|%0, %2, %1}"
+  "vpermw\t{%1, %2, %0|%0, %2, %1}"
   [(set_attr "type" "sselog")
(set_attr "prefix" "")
(set_attr "mode" "")])
@@ -26987,6 +26996,21 @@ (define_insn 
"_vpermt2var3"
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+(define_insn "_vpermt2var3"
+  [(set (match_operand:VHFBF_AVX512VL 0 "register_operand" "=v,v")
+   (unspec:VHFBF_AVX512VL
+ [(match_operand: 1 "register_operand" "v,0")
+  (match_operand:VHFBF_AVX512VL 2 "register_operand" "0,v")
+  (match_operand:VHFBF_AVX512VL 3 "nonimmediate_operand" "vm,vm")]
+ UNSPEC_VPERMT2))]
+  "TARGET_AVX512BW"
+  "@
+   vpermt2w\t{%3, %1, %0|%0, %1, %3}
+   vpermi2w\t{%3, %2, %0|%0, %2, %3}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "")])
+
 (define_insn "_vpermt2var3_mask"
   [(set (match_operand:VPERMI2 0 "register_operand" "=v")
(vec_merge:VPERMI2
-- 
2.31.1

[PATCH] Generate vmovsh instead of vpblendw for specific vec_merge.

2023-09-04 Thread liuhongt via Gcc-patches

On SPR, vmovsh can be executed on 3 ports, while vpblendw can only be
executed on 2 ports.
On znver4, vpblendw can be executed on 4 ports; if vmovsh is similar
to vmovss, then it can also be executed on 4 ports.
So there's no difference for znver?, but vmovsh is more optimized on
SPR.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

* config/i386/sse.md: (V8BFH_128): Renamed to ..
(VHFBF_128): .. this.
(V16BFH_256): Renamed to ..
(VHFBF_256): .. this.
(avx512f_mov): Extend to V_128.
(vcvtnee2ps_): Changed to VHFBF_128.
(vcvtneo2ps_): Ditto.
(vcvtnee2ps_): Changed to VHFBF_256.
(vcvtneo2ps_): Ditto.
* config/i386/i386-expand.cc (expand_vec_perm_blend):
Canonicalize vec_merge.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512fp16-vmovsh-1a.c: Remove xfail.
---
 gcc/config/i386/i386-expand.cc| 17 +
 gcc/config/i386/sse.md| 25 ---
 .../gcc.target/i386/avx512fp16-vmovsh-1a.c|  2 +-
 3 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index cbd51a0f362..e42ff27c6ef 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -19433,6 +19433,23 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
   mmode = VOIDmode;
 }
 
+  /* Canonicalize vec_merge.  */
+  if (swap_commutative_operands_p (op1, op0)
+  /* Two operands have same precedence, then
+first bit of mask select first operand.  */
+  || (!swap_commutative_operands_p (op0, op1)
+ && !(mask & 1)))
+{
+  unsigned n_elts = GET_MODE_NUNITS (vmode);
+  std::swap (op0, op1);
+  unsigned HOST_WIDE_INT mask_all = HOST_WIDE_INT_1U;
+  if (n_elts == HOST_BITS_PER_WIDE_INT)
+   mask_all  = -1;
+  else
+   mask_all = (HOST_WIDE_INT_1U << n_elts) - 1;
+  mask = ~mask & mask_all;
+}
+
   if (mmode != VOIDmode)
 maskop = force_reg (mmode, gen_int_mode (mask, mmode));
   else
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index e282d978a01..6d3ae8dea0c 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -459,8 +459,9 @@ (define_mode_iterator VF2_AVX512VL
 (define_mode_iterator VF1_AVX512VL
   [V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")])
 
-(define_mode_iterator VHFBF
-  [V32HF V16HF V8HF V32BF V16BF V8BF])
+(define_mode_iterator VHFBF [V32HF V16HF V8HF V32BF V16BF V8BF])
+(define_mode_iterator VHFBF_256 [V16HF V16BF])
+(define_mode_iterator VHFBF_128 [V8HF V8BF])
 
 (define_mode_iterator VHF_AVX512VL
   [V32HF (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL")])
@@ -11134,13 +11135,11 @@ (define_insn_and_split 
"*vec_setv2di_0_zero_extendhi_1"
   DONE;
 })
 
-(define_mode_iterator V8BFH_128 [V8HF V8BF])
-
 (define_insn "avx512fp16_mov"
-  [(set (match_operand:V8BFH_128 0 "register_operand" "=v")
-   (vec_merge:V8BFH_128
-  (match_operand:V8BFH_128 2 "register_operand" "v")
- (match_operand:V8BFH_128 1 "register_operand" "v")
+  [(set (match_operand:V8_128 0 "register_operand" "=v")
+   (vec_merge:V8_128
+ (match_operand:V8_128 2 "register_operand" "v")
+ (match_operand:V8_128 1 "register_operand" "v")
  (const_int 1)))]
   "TARGET_AVX512FP16"
   "vmovsh\t{%2, %1, %0|%0, %1, %2}"
@@ -30358,8 +30357,6 @@ (define_insn "vbcstnesh2ps_"
   [(set_attr "prefix" "vex")
(set_attr "mode" "")])
 
-(define_mode_iterator V16BFH_256 [V16HF V16BF])
-
 (define_mode_attr bf16_ph
   [(V8HF "ph") (V16HF "ph")
(V8BF "bf16") (V16BF "bf16")])
@@ -30368,7 +30365,7 @@ (define_insn "vcvtnee2ps_"
   [(set (match_operand:V4SF 0 "register_operand" "=x")
(float_extend:V4SF
  (vec_select:
-   (match_operand:V8BFH_128 1 "memory_operand" "m")
+   (match_operand:VHFBF_128 1 "memory_operand" "m")
(parallel [(const_int 0) (const_int 2)
   (const_int 4) (const_int 6)]]
   "TARGET_AVXNECONVERT"
@@ -30380,7 +30377,7 @@ (define_insn "vcvtnee2ps_"
   [(set (match_operand:V8SF 0 "register_operand" "=x")
(float_extend:V8SF
  (vec_select:
-   (match_operand:V16BFH_256 1 "memory_operand" "m")
+   (match_operand:VHFBF_256 1 "memory_operand" "m")
(parallel [(const_int 0) (const_int 2)
   (const_int 4) (const_int 6)
   (const_int 8) (const_int 10)
@@ -30394,7 +30391,7 @@ (define_insn "vcvtneo2ps_"
   [(set (match_operand:V4SF 0 "register_operand" "=x")
(float_extend:V4SF
  (vec_select:
-   (match_operand:V8BFH_128 1 "memory_operand" "m")
+   (match_operand:VHFBF_128 1 "memory_operand" "m")
(parallel [(const_int 1) (const_int 3)
   (const_int 5) (const_int 7)]]
   "TARGET_AVXNECONVERT"
@@ -30406,7 +30403,7 @@ (define_insn

[PATCH] Adjust costing of emulated vectorized gather/scatter

2023-08-30 Thread liuhongt via Gcc-patches

r14-332-g24905a4bd1375c adjusts costing of emulated vectorized
gather/scatter.

commit 24905a4bd1375ccd99c02510b9f9529015a48315
Author: Richard Biener 
Date:   Wed Jan 18 11:04:49 2023 +0100

Adjust costing of emulated vectorized gather/scatter

Emulated gather/scatter behave similar to strided elementwise
accesses in that they need to decompose the offset vector
and construct or decompose the data vector so handle them
the same way, pessimizing the cases with many elements.


But for emulated gather/scatter, the offset vector load/vec_construct has
already been counted, and in real cases it's probably eliminated by
later optimizers.
Also, after decomposing, element loads from contiguous memory could be
less bounded compared to normal elementwise loads.
The patch decreases the cost a little bit.

This will enable gather emulation for below loop with VF=8(ymm)

double
foo (double* a, double* b, unsigned int* c, int n)
{
  double sum = 0;
  for (int i = 0; i != n; i++)
sum += a[i] * b[c[i]];
  return sum;
}

For the loop above, microbenchmark results show that on ICX,
emulated gather with VF=8 is 30% faster than emulated gather with
VF=4 when the trip count is big enough.
It brings back ~4% for 510.parest, but there is still a ~5% regression
compared to the gather instruction due to the throughput bound.

For -march=znver1/2/3/4, the change doesn't enable VF=8 (ymm) for the
loop; VF remains 4 (xmm) as before (I guess this is related to their own
cost model).


Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/111064
* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost):
Decrease cost a little bit for vec_to_scalar(offset vector) in
emulated gather.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr111064.c: New test.
---
 gcc/config/i386/i386.cc  | 11 ++-
 gcc/testsuite/gcc.target/i386/pr111064.c | 12 
 2 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr111064.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 1bc3f11ff07..337e0f1bfbb 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24079,7 +24079,16 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
  || STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER))
 {
   stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
-  stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1);
+  /* For emulated gather/scatter, offset vector load/vec_construct has
+already been counted and in real case, it's probably eliminated by
+later optimizer.
+Also after decomposing, element loads from continous memory
+could be less bounded compared to normal elementwise load.  */
+  if (kind == vec_to_scalar
+ && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
+   stmt_cost *= TYPE_VECTOR_SUBPARTS (vectype);
+  else
+   stmt_cost *= (TYPE_VECTOR_SUBPARTS (vectype) + 1);
 }
   else if ((kind == vec_construct || kind == scalar_to_vec)
   && node
diff --git a/gcc/testsuite/gcc.target/i386/pr111064.c 
b/gcc/testsuite/gcc.target/i386/pr111064.c
new file mode 100644
index 000..aa2589bd36f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr111064.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -march=icelake-server -mno-gather" } */
+/* { dg-final { scan-assembler-times {(?n)vfmadd[123]*pd.*ymm} 2 { target { ! 
ia32 } } } }  */
+
+double
+foo (double* a, double* b, unsigned int* c, int n)
+{
+  double sum = 0;
+  for (int i = 0; i != n; i++)
+sum += a[i] * b[c[i]];
+  return sum;
+}
-- 
2.31.1

[PATCH] Refactor vector HF/BF mode iterators and patterns.

2023-08-30 Thread liuhongt via Gcc-patches

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

* config/i386/sse.md (_blendm): Merge
VF_AVX512HFBFVL into VI12HFBF_AVX512VL.
(VF_AVX512HFBF16): Renamed to VHFBF.
(VF_AVX512FP16VL): Renamed to VHF_AVX512VL.
(VF_AVX512FP16): Removed.
(div3): Adjust VF_AVX512FP16VL to VHF_AVX512VL.
(avx512fp16_rcp2): Ditto.
(rsqrt2): Ditto.
(_rsqrt2): Ditto.
(vcond): Ditto.
(vcond): Ditto.
(_fmaddc__mask1): Ditto.
(_fmaddc__maskz): Ditto.
(_fcmaddc__mask1): Ditto.
(_fcmaddc__maskz): Ditto.
(cmla4): Ditto.
(fma__fadd_fmul): Ditto.
(fma__fadd_fcmul): Ditto.
(fma___fma_zero): Ditto.
(fma__fmaddc_bcst): Ditto.
(fma__fcmaddc_bcst): Ditto.
(___mask): Ditto.
(cmul3): Ditto.
(__):
Ditto.
(vec_unpacks_lo_): Ditto.
(vec_unpacks_hi_): Ditto.
(vec_unpack_fix_trunc_lo_): Ditto.
(vec_unpack_fix_trunc_lo_): Ditto.
(*vec_extract_0): Ditto.
(*_cmp3): Extend to V48H_AVX512VL.
---
 gcc/config/i386/sse.md | 238 +++--
 1 file changed, 108 insertions(+), 130 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 192e746fda3..e282d978a01 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -459,18 +459,10 @@ (define_mode_iterator VF2_AVX512VL
 (define_mode_iterator VF1_AVX512VL
   [V16SF (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")])
 
-(define_mode_iterator VF_AVX512FP16
-  [V32HF V16HF V8HF])
+(define_mode_iterator VHFBF
+  [V32HF V16HF V8HF V32BF V16BF V8BF])
 
-(define_mode_iterator VF_AVX512HFBF16
-  [(V32HF "TARGET_AVX512FP16") (V16HF "TARGET_AVX512FP16")
-   (V8HF "TARGET_AVX512FP16") V32BF V16BF V8BF])
-
-(define_mode_iterator VF_AVX512HFBFVL
-  [V32HF (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL")
-   V32BF (V16BF "TARGET_AVX512VL") (V8BF "TARGET_AVX512VL")])
-
-(define_mode_iterator VF_AVX512FP16VL
+(define_mode_iterator VHF_AVX512VL
   [V32HF (V16HF "TARGET_AVX512VL") (V8HF "TARGET_AVX512VL")])
 
 ;; All vector integer modes
@@ -1624,29 +1616,15 @@ (define_insn "_blendm"
(set_attr "mode" "")])
 
 (define_insn "_blendm"
-  [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v,v")
-   (vec_merge:VI12_AVX512VL
- (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm,vm")
- (match_operand:VI12_AVX512VL 1 "nonimm_or_0_operand" "0C,v")
- (match_operand: 3 "register_operand" "Yk,Yk")))]
-  "TARGET_AVX512BW"
-  "@
-vmovdqu\t{%2, %0%{%3%}%N1|%0%{%3%}%N1, %2}
-vpblendm\t{%2, %1, %0%{%3%}|%0%{%3%}, %1, %2}"
-  [(set_attr "type" "ssemov")
-   (set_attr "prefix" "evex")
-   (set_attr "mode" "")])
-
-(define_insn "_blendm"
-  [(set (match_operand:VF_AVX512HFBFVL 0 "register_operand" "=v,v")
-   (vec_merge:VF_AVX512HFBFVL
- (match_operand:VF_AVX512HFBFVL 2 "nonimmediate_operand" "vm,vm")
- (match_operand:VF_AVX512HFBFVL 1 "nonimm_or_0_operand" "0C,v")
+  [(set (match_operand:VI12HFBF_AVX512VL 0 "register_operand" "=v,v")
+   (vec_merge:VI12HFBF_AVX512VL
+ (match_operand:VI12HFBF_AVX512VL 2 "nonimmediate_operand" "vm,vm")
+ (match_operand:VI12HFBF_AVX512VL 1 "nonimm_or_0_operand" "0C,v")
  (match_operand: 3 "register_operand" "Yk,Yk")))]
   "TARGET_AVX512BW"
   "@
 vmovdqu\t{%2, %0%{%3%}%N1|%0%{%3%}%N1, %2}
-vpblendmw\t{%2, %1, %0%{%3%}|%0%{%3%}, %1, %2}"
+vpblendm\t{%2, %1, %0%{%3%}|%0%{%3%}, %1, %2}"
   [(set_attr "type" "ssemov")
(set_attr "prefix" "evex")
(set_attr "mode" "")])
@@ -2448,10 +2426,10 @@ (define_expand "div3"
   "TARGET_SSE2")
 
 (define_expand "div3"
-  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
-   (div:VF_AVX512FP16VL
- (match_operand:VF_AVX512FP16VL 1 "register_operand")
- (match_operand:VF_AVX512FP16VL 2 "vector_operand")))]
+  [(set (match_operand:VHF_AVX512VL 0 "register_operand")
+   (div:VHF_AVX512VL
+ (match_operand:VHF_AVX512VL 1 "register_operand")
+ (match_operand:VHF_AVX512VL 2 "vector_operand")))]
   "TARGET_AVX512FP16"
 {
   /* Transform HF vector div to vector mul/rcp.  */
@@ -2568,9 +2546,9 @@ (define_insn "*sse_vmrcpv4sf2"
(set_attr "mode" "SF")])
 
 (define_insn "avx512fp16_rcp2"
-  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand" "=v")
-   (unspec:VF_AVX512FP16VL
- [(match_operand:VF_AVX512FP16VL 1 "nonimmediate_operand" "vm")]
+  [(set (match_operand:VHF_AVX512VL 0 "register_operand" "=v")
+   (unspec:VHF_AVX512VL
+ [(match_operand:VHF_AVX512VL 1 "nonimmediate_operand" "vm")]
  UNSPEC_RCP))]
   "TARGET_AVX512FP16"
   "vrcpph\t{%1, %0|%0, %1}"
@@ -2731,9 +2709,9 @@ (define_expand "rsqrt2"
 })
 
 (define_expand "rsqrt2"
-  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
-   (unspec:VF_AVX512FP16VL
-

[PATCH] Use vmaskmov{ps, pd} for VI48_128_256 when TARGET_AVX2 is not available.

2023-08-24 Thread liuhongt via Gcc-patches

vpmaskmov{d,q} is available for TARGET_AVX2, and vmaskmov{ps,pd} is
available for TARGET_AVX. Without TARGET_AVX2, we can use vmaskmov{ps,pd}
for VI48_128_256.

Bootstrapped and regtested on x86_64-pc-linux{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

PR target/19
* config/i386/sse.md (V48_AVX2): Rename to ..
(V48_128_256): .. this.
(ssefltmodesuffix): Extend to V4SF/V8SF/V2DF/V4DF.
(_maskload): Change
V48_AVX2 to V48_128_256, also generate vmaskmov{ps,pd} for
integral modes when TARGET_AVX2 is not available.
(_maskstore): Ditto.
(maskload): Change V48_AVX2 to
V48_128_256.
(maskstore): Ditto.
---
 gcc/config/i386/sse.md | 48 ++
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 59a0eb1c63f..414a807aa6c 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -700,11 +700,12 @@ (define_mode_iterator VI12_AVX_AVX512F
   [ (V64QI "TARGET_AVX512F") (V32QI "TARGET_AVX") V16QI
 (V32HI "TARGET_AVX512F") (V16HI "TARGET_AVX") V8HI])
 
-(define_mode_iterator V48_AVX2
+(define_mode_iterator V48_128_256
   [V4SF V2DF
+   V4DI V2DI
V8SF V4DF
-   (V4SI "TARGET_AVX2") (V2DI "TARGET_AVX2")
-   (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")])
+   V8SI V4SI])
+
 
 (define_mode_iterator VF4_128_8_256
   [V4DF V4SF])
@@ -22300,7 +22301,8 @@ (define_insn_and_split 
"*_blendv_lt"
(set_attr "mode" "")])
 
 (define_mode_attr ssefltmodesuffix
-  [(V2DI "pd") (V4DI "pd") (V4SI "ps") (V8SI "ps")])
+  [(V2DI "pd") (V4DI "pd") (V4SI "ps") (V8SI "ps")
+   (V2DF "pd") (V4DF "pd") (V4SF "ps") (V8SF "ps")])
 
 (define_mode_attr ssefltvecmode
   [(V2DI "V2DF") (V4DI "V4DF") (V4SI "V4SF") (V8SI "V8SF")])
@@ -27411,13 +27413,18 @@ (define_insn "vec_set_hi_v32qi"
(set_attr "mode" "OI")])
 
 (define_insn "_maskload"
-  [(set (match_operand:V48_AVX2 0 "register_operand" "=x")
-   (unspec:V48_AVX2
+  [(set (match_operand:V48_128_256 0 "register_operand" "=x")
+   (unspec:V48_128_256
  [(match_operand: 2 "register_operand" "x")
-  (match_operand:V48_AVX2 1 "memory_operand" "m")]
+  (match_operand:V48_128_256 1 "memory_operand" "m")]
  UNSPEC_MASKMOV))]
   "TARGET_AVX"
-  "vmaskmov\t{%1, %2, %0|%0, %2, %1}"
+{
+  if (TARGET_AVX2)
+return "vmaskmov\t{%1, %2, %0|%0, %2, %1}";
+  else
+return "vmaskmov\t{%1, %2, %0|%0, %2, %1}";
+}
   [(set_attr "type" "sselog1")
(set_attr "prefix_extra" "1")
(set_attr "prefix" "vex")
@@ -27425,14 +27432,19 @@ (define_insn 
"_maskload"
(set_attr "mode" "")])
 
 (define_insn "_maskstore"
-  [(set (match_operand:V48_AVX2 0 "memory_operand" "+m")
-   (unspec:V48_AVX2
+  [(set (match_operand:V48_128_256 0 "memory_operand" "+m")
+   (unspec:V48_128_256
  [(match_operand: 1 "register_operand" "x")
-  (match_operand:V48_AVX2 2 "register_operand" "x")
+  (match_operand:V48_128_256 2 "register_operand" "x")
   (match_dup 0)]
  UNSPEC_MASKMOV))]
   "TARGET_AVX"
-  "vmaskmov\t{%2, %1, %0|%0, %1, %2}"
+{
+  if (TARGET_AVX2)
+return "vmaskmov\t{%2, %1, %0|%0, %1, %2}";
+  else
+return "vmaskmov\t{%2, %1, %0|%0, %1, %2}";
+}
   [(set_attr "type" "sselog1")
(set_attr "prefix_extra" "1")
(set_attr "prefix" "vex")
@@ -27440,10 +27452,10 @@ (define_insn 
"_maskstore"
(set_attr "mode" "")])
 
 (define_expand "maskload"
-  [(set (match_operand:V48_AVX2 0 "register_operand")
-   (unspec:V48_AVX2
+  [(set (match_operand:V48_128_256 0 "register_operand")
+   (unspec:V48_128_256
  [(match_operand: 2 "register_operand")
-  (match_operand:V48_AVX2 1 "memory_operand")]
+  (match_operand:V48_128_256 1 "memory_operand")]
  UNSPEC_MASKMOV))]
   "TARGET_AVX")
 
@@ -27468,10 +27480,10 @@ (define_expand "maskload"
   "TARGET_AVX512BW")
 
 (define_expand "maskstore"
-  [(set (match_operand:V48_AVX2 0 "memory_operand")
-   (unspec:V48_AVX2
+  [(set (match_operand:V48_128_256 0 "memory_operand")
+   (unspec:V48_128_256
  [(match_operand: 2 "register_operand")
-  (match_operand:V48_AVX2 1 "register_operand")
+  (match_operand:V48_128_256 1 "register_operand")
   (match_dup 0)]
  UNSPEC_MASKMOV))]
   "TARGET_AVX")
-- 
2.31.1

[PATCH] [x86] Refactor mode iterator V_128 and V_128H, V_256 and V_256H

2023-08-24 Thread liuhongt via Gcc-patches

Merge V_128H and V_256H into V_128 and V_256, adjust related patterns.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

* config/i386/sse.md (vec_set): Removed.
(V_128H): Merge into ..
(V_128): .. this.
(V_256H): Merge into ..
(V_256): .. this.
(V_512): Add V32HF, V32BF.
(*ssse3_palignr_perm): Adjust mode iterator from V_128H
to V_128.
(vcond): Removed
(vcondu): Removed.
(avx_vbroadcastf128_): Refactor from V_256H to V_256.
---
 gcc/config/i386/sse.md | 65 +-
 1 file changed, 7 insertions(+), 58 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index da85223a9b4..b9cf172306c 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -312,17 +312,10 @@ (define_mode_iterator V
 
 ;; All 128bit vector modes
 (define_mode_iterator V_128
-  [V16QI V8HI V4SI V2DI V4SF (V2DF "TARGET_SSE2")])
-
-(define_mode_iterator V_128H
   [V16QI V8HI V8HF V8BF V4SI V2DI V4SF (V2DF "TARGET_SSE2")])
 
 ;; All 256bit vector modes
 (define_mode_iterator V_256
-  [V32QI V16HI V8SI V4DI V8SF V4DF])
-
-;; All 256bit vector modes including HF/BF vector modes
-(define_mode_iterator V_256H
   [V32QI V16HI V8SI V4DI V8SF V4DF V16HF V16BF])
 
 ;; All 128bit and 256bit vector modes
@@ -331,7 +324,7 @@ (define_mode_iterator V_128_256
V16HF V8HF V8SF V4SF V4DF V2DF])
 
 ;; All 512bit vector modes
-(define_mode_iterator V_512 [V64QI V32HI V16SI V8DI V16SF V8DF])
+(define_mode_iterator V_512 [V64QI V32HI V16SI V8DI V16SF V8DF V32HF V32BF])
 
 ;; All 256bit and 512bit vector modes
 (define_mode_iterator V_256_512
@@ -4652,21 +4645,6 @@ (define_expand "vcond"
   DONE;
 })
 
-(define_expand "vcond"
-  [(set (match_operand:VF_AVX512HFBFVL 0 "register_operand")
-   (if_then_else:VF_AVX512HFBFVL
- (match_operator 3 ""
-   [(match_operand: 4 "vector_operand")
-(match_operand: 5 "vector_operand")])
- (match_operand:VF_AVX512HFBFVL 1 "general_operand")
- (match_operand:VF_AVX512HFBFVL 2 "general_operand")))]
-  "TARGET_AVX512FP16"
-{
-  bool ok = ix86_expand_int_vcond (operands);
-  gcc_assert (ok);
-  DONE;
-})
-
 (define_expand "vcond"
   [(set (match_operand: 0 "register_operand")
(if_then_else:
@@ -11414,20 +11392,6 @@ (define_expand "vec_set"
   DONE;
 })
 
-(define_expand "vec_set"
-  [(match_operand:V8BFH_128 0 "register_operand")
-   (match_operand: 1 "register_operand")
-   (match_operand 2 "vec_setm_sse41_operand")]
-  "TARGET_SSE"
-{
-  if (CONST_INT_P (operands[2]))
-ix86_expand_vector_set (false, operands[0], operands[1],
-   INTVAL (operands[2]));
-  else
-ix86_expand_vector_set_var (operands[0], operands[1], operands[2]);
-  DONE;
-})
-
 (define_expand "vec_set"
   [(match_operand:V_256_512 0 "register_operand")
(match_operand: 1 "register_operand")
@@ -11884,7 +11848,7 @@ (define_expand "avx512vl_vextractf128"
 
 (define_expand "avx_vextractf128"
   [(match_operand: 0 "nonimmediate_operand")
-   (match_operand:V_256H 1 "register_operand")
+   (match_operand:V_256 1 "register_operand")
(match_operand:SI 2 "const_0_to_1_operand")]
   "TARGET_AVX"
 {
@@ -17326,21 +17290,6 @@ (define_expand "vconduv2di"
   DONE;
 })
 
-(define_expand "vcondu"
-  [(set (match_operand:VF_AVX512FP16VL 0 "register_operand")
-   (if_then_else:VF_AVX512FP16VL
- (match_operator 3 ""
-   [(match_operand: 4 "vector_operand")
-(match_operand: 5 "vector_operand")])
- (match_operand:VF_AVX512FP16VL 1 "general_operand")
- (match_operand:VF_AVX512FP16VL 2 "general_operand")))]
-  "TARGET_AVX512FP16"
-{
-  bool ok = ix86_expand_int_vcond (operands);
-  gcc_assert (ok);
-  DONE;
-})
-
 (define_expand "vcondeqv2di"
   [(set (match_operand:VI8F_128 0 "register_operand")
(if_then_else:VI8F_128
@@ -26879,8 +26828,8 @@ (define_split
   "operands[2] = gen_lowpart (mode, operands[0]);")
 
 (define_insn "avx_vbroadcastf128_"
-  [(set (match_operand:V_256H 0 "register_operand" "=x,x,x,v,v,v,v")
-   (vec_concat:V_256H
+  [(set (match_operand:V_256 0 "register_operand" "=x,x,x,v,v,v,v")
+   (vec_concat:V_256
  (match_operand: 1 "nonimmediate_operand" 
"m,0,?x,m,0,m,0")
  (match_dup 1)))]
   "TARGET_AVX"
@@ -27206,9 +27155,9 @@ (define_insn "*avx_vperm2f128_nozero"
(set_attr "mode" "")])
 
 (define_insn "*ssse3_palignr_perm"
-  [(set (match_operand:V_128H 0 "register_operand" "=x,Yw")
-  (vec_select:V_128H
-   (match_operand:V_128H 1 "register_operand" "0,Yw")
+  [(set (match_operand:V_128 0 "register_operand" "=x,Yw")
+  (vec_select:V_128
+   (match_operand:V_128 1 "register_operand" "0,Yw")
(match_parallel 2 "palignr_operand"
  [(match_operand 3 "const_int_operand")])))]
   "TARGET_SSSE3"
-- 
2.31.1

[PATCH] Fix target_clone ("arch=graniterapids-d") and target_clone ("arch=arrowlake-s")

2023-08-22 Thread liuhongt via Gcc-patches

Both "graniterapids-d" and "graniterapids" are attached with
PROCESSOR_GRANITERAPIDS in processor_alias_table but mapped to
different __cpu_subtype in get_intel_cpu.

And get_builtin_code_for_version will try to match the first
PROCESSOR_GRANITERAPIDS in processor_alias_table which maps to
"graniterapids" here.

1861  else if (new_target->arch_specified && new_target->arch > 0)
1862for (i = 0; i < pta_size; i++)
1863  if (processor_alias_table[i].processor == new_target->arch)
1864{
1865  const pta *arch_info = &processor_alias_table[i];
1866  switch (arch_info->priority)
1867{
1868default:
1869  arg_str = arch_info->name;

This mismatch makes dispatch_function_versions check the predicate
of __builtin_cpu_is ("graniterapids") for "graniterapids-d" and causes
the issue.
The patch explicitly adds PROCESSOR_ARROWLAKE_S and
PROCESSOR_GRANITERAPIDS_D to make a distinction.

For "alderlake","raptorlake", "meteorlake" they share same isa, cost,
tuning, and mapped to the same __cpu_type/__cpu_subtype in
get_intel_cpu, so no need to add PROCESSOR_RAPTORLAKE and others.


Bootstrapped and regtested on x86_64-pc-linux-gnu.
Ok for trunk(and backport graniterapids-d part to GCC13)?

gcc/ChangeLog:

* common/config/i386/i386-common.cc (processor_names): Add new
members graniterapids-d and arrowlake-s.
* config/i386/i386-options.cc (processor_alias_table): Update
table with PROCESSOR_ARROWLAKE_S and
PROCESSOR_GRANITERAPIDS_D.
(m_GRANITERAPIDS_D): New macro.
(m_ARROWLAKE_S): Ditto.
(m_CORE_AVX512): Add m_GRANITERAPIDS_D.
(processor_cost_table): Add icelake_cost for
PROCESSOR_GRANITERAPIDS_D and alderlake_cost for
PROCESSOR_ARROWLAKE_S.
* config/i386/x86-tune.def: Handle m_ARROWLAKE_S same as
m_ARROWLAKE.
* config/i386/i386.h (enum processor_type): Add new member
PROCESSOR_GRANITERAPIDS_D and PROCESSOR_ARROWLAKE_S.
* config/i386/i386-c.cc (ix86_target_macros_internal): Handle
PROCESSOR_GRANITERAPIDS_D and PROCESSOR_ARROWLAKE_S
---
 gcc/common/config/i386/i386-common.cc | 11 +++--
 gcc/config/i386/i386-c.cc | 15 +++
 gcc/config/i386/i386-options.cc   |  6 ++-
 gcc/config/i386/i386.h|  4 +-
 gcc/config/i386/x86-tune.def  | 63 ++-
 5 files changed, 62 insertions(+), 37 deletions(-)

diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index 12a01704a73..1e11163004b 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -2155,7 +2155,9 @@ const char *const processor_names[] =
   "alderlake",
   "rocketlake",
   "graniterapids",
+  "graniterapids-d",
   "arrowlake",
+  "arrowlake-s",
   "intel",
   "lujiazui",
   "geode",
@@ -2279,13 +2281,14 @@ const pta processor_alias_table[] =
 M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2},
   {"graniterapids", PROCESSOR_GRANITERAPIDS, CPU_HASWELL, PTA_GRANITERAPIDS,
 M_CPU_SUBTYPE (INTEL_COREI7_GRANITERAPIDS), P_PROC_AVX512F},
-  {"graniterapids-d", PROCESSOR_GRANITERAPIDS, CPU_HASWELL, 
PTA_GRANITERAPIDS_D,
-M_CPU_SUBTYPE (INTEL_COREI7_GRANITERAPIDS_D), P_PROC_AVX512F},
+  {"graniterapids-d", PROCESSOR_GRANITERAPIDS_D, CPU_HASWELL,
+PTA_GRANITERAPIDS_D, M_CPU_SUBTYPE (INTEL_COREI7_GRANITERAPIDS_D),
+P_PROC_AVX512F},
   {"arrowlake", PROCESSOR_ARROWLAKE, CPU_HASWELL, PTA_ARROWLAKE,
 M_CPU_SUBTYPE (INTEL_COREI7_ARROWLAKE), P_PROC_AVX2},
-  {"arrowlake-s", PROCESSOR_ARROWLAKE, CPU_HASWELL, PTA_ARROWLAKE_S,
+  {"arrowlake-s", PROCESSOR_ARROWLAKE_S, CPU_HASWELL, PTA_ARROWLAKE_S,
 M_CPU_SUBTYPE (INTEL_COREI7_ARROWLAKE_S), P_PROC_AVX2},
-  {"lunarlake", PROCESSOR_ARROWLAKE, CPU_HASWELL, PTA_ARROWLAKE_S,
+  {"lunarlake", PROCESSOR_ARROWLAKE_S, CPU_HASWELL, PTA_ARROWLAKE_S,
 M_CPU_SUBTYPE (INTEL_COREI7_ARROWLAKE_S), P_PROC_AVX2},
   {"bonnell", PROCESSOR_BONNELL, CPU_ATOM, PTA_BONNELL,
 M_CPU_TYPE (INTEL_BONNELL), P_PROC_SSSE3},
diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc
index caef5531593..0e11709ebc5 100644
--- a/gcc/config/i386/i386-c.cc
+++ b/gcc/config/i386/i386-c.cc
@@ -258,6 +258,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
   def_or_undef (parse_in, "__graniterapids");
   def_or_undef (parse_in, "__graniterapids__");
   break;
+case PROCESSOR_GRANITERAPIDS_D:
+  def_or_undef (parse_in, "__graniterapids_d");
+  def_or_undef (parse_in, "__graniterapids_d__");
+  break;
 case PROCESSOR_ALDERLAKE:
   def_or_undef (parse_in, "__alderlake");
   def_or_undef (parse_in, "__alderlake__");
@@ -270,6 +274,11 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
   def_or_undef (parse_in, "__arrowlake");
   def_or_undef (parse_in, "__arrowlake__");
   break;
+case PROCESSOR_ARROWLAKE_S:
+

[PATCH] [x86] Testcase fix.

2023-08-21 Thread liuhongt via Gcc-patches

Committed as an obvious fix.

gcc/testsuite/ChangeLog:

* gcc.target/i386/invariant-ternlog-1.c: Only scan %rdx under
TARGET_64BIT.
---
 gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c 
b/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c
index 21051c6bba0..bf67ed7e43d 100644
--- a/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c
+++ b/gcc/testsuite/gcc.target/i386/invariant-ternlog-1.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-mavx512f -O2" } */
 /* { dg-final { scan-assembler-times "vmovdqa" 4 } } */
-/* { dg-final { scan-assembler-times {vpternlog[^\n\r]*\(%rdx\)} 2 } } */
+/* { dg-final { scan-assembler-times {vpternlog[^\n\r]*\(%rdx\)} 2 { target { 
! ia32 } } } } */
 
 #include 
 
-- 
2.31.1

[PATCH] Adjust testcase for Intel GDS.

2023-08-21 Thread liuhongt via Gcc-patches

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512f-pr88464-2.c: Add -mgather to
options.
* gcc.target/i386/avx512f-pr88464-3.c: Ditto.
* gcc.target/i386/avx512f-pr88464-4.c: Ditto.
* gcc.target/i386/avx512f-pr88464-6.c: Ditto.
* gcc.target/i386/avx512f-pr88464-7.c: Ditto.
* gcc.target/i386/avx512f-pr88464-8.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-10.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-12.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-13.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-14.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-15.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-16.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-2.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-4.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-5.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-6.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-7.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-8.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx512f-pr88464-6.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx512f-pr88464-7.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx512f-pr88464-8.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-10.c | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-12.c | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-13.c | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-14.c | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-15.c | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-16.c | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-2.c  | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-4.c  | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-5.c  | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-6.c  | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-7.c  | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-8.c  | 2 +-
 18 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c 
b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c
index 845bf509d82..28827dbd75d 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-2.c
@@ -1,6 +1,6 @@
 /* PR tree-optimization/88464 */
 /* { dg-do run { target { avx512f } } } */
-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 
-mtune=skylake-avx512" } */
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 
-mgather" } */
 
 #include "avx512f-check.h"
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c 
b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c
index 9eda4aa9b13..2df64bfa063 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-3.c
@@ -1,6 +1,6 @@
 /* PR tree-optimization/88464 */
 /* { dg-do compile } */
-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 
-fdump-tree-vect-details" } */
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 
-fdump-tree-vect-details -mgather" } */
 /* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 4 
"vect" } } */
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 4 "vect" 
} } */
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c 
b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c
index e347e63b17a..173858aadd5 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-4.c
@@ -1,6 +1,6 @@
 /* PR tree-optimization/88464 */
 /* { dg-do run { target { avx512f } } } */
-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 
-mtune=skylake-avx512" } */
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 
-mgather" } */
 
 #include "avx512f-check.h"
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-6.c 
b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-6.c
index 9ebb72a5bae..0adf3b6726a 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-6.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-6.c
@@ -1,6 +1,6 @@
 /* PR tree-optimization/88464 */
 /* { dg-do run { target { avx512f } } } */
-/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 
-mtune=skylake-avx512" } */
+/* { dg-options "-O3 -mavx512f -mprefer-vector-width=512 -mtune=skylake-avx512 
-mgather" } */
 
 #include "avx512f-check.h"
 
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-7.c 
b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-7.c
index 738640c2bf5..471ebc1676d 100644
--- a/gcc/testsuite/gcc.target/i386/avx512f-pr88464-7.c
+++ b/gcc/testsuite/gcc.target/i386/avx512f-pr88464-7.c
@@ -1,6 +1,6 @@
 /* PR tree-optimization/88464 */
 /* { dg-do

[PATCH] Mention Intel -march=gracemont for Alderlake-N.

2023-08-20 Thread liuhongt via Gcc-patches

---
 htdocs/gcc-14/changes.html | 4 
 1 file changed, 4 insertions(+)

diff --git a/htdocs/gcc-14/changes.html b/htdocs/gcc-14/changes.html
index eae25f1a..2c888660 100644
--- a/htdocs/gcc-14/changes.html
+++ b/htdocs/gcc-14/changes.html
@@ -151,6 +151,10 @@ a work-in-progress.
 -march=lunarlake.
 Lunar Lake is based on Arrow Lake S.
   
+  GCC now supports the Intel CPU named Alderlake-N through
+  -march=gracemont.
+  Alderlake-N is E-core only, not hybrid architecture.
+  
 
 
 
-- 
2.31.1

[PATCH] Support -march=gracemont

2023-08-18 Thread liuhongt via Gcc-patches

Alderlake-N is E-core only, add it as an alias of Alderlake.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Any comments?

gcc/ChangeLog:

* common/config/i386/cpuinfo.h (get_intel_cpu): Detect
Alderlake-N.
* common/config/i386/i386-common.cc (alias_table): Support
-march=gracemont as an alias of -march=alderlake.
---
 gcc/common/config/i386/cpuinfo.h  | 3 +++
 gcc/common/config/i386/i386-common.cc | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 13102b9c5dc..941f728b48b 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -533,6 +533,9 @@ get_intel_cpu (struct __processor_model *cpu_model,
   cpu_model->__cpu_type = INTEL_COREI7;
   cpu_model->__cpu_subtype = INTEL_COREI7_TIGERLAKE;
   break;
+
+case 0xbe:
+  /* Alder Lake N, E-core only.  */
 case 0x97:
 case 0x9a:
   /* Alder Lake.  */
diff --git a/gcc/common/config/i386/i386-common.cc 
b/gcc/common/config/i386/i386-common.cc
index 26005914079..8aa8bf12d76 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -2190,6 +2190,8 @@ const pta processor_alias_table[] =
 M_CPU_TYPE (INTEL_GOLDMONT_PLUS), P_PROC_SSE4_2},
   {"tremont", PROCESSOR_TREMONT, CPU_HASWELL, PTA_TREMONT,
 M_CPU_TYPE (INTEL_TREMONT), P_PROC_SSE4_2},
+  {"gracemont", PROCESSOR_ALDERLAKE, CPU_HASWELL, PTA_ALDERLAKE,
+   M_CPU_SUBTYPE (INTEL_COREI7_ALDERLAKE), P_PROC_AVX2},
   {"sierraforest", PROCESSOR_SIERRAFOREST, CPU_HASWELL, PTA_SIERRAFOREST,
 M_CPU_SUBTYPE (INTEL_SIERRAFOREST), P_PROC_AVX2},
   {"grandridge", PROCESSOR_GRANDRIDGE, CPU_HASWELL, PTA_GRANDRIDGE,
-- 
2.31.1

[PATCH] Generate vmovapd instead of vmovsd for moving DFmode between SSE_REGS.

2023-08-13 Thread liuhongt via Gcc-patches

vmovapd can enable register renaming and has the same code size as
vmovsd. Similarly for vmovsh vs vmovaps: vmovaps is 1 byte shorter than
vmovsh.

When TARGET_AVX512VL is not available, still generate
vmovsd/vmovss/vmovsh to avoid vmovapd/vmovaps zmm16-31.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.md (movdf_internal): Generate vmovapd instead of
vmovsd when moving DFmode between SSE_REGS.
(movhi_internal): Generate vmovdqa instead of vmovsh when
moving HImode between SSE_REGS.
(mov_internal): Use vmovaps instead of vmovsh when
moving HF/BFmode between SSE_REGS.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr89229-4a.c: Adjust testcase.
---
 gcc/config/i386/i386.md| 20 +---
 gcc/testsuite/gcc.target/i386/pr89229-4a.c |  4 +---
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index c906d75b13e..77182e34fe1 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -2961,8 +2961,12 @@ (define_insn "*movhi_internal"
]
(const_string "TI"))
(eq_attr "alternative" "12")
- (cond [(match_test "TARGET_AVX512FP16")
+ (cond [(match_test "TARGET_AVX512VL")
+  (const_string "TI")
+(match_test "TARGET_AVX512FP16")
   (const_string "HF")
+(match_test "TARGET_AVX512F")
+  (const_string "SF")
 (match_test "TARGET_AVX")
   (const_string "TI")
 (ior (not (match_test "TARGET_SSE2"))
@@ -4099,8 +4103,12 @@ (define_insn "*movdf_internal"
 
   /* movaps is one byte shorter for non-AVX targets.  */
   (eq_attr "alternative" "13,17")
-(cond [(match_test "TARGET_AVX")
+(cond [(match_test "TARGET_AVX512VL")
+ (const_string "V2DF")
+   (match_test "TARGET_AVX512F")
  (const_string "DF")
+   (match_test "TARGET_AVX")
+ (const_string "V2DF")
(ior (not (match_test "TARGET_SSE2"))
 (match_test "optimize_function_for_size_p (cfun)"))
  (const_string "V4SF")
@@ -4380,8 +4388,14 @@ (define_insn "*mov_internal"
   (const_string "HI")
   (const_string "TI"))
   (eq_attr "alternative" "5")
-(cond [(match_test "TARGET_AVX512FP16")
+(cond [(match_test "TARGET_AVX512VL")
+   (const_string "V4SF")
+   (match_test "TARGET_AVX512FP16")
  (const_string "HF")
+   (match_test "TARGET_AVX512F")
+ (const_string "SF")
+   (match_test "TARGET_AVX")
+ (const_string "V4SF")
(ior (match_test "TARGET_SSE_PARTIAL_REG_DEPENDENCY")
 (match_test "TARGET_SSE_SPLIT_REGS"))
  (const_string "V4SF")
diff --git a/gcc/testsuite/gcc.target/i386/pr89229-4a.c 
b/gcc/testsuite/gcc.target/i386/pr89229-4a.c
index 5bc10d25619..8869650b0ad 100644
--- a/gcc/testsuite/gcc.target/i386/pr89229-4a.c
+++ b/gcc/testsuite/gcc.target/i386/pr89229-4a.c
@@ -1,4 +1,4 @@
-/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-do assemble { target { ! ia32 } } } */
 /* { dg-options "-O2 -march=skylake-avx512" } */
 
 extern double d;
@@ -12,5 +12,3 @@ foo1 (double x)
   asm volatile ("" : "+v" (xmm17));
   d = xmm17;
 }
-
-/* { dg-final { scan-assembler-not "vmovapd" } } */
-- 
2.31.1

[PATCH V2] Support -m[no-]gather -m[no-]scatter to enable/disable vectorization for all gather/scatter instructions

2023-08-11 Thread liuhongt via Gcc-patches

Rename the original use_gather to use_gather_8parts. Support
-mtune-ctrl={,^}use_gather to set/clear the tune features
use_gather_{2parts, 4parts, 8parts}. Support the new option -m[no-]gather
as an alias of -mtune-ctrl={,^}use_gather.

Similar for use_scatter.

How about this version?

gcc/ChangeLog:

* config/i386/i386-builtins.cc
(ix86_vectorize_builtin_gather): Adjust for use_gather_8parts.
* config/i386/i386-options.cc (parse_mtune_ctrl_str):
Set/Clear tune features use_{gather,scatter}_{2parts, 4parts,
8parts} for -mtune-ctrl={,^}{use_gather,use_scatter}.
* config/i386/i386.cc (ix86_vectorize_builtin_scatter): Adjust
for use_scatter_8parts
* config/i386/i386.h (TARGET_USE_GATHER): Rename to ..
(TARGET_USE_GATHER_8PARTS): .. this.
(TARGET_USE_SCATTER): Rename to ..
(TARGET_USE_SCATTER_8PARTS): .. this.
* config/i386/x86-tune.def (X86_TUNE_USE_GATHER): Rename to
(X86_TUNE_USE_GATHER_8PARTS): .. this.
(X86_TUNE_USE_SCATTER): Rename to
(X86_TUNE_USE_SCATTER_8PARTS): .. this.
* config/i386/i386.opt: Add new options mgather, mscatter.
---
 gcc/config/i386/i386-builtins.cc |  2 +-
 gcc/config/i386/i386-options.cc  | 54 +++-
 gcc/config/i386/i386.cc  |  2 +-
 gcc/config/i386/i386.h   |  8 ++---
 gcc/config/i386/i386.opt |  8 +
 gcc/config/i386/x86-tune.def |  4 +--
 6 files changed, 56 insertions(+), 22 deletions(-)

diff --git a/gcc/config/i386/i386-builtins.cc b/gcc/config/i386/i386-builtins.cc
index 356b6dfd5fb..8a0b8dfe073 100644
--- a/gcc/config/i386/i386-builtins.cc
+++ b/gcc/config/i386/i386-builtins.cc
@@ -1657,7 +1657,7 @@ ix86_vectorize_builtin_gather (const_tree mem_vectype,
  ? !TARGET_USE_GATHER_2PARTS
  : (known_eq (TYPE_VECTOR_SUBPARTS (mem_vectype), 4u)
 ? !TARGET_USE_GATHER_4PARTS
-: !TARGET_USE_GATHER)))
+: !TARGET_USE_GATHER_8PARTS)))
 return NULL_TREE;
 
   if ((TREE_CODE (index_type) != INTEGER_TYPE
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 127ee24203c..b8d038af69d 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -1731,20 +1731,46 @@ parse_mtune_ctrl_str (struct gcc_options *opts, bool 
dump)
   curr_feature_string++;
   clear = true;
 }
-  for (i = 0; i < X86_TUNE_LAST; i++)
-{
-  if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
-{
-  ix86_tune_features[i] = !clear;
-  if (dump)
-fprintf (stderr, "Explicitly %s feature %s\n",
- clear ? "clear" : "set", ix86_tune_feature_names[i]);
-  break;
-}
-}
-  if (i == X86_TUNE_LAST)
-   error ("unknown parameter to option %<-mtune-ctrl%>: %s",
-  clear ? curr_feature_string - 1 : curr_feature_string);
+
+  if (!strcmp (curr_feature_string, "use_gather"))
+   {
+ ix86_tune_features[X86_TUNE_USE_GATHER_2PARTS] = !clear;
+ ix86_tune_features[X86_TUNE_USE_GATHER_4PARTS] = !clear;
+ ix86_tune_features[X86_TUNE_USE_GATHER_8PARTS] = !clear;
+ if (dump)
+   fprintf (stderr, "Explicitly %s features use_gather_2parts,"
+" use_gather_4parts, use_gather_8parts\n",
+clear ? "clear" : "set");
+
+   }
+  else if (!strcmp (curr_feature_string, "use_scatter"))
+   {
+ ix86_tune_features[X86_TUNE_USE_SCATTER_2PARTS] = !clear;
+ ix86_tune_features[X86_TUNE_USE_SCATTER_4PARTS] = !clear;
+ ix86_tune_features[X86_TUNE_USE_SCATTER_8PARTS] = !clear;
+ if (dump)
+   fprintf (stderr, "Explicitly %s features use_scatter_2parts,"
+" use_scatter_4parts, use_scatter_8parts\n",
+clear ? "clear" : "set");
+   }
+  else
+   {
+ for (i = 0; i < X86_TUNE_LAST; i++)
+   {
+ if (!strcmp (curr_feature_string, ix86_tune_feature_names[i]))
+   {
+ ix86_tune_features[i] = !clear;
+ if (dump)
+   fprintf (stderr, "Explicitly %s feature %s\n",
+clear ? "clear" : "set", 
ix86_tune_feature_names[i]);
+ break;
+   }
+   }
+
+ if (i == X86_TUNE_LAST)
+   error ("unknown parameter to option %<-mtune-ctrl%>: %s",
+  clear ? curr_feature_string - 1 : curr_feature_string);
+   }
   curr_feature_string = next_feature_string;
 }
   while (curr_feature_string);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d592ece700a..cd49fb9e47a 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -19193,7 +19193,7 @@ ix86_vectorize_builtin_scatter (const_tree vectype,
   ?

[PATCH] Software mitigation: Disable gather generation in vectorization for GDS affected Intel Processors.

2023-08-10 Thread liuhongt via Gcc-patches

For more details of GDS (Gather Data Sampling), refer to
https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/gather-data-sampling.html

After microcode update, there's performance regression. To avoid that,
the patch disables gather generation in autovectorization but uses
gather scalar emulation instead.

Ready to push to trunk and backport.
Any comments?

gcc/ChangeLog:

* config/i386/i386-options.cc (m_GDS): New macro.
* config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Don't
enable for m_GDS.
(X86_TUNE_USE_GATHER_4PARTS): Ditto.
(X86_TUNE_USE_GATHER): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx2-gather-2.c: Adjust options to keep
gather vectorization.
* gcc.target/i386/avx2-gather-6.c: Ditto.
* gcc.target/i386/avx512f-pr88464-1.c: Ditto.
* gcc.target/i386/avx512f-pr88464-5.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-1.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-11.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-3.c: Ditto.
* gcc.target/i386/avx512vl-pr88464-9.c: Ditto.
* gcc.target/i386/pr88531-1b.c: Ditto.
* gcc.target/i386/pr88531-1c.c: Ditto.
---
 gcc/config/i386/i386-options.cc | 5 +
 gcc/config/i386/x86-tune.def| 6 +++---
 gcc/testsuite/gcc.target/i386/avx2-gather-2.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx2-gather-6.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx512f-pr88464-1.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx512f-pr88464-5.c   | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-1.c  | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-11.c | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-3.c  | 2 +-
 gcc/testsuite/gcc.target/i386/avx512vl-pr88464-9.c  | 2 +-
 gcc/testsuite/gcc.target/i386/pr88531-1b.c  | 2 +-
 gcc/testsuite/gcc.target/i386/pr88531-1c.c  | 2 +-
 12 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 127ee24203c..e6ba33c370d 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -141,6 +141,11 @@ along with GCC; see the file COPYING3.  If not see
 #define m_ARROWLAKE (HOST_WIDE_INT_1U<

[PATCH] Support -m[no-]gather -m[no-]scatter to enable/disable vectorization for all gather/scatter instructions.

2023-08-09 Thread liuhongt via Gcc-patches

Currently we have 3 different independent tunes for gather
"use_gather,use_gather_2parts,use_gather_4parts",
similar for scatter, there're
"use_scatter,use_scatter_2parts,use_scatter_4parts"

The patch supports 2 standardized options to enable/disable
vectorization for all gather/scatter instructions. The options are
translated by the driver into the 3 tunes.

bootstrapped and regtested on x86_64-pc-linux-gnu.
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.h (DRIVER_SELF_SPECS): Add
GATHER_SCATTER_DRIVER_SELF_SPECS.
(GATHER_SCATTER_DRIVER_SELF_SPECS): New macro.
* config/i386/i386.opt (mgather): New option.
(mscatter): Ditto.
---
 gcc/config/i386/i386.h   | 12 +++-
 gcc/config/i386/i386.opt |  8 
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index ef342fcee9b..d9ac2c29bde 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -565,7 +565,17 @@ extern GTY(()) tree x86_mfence;
 # define SUBTARGET_DRIVER_SELF_SPECS ""
 #endif
 
-#define DRIVER_SELF_SPECS SUBTARGET_DRIVER_SELF_SPECS
+#ifndef GATHER_SCATTER_DRIVER_SELF_SPECS
+# define GATHER_SCATTER_DRIVER_SELF_SPECS \
+  "%{mno-gather:-mtune-ctrl=^use_gather_2parts,^use_gather_4parts,^use_gather} 
\
+   %{mgather:-mtune-ctrl=use_gather_2parts,use_gather_4parts,use_gather} \
+   
%{mno-scatter:-mtune-ctrl=^use_scatter_2parts,^use_scatter_4parts,^use_scatter} 
\
+   %{mscatter:-mtune-ctrl=use_scatter_2parts,use_scatter_4parts,use_scatter}"
+#endif
+
+#define DRIVER_SELF_SPECS \
+  SUBTARGET_DRIVER_SELF_SPECS " " \
+  GATHER_SCATTER_DRIVER_SELF_SPECS
 
 /* -march=native handling only makes sense with compiler running on
an x86 or x86_64 chip.  If changing this condition, also change
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index ddb7f110aa2..99948644a8d 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -424,6 +424,14 @@ mdaz-ftz
 Target
 Set the FTZ and DAZ Flags.
 
+mgather
+Target
+Enable vectorization for gather instruction.
+
+mscatter
+Target
+Enable vectorization for scatter instruction.
+
 mpreferred-stack-boundary=
 Target RejectNegative Joined UInteger Var(ix86_preferred_stack_boundary_arg)
 Attempt to keep stack aligned to this power of 2.
-- 
2.31.1

[PATCH] i386: Do not sanitize upper part of V2HFmode and V4HFmode reg with -fno-trapping-math [PR110832]

2023-08-09 Thread liuhongt via Gcc-patches

Also add ix86_partial_vec_fp_math to the condition of V2HF/V4HF named
patterns in order to avoid generation of partial vector V8HFmode
trapping instructions.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/110832
* config/i386/mmx.md: (movq__to_sse): Also do not
sanitize upper part of V4HFmode register with
-fno-trapping-math.
(v4hf3): Enable for ix86_partial_vec_fp_math.
(v2hf3): Ditto.
(divv2hf3): Ditto.
(movd_v2hf_to_sse): Do not sanitize upper part of V2HFmode
register with -fno-trapping-math.
---
 gcc/config/i386/mmx.md | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index d51b3b9dc71..170432a7128 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -596,7 +596,7 @@ (define_expand "movq__to_sse"
  (match_dup 2)))]
   "TARGET_SSE2"
 {
-  if (mode == V2SFmode
+  if (mode != V2SImode
   && !flag_trapping_math)
 {
   rtx op1 = force_reg (mode, operands[1]);
@@ -1941,7 +1941,7 @@ (define_expand "v4hf3"
(plusminusmult:V4HF
  (match_operand:V4HF 1 "nonimmediate_operand")
  (match_operand:V4HF 2 "nonimmediate_operand")))]
-  "TARGET_AVX512FP16 && TARGET_AVX512VL"
+  "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math"
 {
   rtx op2 = gen_reg_rtx (V8HFmode);
   rtx op1 = gen_reg_rtx (V8HFmode);
@@ -1961,7 +1961,7 @@ (define_expand "divv4hf3"
(div:V4HF
  (match_operand:V4HF 1 "nonimmediate_operand")
  (match_operand:V4HF 2 "nonimmediate_operand")))]
-  "TARGET_AVX512FP16 && TARGET_AVX512VL"
+  "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math"
 {
   rtx op2 = gen_reg_rtx (V8HFmode);
   rtx op1 = gen_reg_rtx (V8HFmode);
@@ -1983,14 +1983,22 @@ (define_expand "movd_v2hf_to_sse"
(match_operand:V2HF 1 "nonimmediate_operand"))
  (match_operand:V8HF 2 "reg_or_0_operand")
  (const_int 3)))]
-  "TARGET_SSE")
+  "TARGET_SSE"
+{
+  if (!flag_trapping_math && operands[2] == CONST0_RTX (V8HFmode))
+  {
+rtx op1 = force_reg (V2HFmode, operands[1]);
+emit_move_insn (operands[0], lowpart_subreg (V8HFmode, op1, V2HFmode));
+DONE;
+  }
+})
 
 (define_expand "v2hf3"
   [(set (match_operand:V2HF 0 "register_operand")
(plusminusmult:V2HF
  (match_operand:V2HF 1 "nonimmediate_operand")
  (match_operand:V2HF 2 "nonimmediate_operand")))]
-  "TARGET_AVX512FP16 && TARGET_AVX512VL"
+  "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math"
 {
   rtx op2 = gen_reg_rtx (V8HFmode);
   rtx op1 = gen_reg_rtx (V8HFmode);
@@ -2009,7 +2017,7 @@ (define_expand "divv2hf3"
(div:V2HF
  (match_operand:V2HF 1 "nonimmediate_operand")
  (match_operand:V2HF 2 "nonimmediate_operand")))]
-  "TARGET_AVX512FP16 && TARGET_AVX512VL"
+  "TARGET_AVX512FP16 && TARGET_AVX512VL && ix86_partial_vec_fp_math"
 {
   rtx op2 = gen_reg_rtx (V8HFmode);
   rtx op1 = gen_reg_rtx (V8HFmode);
-- 
2.31.1

[PATCH] Rename local variable subleaf_level to max_subleaf_level.

2023-08-09 Thread liuhongt via Gcc-patches

This minor fix is preapproved in [1].
Committed to trunk.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2023-August/626758.html

gcc/ChangeLog:

* common/config/i386/cpuinfo.h (get_available_features):
Rename local variable subleaf_level to max_subleaf_level.
---
 gcc/common/config/i386/cpuinfo.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 9fa4dec2a7e..70e8d01e09b 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -663,7 +663,6 @@ get_available_features (struct __processor_model *cpu_model,
   unsigned int max_cpuid_level = cpu_model2->__cpu_max_level;
   unsigned int eax, ebx;
   unsigned int ext_level;
-  unsigned int subleaf_level;
 
   /* Get XCR_XFEATURE_ENABLED_MASK register with xgetbv.  */
 #define XCR_XFEATURE_ENABLED_MASK  0x0
@@ -763,7 +762,8 @@ get_available_features (struct __processor_model *cpu_model,
   /* Get Advanced Features at level 7 (eax = 7, ecx = 0/1). */
   if (max_cpuid_level >= 7)
 {
-  __cpuid_count (7, 0, subleaf_level, ebx, ecx, edx);
+  unsigned int max_subleaf_level;
+  __cpuid_count (7, 0, max_subleaf_level, ebx, ecx, edx);
   if (ebx & bit_BMI)
set_feature (FEATURE_BMI);
   if (ebx & bit_SGX)
@@ -875,7 +875,7 @@ get_available_features (struct __processor_model *cpu_model,
set_feature (FEATURE_AVX512FP16);
}
 
-  if (subleaf_level >= 1)
+  if (max_subleaf_level >= 1)
{
  __cpuid_count (7, 1, eax, ebx, ecx, edx);
  if (eax & bit_HRESET)
-- 
2.31.1

[PATCH V2] [X86] Workaround possible CPUID bug in Sandy Bridge.

2023-08-08 Thread liuhongt via Gcc-patches

> Please rather do it in a more self-descriptive way, as proposed in the
> attached patch. You won't need a comment then.
>

Adjusted in V2 patch.

Don't access leaf 7 subleaf 1 unless subleaf 0 says it is
supported via EAX.

Intel documentation says invalid subleaves return 0. We had been
relying on that behavior instead of checking the max subleaf number.

It appears that some Sandy Bridge CPUs return at least the subleaf 0
EDX value for subleaf 1. Best guess is that this is a bug in a
microcode patch since all of the bits we're seeing set in EDX were
introduced after Sandy Bridge was originally released.

This is causing avxvnniint16 to be incorrectly enabled with
-march=native on these CPUs.

gcc/ChangeLog:

* common/config/i386/cpuinfo.h (get_available_features): Check
EAX for a valid subleaf before using CPUID.
---
 gcc/common/config/i386/cpuinfo.h | 82 +---
 1 file changed, 43 insertions(+), 39 deletions(-)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 30ef0d334ca..9fa4dec2a7e 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -663,6 +663,7 @@ get_available_features (struct __processor_model *cpu_model,
   unsigned int max_cpuid_level = cpu_model2->__cpu_max_level;
   unsigned int eax, ebx;
   unsigned int ext_level;
+  unsigned int subleaf_level;
 
   /* Get XCR_XFEATURE_ENABLED_MASK register with xgetbv.  */
 #define XCR_XFEATURE_ENABLED_MASK  0x0
@@ -762,7 +763,7 @@ get_available_features (struct __processor_model *cpu_model,
   /* Get Advanced Features at level 7 (eax = 7, ecx = 0/1). */
   if (max_cpuid_level >= 7)
 {
-  __cpuid_count (7, 0, eax, ebx, ecx, edx);
+  __cpuid_count (7, 0, subleaf_level, ebx, ecx, edx);
   if (ebx & bit_BMI)
set_feature (FEATURE_BMI);
   if (ebx & bit_SGX)
@@ -874,45 +875,48 @@ get_available_features (struct __processor_model 
*cpu_model,
set_feature (FEATURE_AVX512FP16);
}
 
-  __cpuid_count (7, 1, eax, ebx, ecx, edx);
-  if (eax & bit_HRESET)
-   set_feature (FEATURE_HRESET);
-  if (eax & bit_CMPCCXADD)
-   set_feature(FEATURE_CMPCCXADD);
-  if (edx & bit_PREFETCHI)
-   set_feature (FEATURE_PREFETCHI);
-  if (eax & bit_RAOINT)
-   set_feature (FEATURE_RAOINT);
-  if (avx_usable)
-   {
- if (eax & bit_AVXVNNI)
-   set_feature (FEATURE_AVXVNNI);
- if (eax & bit_AVXIFMA)
-   set_feature (FEATURE_AVXIFMA);
- if (edx & bit_AVXVNNIINT8)
-   set_feature (FEATURE_AVXVNNIINT8);
- if (edx & bit_AVXNECONVERT)
-   set_feature (FEATURE_AVXNECONVERT);
- if (edx & bit_AVXVNNIINT16)
-   set_feature (FEATURE_AVXVNNIINT16);
- if (eax & bit_SM3)
-   set_feature (FEATURE_SM3);
- if (eax & bit_SHA512)
-   set_feature (FEATURE_SHA512);
- if (eax & bit_SM4)
-   set_feature (FEATURE_SM4);
-   }
-  if (avx512_usable)
-   {
- if (eax & bit_AVX512BF16)
-   set_feature (FEATURE_AVX512BF16);
-   }
-  if (amx_usable)
+  if (subleaf_level >= 1)
{
- if (eax & bit_AMX_FP16)
-   set_feature (FEATURE_AMX_FP16);
- if (edx & bit_AMX_COMPLEX)
-   set_feature (FEATURE_AMX_COMPLEX);
+ __cpuid_count (7, 1, eax, ebx, ecx, edx);
+ if (eax & bit_HRESET)
+   set_feature (FEATURE_HRESET);
+ if (eax & bit_CMPCCXADD)
+   set_feature(FEATURE_CMPCCXADD);
+ if (edx & bit_PREFETCHI)
+   set_feature (FEATURE_PREFETCHI);
+ if (eax & bit_RAOINT)
+   set_feature (FEATURE_RAOINT);
+ if (avx_usable)
+   {
+ if (eax & bit_AVXVNNI)
+   set_feature (FEATURE_AVXVNNI);
+ if (eax & bit_AVXIFMA)
+   set_feature (FEATURE_AVXIFMA);
+ if (edx & bit_AVXVNNIINT8)
+   set_feature (FEATURE_AVXVNNIINT8);
+ if (edx & bit_AVXNECONVERT)
+   set_feature (FEATURE_AVXNECONVERT);
+ if (edx & bit_AVXVNNIINT16)
+   set_feature (FEATURE_AVXVNNIINT16);
+ if (eax & bit_SM3)
+   set_feature (FEATURE_SM3);
+ if (eax & bit_SHA512)
+   set_feature (FEATURE_SHA512);
+ if (eax & bit_SM4)
+   set_feature (FEATURE_SM4);
+   }
+ if (avx512_usable)
+   {
+ if (eax & bit_AVX512BF16)
+   set_feature (FEATURE_AVX512BF16);
+   }
+ if (amx_usable)
+   {
+ if (eax & bit_AMX_FP16)
+   set_feature (FEATURE_AMX_FP16);
+ if (edx & bit_AMX_COMPLEX)
+   set_feature (FEATURE_AMX_COMPLEX);
+   }
}
 }
 
-- 
2.31.1

[PATCH] [X86] Workaround possible CPUID bug in Sandy Bridge.

2023-08-08 Thread liuhongt via Gcc-patches

Don't access leaf 7 subleaf 1 unless subleaf 0 says it is
supported via EAX.

Intel documentation says invalid subleaves return 0. We had been
relying on that behavior instead of checking the max subleaf number.

It appears that some Sandy Bridge CPUs return at least the subleaf 0
EDX value for subleaf 1. Best guess is that this is a bug in a
microcode patch since all of the bits we're seeing set in EDX were
introduced after Sandy Bridge was originally released.

This is causing avxvnniint16 to be incorrectly enabled with
-march=native on these CPUs.

BTW: Thanks to the LLVM folks Phoebe and Craig for the reminder.


Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk and backport?

gcc/ChangeLog:

* common/config/i386/cpuinfo.h (get_available_features): Check
EAX for valid subleaf before use CPUID.
---
 gcc/common/config/i386/cpuinfo.h | 84 +---
 1 file changed, 46 insertions(+), 38 deletions(-)

diff --git a/gcc/common/config/i386/cpuinfo.h b/gcc/common/config/i386/cpuinfo.h
index 30ef0d334ca..24ab2252eb0 100644
--- a/gcc/common/config/i386/cpuinfo.h
+++ b/gcc/common/config/i386/cpuinfo.h
@@ -874,45 +874,53 @@ get_available_features (struct __processor_model 
*cpu_model,
set_feature (FEATURE_AVX512FP16);
}
 
-  __cpuid_count (7, 1, eax, ebx, ecx, edx);
-  if (eax & bit_HRESET)
-   set_feature (FEATURE_HRESET);
-  if (eax & bit_CMPCCXADD)
-   set_feature(FEATURE_CMPCCXADD);
-  if (edx & bit_PREFETCHI)
-   set_feature (FEATURE_PREFETCHI);
-  if (eax & bit_RAOINT)
-   set_feature (FEATURE_RAOINT);
-  if (avx_usable)
-   {
- if (eax & bit_AVXVNNI)
-   set_feature (FEATURE_AVXVNNI);
- if (eax & bit_AVXIFMA)
-   set_feature (FEATURE_AVXIFMA);
- if (edx & bit_AVXVNNIINT8)
-   set_feature (FEATURE_AVXVNNIINT8);
- if (edx & bit_AVXNECONVERT)
-   set_feature (FEATURE_AVXNECONVERT);
- if (edx & bit_AVXVNNIINT16)
-   set_feature (FEATURE_AVXVNNIINT16);
- if (eax & bit_SM3)
-   set_feature (FEATURE_SM3);
- if (eax & bit_SHA512)
-   set_feature (FEATURE_SHA512);
- if (eax & bit_SM4)
-   set_feature (FEATURE_SM4);
-   }
-  if (avx512_usable)
-   {
- if (eax & bit_AVX512BF16)
-   set_feature (FEATURE_AVX512BF16);
-   }
-  if (amx_usable)
+  /* According to document, when subleaf is invalid, EAX,EBX,ECX,EDX should
+return 0 for CPUID (7, 1, EAX, EBX, ECX, EDX).
+But looks like it doesn't satisfy the document on some CPU, refer to
+https://reviews.llvm.org/D155145.
+Manually check valid subleaf here.  */
+  if (eax)
{
- if (eax & bit_AMX_FP16)
-   set_feature (FEATURE_AMX_FP16);
- if (edx & bit_AMX_COMPLEX)
-   set_feature (FEATURE_AMX_COMPLEX);
+ __cpuid_count (7, 1, eax, ebx, ecx, edx);
+ if (eax & bit_HRESET)
+   set_feature (FEATURE_HRESET);
+ if (eax & bit_CMPCCXADD)
+   set_feature(FEATURE_CMPCCXADD);
+ if (edx & bit_PREFETCHI)
+   set_feature (FEATURE_PREFETCHI);
+ if (eax & bit_RAOINT)
+   set_feature (FEATURE_RAOINT);
+ if (avx_usable)
+   {
+ if (eax & bit_AVXVNNI)
+   set_feature (FEATURE_AVXVNNI);
+ if (eax & bit_AVXIFMA)
+   set_feature (FEATURE_AVXIFMA);
+ if (edx & bit_AVXVNNIINT8)
+   set_feature (FEATURE_AVXVNNIINT8);
+ if (edx & bit_AVXNECONVERT)
+   set_feature (FEATURE_AVXNECONVERT);
+ if (edx & bit_AVXVNNIINT16)
+   set_feature (FEATURE_AVXVNNIINT16);
+ if (eax & bit_SM3)
+   set_feature (FEATURE_SM3);
+ if (eax & bit_SHA512)
+   set_feature (FEATURE_SHA512);
+ if (eax & bit_SM4)
+   set_feature (FEATURE_SM4);
+   }
+ if (avx512_usable)
+   {
+ if (eax & bit_AVX512BF16)
+   set_feature (FEATURE_AVX512BF16);
+   }
+ if (amx_usable)
+   {
+ if (eax & bit_AMX_FP16)
+   set_feature (FEATURE_AMX_FP16);
+ if (edx & bit_AMX_COMPLEX)
+   set_feature (FEATURE_AMX_COMPLEX);
+   }
}
 }
 
-- 
2.31.1

[PATCH] i386: Clear upper bits of XMM register for V4HFmode/V2HFmode operations [PR110762]

2023-08-07 Thread liuhongt via Gcc-patches

Similar to r14-2786-gade30fad6669e5, this patch is for V4HF/V2HFmode.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/110762
* config/i386/mmx.md (<insn><mode>3): Changed from define_insn
to define_expand and break into ..
(<insn>v4hf3): .. this.
(divv4hf3): .. this.
(<insn>v2hf3): .. this.
(divv2hf3): .. this.
(movd_v2hf_to_sse): New define_expand.
(movq_<mode>_to_sse): Extend to V4HFmode.
(mmxdoublevecmode): Ditto.
(V2FI_V4HF): New mode iterator.
* config/i386/sse.md (*vec_concatv4sf): Extend to handle V8HF
by using mode iterator V4SF_V8HF, renamed to ..
(*vec_concat<mode>): .. this.
(*vec_concatv4sf_0): Extend to handle V8HF by using mode
iterator V4SF_V8HF, renamed to ..
(*vec_concat<mode>_0): .. this.
(*vec_concatv8hf_movss): New define_insn.
(V4SF_V8HF): New mode iterator.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110762-v4hf.c: New test.
---
 gcc/config/i386/mmx.md| 109 +++---
 gcc/config/i386/sse.md|  40 +--
 gcc/testsuite/gcc.target/i386/pr110762-v4hf.c |  57 +
 3 files changed, 177 insertions(+), 29 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110762-v4hf.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 896af76a33f..88bdf084f54 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -79,9 +79,7 @@ (define_mode_iterator V_16_32_64
 ;; V2S* modes
 (define_mode_iterator V2FI [V2SF V2SI])
 
-;; 4-byte and 8-byte float16 vector modes
-(define_mode_iterator VHF_32_64 [V4HF V2HF])
-
+(define_mode_iterator V2FI_V4HF [V2SF V2SI V4HF])
 ;; Mapping from integer vector mode to mnemonic suffix
 (define_mode_attr mmxvecsize
   [(V8QI "b") (V4QI "b") (V2QI "b")
@@ -108,7 +106,7 @@ (define_mode_attr mmxintvecmodelower
 
 ;; Mapping of vector modes to a vector mode of double size
 (define_mode_attr mmxdoublevecmode
-  [(V2SF "V4SF") (V2SI "V4SI")])
+  [(V2SF "V4SF") (V2SI "V4SI") (V4HF "V8HF")])
 
 ;; Mapping of vector modes back to the scalar modes
 (define_mode_attr mmxscalarmode
@@ -594,7 +592,7 @@ (define_insn "sse_movntq"
 (define_expand "movq__to_sse"
   [(set (match_operand: 0 "register_operand")
(vec_concat:
- (match_operand:V2FI 1 "nonimmediate_operand")
+ (match_operand:V2FI_V4HF 1 "nonimmediate_operand")
  (match_dup 2)))]
   "TARGET_SSE2"
   "operands[2] = CONST0_RTX (mode);")
@@ -1927,21 +1925,94 @@ (define_expand "lroundv2sfv2si2"
 ;;
 ;
 
-(define_insn "3"
-  [(set (match_operand:VHF_32_64 0 "register_operand" "=v")
-   (plusminusmultdiv:VHF_32_64
- (match_operand:VHF_32_64 1 "register_operand" "v")
- (match_operand:VHF_32_64 2 "register_operand" "v")))]
+(define_expand "v4hf3"
+  [(set (match_operand:V4HF 0 "register_operand")
+   (plusminusmult:V4HF
+ (match_operand:V4HF 1 "nonimmediate_operand")
+ (match_operand:V4HF 2 "nonimmediate_operand")))]
   "TARGET_AVX512FP16 && TARGET_AVX512VL"
-  "vph\t{%2, %1, %0|%0, %1, %2}"
-  [(set (attr "type")
-  (cond [(match_test " == MULT")
-   (const_string "ssemul")
-(match_test " == DIV")
-   (const_string "ssediv")]
-(const_string "sseadd")))
-   (set_attr "prefix" "evex")
-   (set_attr "mode" "V8HF")])
+{
+  rtx op2 = gen_reg_rtx (V8HFmode);
+  rtx op1 = gen_reg_rtx (V8HFmode);
+  rtx op0 = gen_reg_rtx (V8HFmode);
+
+  emit_insn (gen_movq_v4hf_to_sse (op2, operands[2]));
+  emit_insn (gen_movq_v4hf_to_sse (op1, operands[1]));
+
+  emit_insn (gen_v8hf3 (op0, op1, op2));
+
+  emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode));
+  DONE;
+})
+
+(define_expand "divv4hf3"
+  [(set (match_operand:V4HF 0 "register_operand")
+   (div:V4HF
+ (match_operand:V4HF 1 "nonimmediate_operand")
+ (match_operand:V4HF 2 "nonimmediate_operand")))]
+  "TARGET_AVX512FP16 && TARGET_AVX512VL"
+{
+  rtx op2 = gen_reg_rtx (V8HFmode);
+  rtx op1 = gen_reg_rtx (V8HFmode);
+  rtx op0 = gen_reg_rtx (V8HFmode);
+
+  emit_insn (gen_movq_v4hf_to_sse (op1, operands[1]));
+  rtx tmp = gen_rtx_VEC_CONCAT (V8HFmode, operands[2],
+   force_reg (V4HFmode, CONST1_RTX (V4HFmode)));
+  emit_insn (gen_rtx_SET (op2, tmp));
+  emit_insn (gen_divv8hf3 (op0, op1, op2));
+  emit_move_insn (operands[0], lowpart_subreg (V4HFmode, op0, V8HFmode));
+  DONE;
+})
+
+(define_expand "movd_v2hf_to_sse"
+  [(set (match_operand:V8HF 0 "register_operand")
+   (vec_merge:V8HF
+ (vec_duplicate:V8HF
+   (match_operand:V2HF 1 "nonimmediate_operand"))
+ (match_operand:V8HF 2 "reg_or_0_operand")
+ (const_int 3)))]
+  "TARGET_SSE")
+
+(define_expand "v2hf3"
+  [(set (match_operand:V2HF 0 "register_operand")
+   (plusminusmult:V2HF

[PATCH] Fix ICE in rtl check when bootstrap.

2023-08-07 Thread liuhongt via Gcc-patches

/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/libgfortran/generated/matmul_i1.c:
 In function ‘matmul_i1_avx512f’:
/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/libgfortran/generated/matmul_i1.c:1781:1:
 internal compiler error: RTL check: expected elt 0 type 'i' or 'n', have 'w' 
(rtx const_int) in vpternlog_redundant_operand_mask, at 
config/i386/i386.cc:19460
 1781 | }
  | ^
0x5559de26dc2d rtl_check_failed_type2(rtx_def const*, int, int, int, char 
const*, int, char const*)

/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/gcc/rtl.cc:761
0x5559de340bfe vpternlog_redundant_operand_mask(rtx_def**)

/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/gcc/config/i386/i386.cc:19460
0x5559dfec67a6 split_44

/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/gcc/config/i386/sse.md:12730
0x5559dfec67a6 split_63

/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/gcc/config/i386/sse.md:28428
0x5559deb8a682 try_split(rtx_def*, rtx_insn*, int)

/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/gcc/emit-rtl.cc:3800
0x5559deb8adf2 try_split(rtx_def*, rtx_insn*, int)

/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/gcc/emit-rtl.cc:3972
0x5559def69194 split_insn

/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/gcc/recog.cc:3385
0x5559def70c57 split_all_insns()

/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/gcc/recog.cc:3489
0x5559def70d0c execute

/var/tmp/portage/sys-devel/gcc-14.0.0_pre20230806/work/gcc-14-20230806/gcc/recog.cc:4413

Use INTVAL (imm_op) instead of XINT (imm_op, 0).

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386-protos.h (vpternlog_redundant_operand_mask):
  Adjust parameter type.
* config/i386/i386.cc (vpternlog_redundant_operand_mask): Use
  INTVAL instead of XINT, also adjust parameter type from rtx* to
  rtx since the function only needs operands[4] in vpternlog
  pattern.
(substitute_vpternlog_operands): Pass operands[4] instead of
  operands to vpternlog_redundant_operand_mask
* config/i386/sse.md: Ditto.
---
 gcc/config/i386/i386-protos.h | 2 +-
 gcc/config/i386/i386.cc   | 6 +++---
 gcc/config/i386/sse.md| 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index e547ee64587..fc2f1f13b78 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -70,7 +70,7 @@ extern machine_mode ix86_cc_mode (enum rtx_code, rtx, rtx);
 extern int avx_vpermilp_parallel (rtx par, machine_mode mode);
 extern int avx_vperm2f128_parallel (rtx par, machine_mode mode);
 
-extern int vpternlog_redundant_operand_mask (rtx[]);
+extern int vpternlog_redundant_operand_mask (rtx);
 extern void substitute_vpternlog_operands (rtx[]);
 
 extern bool ix86_expand_strlen (rtx, rtx, rtx, rtx);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 8cd26eb54fa..50860050049 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -19454,10 +19454,10 @@ avx_vperm2f128_parallel (rtx par, machine_mode mode)
 /* Return a mask of VPTERNLOG operands that do not affect output.  */
 
 int
-vpternlog_redundant_operand_mask (rtx *operands)
+vpternlog_redundant_operand_mask (rtx pternlog_imm)
 {
   int mask = 0;
-  int imm8 = XINT (operands[4], 0);
+  int imm8 = INTVAL (pternlog_imm);
 
   if (((imm8 >> 4) & 0x0F) == (imm8 & 0x0F))
 mask |= 1;
@@ -19475,7 +19475,7 @@ vpternlog_redundant_operand_mask (rtx *operands)
 void
 substitute_vpternlog_operands (rtx *operands)
 {
-  int mask = vpternlog_redundant_operand_mask (operands);
+  int mask = vpternlog_redundant_operand_mask (operands[4]);
 
   if (mask & 1) /* The first operand is redundant.  */
 operands[1] = operands[2];
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 7e2aa3f995c..c53450fd965 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -12706,7 +12706,7 @@ (define_split
   (match_operand:V 3 "memory_operand")
   (match_operand:SI 4 "const_0_to_255_operand")]
  UNSPEC_VTERNLOG))]
-  "!reload_completed && vpternlog_redundant_operand_mask (operands) == 3"
+  "!reload_completed && vpternlog_redundant_operand_mask (operands[4]) == 3"
   [(set (match_dup 0)
(match_dup 3))
(set (match_dup 0)
@@ -12727,7 +12727,7 @@ (define_split
   (match_operand:V 3 "nonimmediate_operand")
   (match_operand:SI 4 "const_0_to_255_operand")]
  UNSPEC_VTERNLOG))]
-  "!reload_completed && vpternlog_redundant_operand_mask (operands) != 0"
+  "!reload_completed && vpternlog_redundant_operand_mask (operands[4]) != 0"
   [(set (match_dup 0)

[PATCH] Optimize vlddqu + inserti128 to vbroadcasti128

2023-08-01 Thread liuhongt via Gcc-patches

In [1], I proposed a patch to generate vmovdqu for all vlddqu intrinsics
after AVX2; it was rejected with the following rationale:
> The instruction is reachable only as __builtin_ia32_lddqu* (aka
> _mm_lddqu_si*), so it was chosen by the programmer for a reason. I
> think that in this case, the compiler should not be too smart and
> change the instruction behind the programmer's back. The caveats are
> also explained at length in the ISA manual.

So this patch is more conservative: it only optimizes vlddqu + vinserti128
to vbroadcasti128.
Compared to vbroadcasti128, vlddqu + vinserti128 uses a shuffle port in
addition to the load port. From a latency perspective, vbroadcasti128 is no
worse than vlddqu + vinserti128.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2023-July/625122.html

Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/sse.md (*avx2_lddqu_inserti_to_bcasti): New
pre_reload define_insn_and_split.

gcc/testsuite/ChangeLog:

* gcc.target/i386/vlddqu_vinserti128.c: New test.
---
 gcc/config/i386/sse.md | 18 ++
 .../gcc.target/i386/vlddqu_vinserti128.c   | 11 +++
 2 files changed, 29 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 2d81347c7b6..4bdd2b43ba7 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -26600,6 +26600,24 @@ (define_insn "avx2_vbroadcasti128_"
(set_attr "prefix" "vex,evex,evex")
(set_attr "mode" "OI")])
 
+;; optimize vlddqu + vinserti128 to vbroadcasti128, the former will use
+;; extra shuffle port in addition to load port than the latter.
+;; For latency perspective,vbroadcasti is no worse.
+(define_insn_and_split "avx2_lddqu_inserti_to_bcasti"
+  [(set (match_operand:V4DI 0 "register_operand" "=x,v,v")
+   (vec_concat:V4DI
+ (subreg:V2DI
+   (unspec:V16QI [(match_operand:V16QI 1 "memory_operand")]
+ UNSPEC_LDDQU) 0)
+ (subreg:V2DI (unspec:V16QI [(match_dup 1)]
+ UNSPEC_LDDQU) 0)))]
+  "TARGET_AVX2 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (vec_concat:V4DI (match_dup 1) (match_dup 1)))]
+  "operands[1] = adjust_address (operands[1], V2DImode, 0);")
+
 ;; Modes handled by AVX vec_dup patterns.
 (define_mode_iterator AVX_VEC_DUP_MODE
   [V8SI V8SF V4DI V4DF])
diff --git a/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c 
b/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
new file mode 100644
index 000..29699a5fa7f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2" } */
+/* { dg-final { scan-assembler-times "vbroadcasti128" 1 } } */
+/* { dg-final { scan-assembler-not {(?n)vlddqu.*xmm} } } */
+
+#include <immintrin.h>
+__m256i foo(void *data) {
+__m128i X1 = _mm_lddqu_si128((__m128i*)data);
+__m256i V1 = _mm256_broadcastsi128_si256 (X1);
+return V1;
+}
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Support vec_fmaddsub/vec_fmsubadd for vector HFmode.

2023-08-01 Thread liuhongt via Gcc-patches

AVX512FP16 supports vfmaddsubXXXph and vfmsubaddXXXph.
Also remove scalar mode from fmaddsub/fmsubadd pattern since there's
no scalar instruction for that.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

PR target/81904
* config/i386/sse.md (vec_fmaddsub4): Extend to vector
HFmode, use mode iterator VFH instead.
(vec_fmsubadd4): Ditto.
(fma_fmaddsub_):
Remove scalar mode from iterator, use VFH_AVX512VL instead.
(fma_fmsubadd_):
Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr81904.c: New test.
---
 gcc/config/i386/sse.md  | 44 -
 gcc/testsuite/gcc.target/i386/pr81904.c | 22 +
 2 files changed, 44 insertions(+), 22 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr81904.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 51961bbfc0b..4e75c9addaa 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -5803,21 +5803,21 @@ (define_insn "_fnmsub__mask3"
 ;; But this doesn't seem useful in practice.
 
 (define_expand "vec_fmaddsub4"
-  [(set (match_operand:VF 0 "register_operand")
-   (unspec:VF
- [(match_operand:VF 1 "nonimmediate_operand")
-  (match_operand:VF 2 "nonimmediate_operand")
-  (match_operand:VF 3 "nonimmediate_operand")]
+  [(set (match_operand:VFH 0 "register_operand")
+   (unspec:VFH
+ [(match_operand:VFH 1 "nonimmediate_operand")
+  (match_operand:VFH 2 "nonimmediate_operand")
+  (match_operand:VFH 3 "nonimmediate_operand")]
  UNSPEC_FMADDSUB))]
   "TARGET_FMA || TARGET_FMA4 || ( == 64 || TARGET_AVX512VL)")
 
 (define_expand "vec_fmsubadd4"
-  [(set (match_operand:VF 0 "register_operand")
-   (unspec:VF
- [(match_operand:VF 1 "nonimmediate_operand")
-  (match_operand:VF 2 "nonimmediate_operand")
-  (neg:VF
-(match_operand:VF 3 "nonimmediate_operand"))]
+  [(set (match_operand:VFH 0 "register_operand")
+   (unspec:VFH
+ [(match_operand:VFH 1 "nonimmediate_operand")
+  (match_operand:VFH 2 "nonimmediate_operand")
+  (neg:VFH
+(match_operand:VFH 3 "nonimmediate_operand"))]
  UNSPEC_FMADDSUB))]
   "TARGET_FMA || TARGET_FMA4 || ( == 64 || TARGET_AVX512VL)")
 
@@ -5877,11 +5877,11 @@ (define_insn "*fma_fmaddsub_"
(set_attr "mode" "")])
 
 (define_insn "fma_fmaddsub_"
-  [(set (match_operand:VFH_SF_AVX512VL 0 "register_operand" "=v,v,v")
-   (unspec:VFH_SF_AVX512VL
- [(match_operand:VFH_SF_AVX512VL 1 "" "%0,0,v")
-  (match_operand:VFH_SF_AVX512VL 2 "" 
",v,")
-  (match_operand:VFH_SF_AVX512VL 3 "" 
"v,,0")]
+  [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v,v,v")
+   (unspec:VFH_AVX512VL
+ [(match_operand:VFH_AVX512VL 1 "" "%0,0,v")
+  (match_operand:VFH_AVX512VL 2 "" 
",v,")
+  (match_operand:VFH_AVX512VL 3 "" 
"v,,0")]
  UNSPEC_FMADDSUB))]
   "TARGET_AVX512F &&  && 
"
   "@
@@ -5943,12 +5943,12 @@ (define_insn "*fma_fmsubadd_"
(set_attr "mode" "")])
 
 (define_insn "fma_fmsubadd_"
-  [(set (match_operand:VFH_SF_AVX512VL 0 "register_operand" "=v,v,v")
-   (unspec:VFH_SF_AVX512VL
- [(match_operand:VFH_SF_AVX512VL   1 "" "%0,0,v")
-  (match_operand:VFH_SF_AVX512VL   2 "" 
",v,")
-  (neg:VFH_SF_AVX512VL
-(match_operand:VFH_SF_AVX512VL 3 "" 
"v,,0"))]
+  [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v,v,v")
+   (unspec:VFH_AVX512VL
+ [(match_operand:VFH_AVX512VL   1 "" "%0,0,v")
+  (match_operand:VFH_AVX512VL   2 "" 
",v,")
+  (neg:VFH_AVX512VL
+(match_operand:VFH_AVX512VL 3 "" 
"v,,0"))]
  UNSPEC_FMADDSUB))]
   "TARGET_AVX512F &&  && 
"
   "@
diff --git a/gcc/testsuite/gcc.target/i386/pr81904.c 
b/gcc/testsuite/gcc.target/i386/pr81904.c
new file mode 100644
index 000..9f5ad0bd952
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr81904.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -O2 -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler-times "vfmaddsub...ph\[ 
\t\]+\[^\n\]*%zmm\[0-9\]" 1 } } */
+/* { dg-final { scan-assembler-times "vfmsubadd...ph\[ 
\t\]+\[^\n\]*%zmm\[0-9\]" 1 } } */
+
+void vec_fmaddsub_fp16(int n, _Float16 da_r, _Float16 *x, _Float16* y, 
_Float16* __restrict z)
+{
+  for (int i = 0; i < 32; i += 2)
+{
+  z[i] =  da_r * x[i] - y[i];
+  z[i+1]  =  da_r * x[i+1] + y[i+1];
+}
+}
+
+void vec_fmasubadd_fp16(int n, _Float16 da_r, _Float16 *x, _Float16* y, 
_Float16* __restrict z)
+{
+  for (int i = 0; i < 32; i += 2)
+{
+  z[i] =  da_r * x[i] + y[i];
+  z[i+1]  =  da_r * x[i+1] - y[i+1];
+}
+}
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Adjust testcase for more optimal codegen.

2023-07-31 Thread liuhongt via Gcc-patches

After
b9d7140c80bd3c7355b8291bb46f0895dcd8c3cb is the first bad commit
commit b9d7140c80bd3c7355b8291bb46f0895dcd8c3cb
Author: Jan Hubicka 
Date:   Fri Jul 28 09:16:09 2023 +0200

loop-split improvements, part 1

Now we have
vpbroadcastd %ecx, %xmm0
vpaddd .LC3(%rip), %xmm0, %xmm0
vpextrd $3, %xmm0, %eax
vmovddup %xmm3, %xmm0
vrndscalepd $9, %xmm0, %xmm0
vunpckhpd %xmm0, %xmm0, %xmm3

For vrndscalepd, there is no need to insert pxor since it reuses the input
register xmm0, which avoids the partial SSE dependence.

Pushed to trunk.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr87007-4.c: Adjust testcase.
* gcc.target/i386/pr87007-5.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/pr87007-4.c | 6 +++---
 gcc/testsuite/gcc.target/i386/pr87007-5.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr87007-4.c 
b/gcc/testsuite/gcc.target/i386/pr87007-4.c
index e91bdcbac44..23b5c5dcc52 100644
--- a/gcc/testsuite/gcc.target/i386/pr87007-4.c
+++ b/gcc/testsuite/gcc.target/i386/pr87007-4.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
-/* { dg-options "-Ofast -march=skylake-avx512 -mfpmath=sse" } */
-
+/* { dg-options "-O2 -march=skylake-avx512 -mfpmath=sse" } */
+/* Load of d2/d3 is hoisted out, vrndscalesd will reuse loades register to 
avoid partial dependence.  */
 
 #include<math.h>
 
@@ -15,4 +15,4 @@ foo (int n, int k)
   d1 = ceil (d3);
 }
 
-/* { dg-final { scan-assembler-times "vxorps\[^\n\r\]*xmm\[0-9\]" 1 } } */
+/* { dg-final { scan-assembler-times "vxorps\[^\n\r\]*xmm\[0-9\]" 0 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr87007-5.c 
b/gcc/testsuite/gcc.target/i386/pr87007-5.c
index 20d13cf650b..b0b0a7b70ef 100644
--- a/gcc/testsuite/gcc.target/i386/pr87007-5.c
+++ b/gcc/testsuite/gcc.target/i386/pr87007-5.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
-/* { dg-options "-Ofast -march=skylake-avx512 -mfpmath=sse" } */
-
+/* { dg-options "-O2 -march=skylake-avx512 -mfpmath=sse" } */
+/* Load of d2/d3 is hoisted out, vrndscalesd will reuse loades register to 
avoid partial dependence.  */
 
 #include<math.h>
 
@@ -15,4 +15,4 @@ foo (int n, int k)
   d1 = sqrt (d3);
 }
 
-/* { dg-final { scan-assembler-times "vxorps\[^\n\r\]*xmm\[0-9\]" 1 } } */
+/* { dg-final { scan-assembler-times "vxorps\[^\n\r\]*xmm\[0-9\]" 0 } } */
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] [x86] Add UNSPEC_MASKOP to vpbroadcastm pattern.

2023-07-27 Thread liuhongt via Gcc-patches

Prevent rtl optimization of vec_duplicate + zero_extend to
vpbroadcastm since there could be an extra kmov after RA.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ready to push to trunk.

gcc/ChangeLog:

PR target/110788
* config/i386/sse.md (avx512cd_maskb_vec_dup): Add
UNSPEC_MASKOP.
(avx512cd_maskw_vec_dup): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110788.c: New test.
---
 gcc/config/i386/sse.md   |  8 ++--
 gcc/testsuite/gcc.target/i386/pr110788.c | 11 +++
 2 files changed, 17 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110788.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 35fd66ed4aa..51961bbfc0b 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -26778,11 +26778,14 @@ (define_insn 
"avx512dq_broadcast_1"
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+;; Use unspec to prevent rtl optimizer to optimize zero_extend + vec_duplicate
+;; to pbroadcastm, there could be an extra kmov after RA.
 (define_insn "avx512cd_maskb_vec_dup"
   [(set (match_operand:VI8_AVX512VL 0 "register_operand" "=v")
(vec_duplicate:VI8_AVX512VL
  (zero_extend:DI
-   (match_operand:QI 1 "register_operand" "k"]
+   (match_operand:QI 1 "register_operand" "k"
+   (unspec [(const_int 0)] UNSPEC_MASKOP)]
   "TARGET_AVX512CD"
   "vpbroadcastmb2q\t{%1, %0|%0, %1}"
   [(set_attr "type" "mskmov")
@@ -26793,7 +26796,8 @@ (define_insn "avx512cd_maskw_vec_dup"
   [(set (match_operand:VI4_AVX512VL 0 "register_operand" "=v")
(vec_duplicate:VI4_AVX512VL
  (zero_extend:SI
-   (match_operand:HI 1 "register_operand" "k"]
+   (match_operand:HI 1 "register_operand" "k"
+   (unspec [(const_int 0)] UNSPEC_MASKOP)]
   "TARGET_AVX512CD"
   "vpbroadcastmw2d\t{%1, %0|%0, %1}"
   [(set_attr "type" "mskmov")
diff --git a/gcc/testsuite/gcc.target/i386/pr110788.c 
b/gcc/testsuite/gcc.target/i386/pr110788.c
new file mode 100644
index 000..4cf1676ccb6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110788.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=cascadelake --param vect-partial-vector-usage=2" } 
*/
+/* { dg-final { scan-assembler-not "vpbroadcastm" } } */
+
+double a[1024], b[1024];
+
+void foo (int n)
+{
+  for (int i = 0; i < n; ++i)
+a[i] = b[i] * 3.;
+}
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Optimize vlddqu to vmovdqu for TARGET_AVX

2023-07-20 Thread liuhongt via Gcc-patches

For Intel processors, after TARGET_AVX, vmovdqu is optimized as fast
as vlddqu, UNSPEC_LDDQU can be removed to enable more optimizations.
Can someone confirm this with AMD folks?
If AMD doesn't like such optimization, I'll put my optimization under
micro-architecture tuning.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
If AMD also likes such an optimization, OK for trunk?

gcc/ChangeLog:

* config/i386/sse.md (_lddqu): Change to
define_expand, expand as simple move when TARGET_AVX
&& ( == 16 || !TARGET_AVX256_SPLIT_UNALIGNED_LOAD).
The original define_insn is renamed to
..
(_lddqu): .. this.

gcc/testsuite/ChangeLog:

* gcc.target/i386/vlddqu_vinserti128.c: New test.
---
 gcc/config/i386/sse.md| 15 ++-
 .../gcc.target/i386/vlddqu_vinserti128.c  | 11 +++
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 2d81347c7b6..d571a78f4c4 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1835,7 +1835,20 @@ (define_peephole2
   [(set (match_dup 4) (match_dup 1))]
   "operands[4] = adjust_address (operands[0], V2DFmode, 0);")
 
-(define_insn "_lddqu"
+(define_expand "_lddqu"
+  [(set (match_operand:VI1 0 "register_operand")
+   (unspec:VI1 [(match_operand:VI1 1 "memory_operand")]
+   UNSPEC_LDDQU))]
+  "TARGET_SSE3"
+{
+  if (TARGET_AVX && ( == 16 || !TARGET_AVX256_SPLIT_UNALIGNED_LOAD))
+{
+  emit_move_insn (operands[0], operands[1]);
+  DONE;
+}
+})
+
+(define_insn "*_lddqu"
   [(set (match_operand:VI1 0 "register_operand" "=x")
(unspec:VI1 [(match_operand:VI1 1 "memory_operand" "m")]
UNSPEC_LDDQU))]
diff --git a/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c 
b/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
new file mode 100644
index 000..29699a5fa7f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vlddqu_vinserti128.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2" } */
+/* { dg-final { scan-assembler-times "vbroadcasti128" 1 } } */
+/* { dg-final { scan-assembler-not {(?n)vlddqu.*xmm} } } */
+
+#include <immintrin.h>
+__m256i foo(void *data) {
+__m128i X1 = _mm_lddqu_si128((__m128i*)data);
+__m256i V1 = _mm256_broadcastsi128_si256 (X1);
+return V1;
+}
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Fix fp16 related testcase failure for i686.

2023-07-19 Thread liuhongt via Gcc-patches

> I see some regressions most likely with this change on i686-linux,
> in particular:
> +FAIL: gcc.dg/pr107547.c (test for excess errors)
> +FAIL: gcc.dg/torture/floatn-convert.c  -O0 (test for excess errors)
> +UNRESOLVED: gcc.dg/torture/floatn-convert.c  -O0 compilation failed to 
> produce executable
> +FAIL: gcc.dg/torture/floatn-convert.c  -O1 (test for excess errors)
> +UNRESOLVED: gcc.dg/torture/floatn-convert.c  -O1 compilation failed to 
> produce executable
> +FAIL: gcc.dg/torture/floatn-convert.c  -O2 (test for excess errors)
> +UNRESOLVED: gcc.dg/torture/floatn-convert.c  -O2 compilation failed to 
> produce executable
> +FAIL: gcc.dg/torture/floatn-convert.c  -O2 -flto (test for excess errors)
> +UNRESOLVED: gcc.dg/torture/floatn-convert.c  -O2 -flto compilation failed to 
> produce executable
> +FAIL: gcc.dg/torture/floatn-convert.c  -O2 -flto -flto-partition=none (test 
> for excess errors)
> +UNRESOLVED: gcc.dg/torture/floatn-convert.c  -O2 -flto -flto-partition=none 
> compilation failed to produce executable
> +FAIL: gcc.dg/torture/floatn-convert.c  -O3 -fomit-frame-pointer 
> -funroll-loops -fpeel-loops -ftracer -finline-functions (test for excess 
> errors)
> +UNRESOLVED: gcc.dg/torture/floatn-convert.c  -O3 -fomit-frame-pointer 
> -funroll-loops -fpeel-loops -ftracer -finline-functions compilation failed to 
> produce executable
> +FAIL: gcc.dg/torture/floatn-convert.c  -O3 -g (test for excess errors)
> +UNRESOLVED: gcc.dg/torture/floatn-convert.c  -O3 -g compilation failed to 
> produce executable
> +FAIL: gcc.dg/torture/floatn-convert.c  -Os (test for excess errors)
> +UNRESOLVED: gcc.dg/torture/floatn-convert.c  -Os compilation failed to 
> produce executable
> +FAIL: gcc.target/i386/float16-7.c (test for errors, line 7)
>

> Perhaps we need to tweak
> gcc/testsuite/lib/target-supports.exp (add_options_for_float16)
> so that it adds -msse2 for i?86-*-* x86_64-*-* (that would likely
> fix up floatn-convert) and for the others perhaps
> /* { dg-add-options float16 } */
> ?

I've verified the change fixed those failures.
Ready to push to trunk if there are no objections.

gcc/testsuite/ChangeLog:

* gcc.dg/pr107547.c: Add { dg-add-options float16 }.
* gcc.target/i386/float16-7.c: Add -msse2 to dg-options.
* lib/target-supports.exp (add_options_for_float16): Add
-msse2 for i?86-*-* || x86_64-*-*.
---
 gcc/testsuite/gcc.dg/pr107547.c   | 1 +
 gcc/testsuite/gcc.target/i386/float16-7.c | 2 +-
 gcc/testsuite/lib/target-supports.exp | 3 +++
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/pr107547.c b/gcc/testsuite/gcc.dg/pr107547.c
index c6992c8674b..7cd68afc0af 100644
--- a/gcc/testsuite/gcc.dg/pr107547.c
+++ b/gcc/testsuite/gcc.dg/pr107547.c
@@ -1,6 +1,7 @@
 /* PR tree-optimization/107547 */
 /* { dg-do compile } */
 /* { dg-options "-O2" } */
+/* { dg-add-options float16 } */
 
 int x;
 
diff --git a/gcc/testsuite/gcc.target/i386/float16-7.c 
b/gcc/testsuite/gcc.target/i386/float16-7.c
index 86641afeba9..660021b6ccc 100644
--- a/gcc/testsuite/gcc.target/i386/float16-7.c
+++ b/gcc/testsuite/gcc.target/i386/float16-7.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mfpmath=387 -fexcess-precision=16" } */
+/* { dg-options "-O2 -msse2 -mfpmath=387 -fexcess-precision=16" } */
 /* { dg-excess-errors "'-fexcess-precision=16' is not compatible with 
'-mfpmath=387'" } */
 _Float16
 foo (_Float16 a, _Float16 b)
diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 8ea0d9feb1c..42024474091 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3487,6 +3487,9 @@ proc add_options_for_float16 { flags } {
 if { [istarget arm*-*-*] } {
return "$flags -mfp16-format=ieee"
 }
+if { [istarget i?86-*-*] || [istarget x86_64-*-*] } {
+   return "$flags -msse2"
+}
 return "$flags"
 }
 
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Remove # from one_cmpl2 assemble output.

2023-07-17 Thread liuhongt via Gcc-patches

optimize_insn_for_speed_p () in the assembly output is not aligned with
the splitter condition, and it causes an ICE when building SPEC2017
blender_r.

Not sure if ctrl is supposed to be reliable in the assembly output; the patch just
removes that check as a workaround.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ready to push to trunk.

libpng/pngread.c: In function ‘png_read_image’:
libpng/pngread.c:786:1: internal compiler error: in final_scan_insn_1, at 
final.cc:2813
  786 | }
  | ^
0x73ac3d final_scan_insn_1
../../gcc/final.cc:2813
0xb3420b final_scan_insn(rtx_insn*, _IO_FILE*, int, int, int*)
../../gcc/final.cc:2887
0xb344c4 final_1
../../gcc/final.cc:1979
0xb34f64 rest_of_handle_final
../../gcc/final.cc:4240
0xb34f64 execute
../../gcc/final.cc:4318

gcc/ChangeLog:

PR target/110438
* config/i386/sse.md (one_cmpl2):
Remove # from assemble output.
---
 gcc/config/i386/sse.md | 4 
 1 file changed, 4 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 6bf9c99a2c1..e1158c5717a 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17220,10 +17220,6 @@ (define_insn_and_split 
"one_cmpl2"
|| mode == SImode
|| mode == DImode)"
 {
-  if (! && which_alternative
-  && optimize_insn_for_speed_p ())
-return "#";
-
   if (TARGET_AVX512VL)
 return "vpternlog\t{$0x55, %1, %0, 
%0|%0, %0, %1, 0x55}";
   else
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Fix typo in the testcase.

2023-07-11 Thread liuhongt via Gcc-patches

Antony Polukhin 2023-07-11 09:51:58 UTC
There's a typo at 
https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=gcc/testsuite/g%2B%2B.target/i386/pr110170.C;h=e638b12a5ee2264ecef77acca86432a9f24b103b;hb=d41a57c46df6f8f7dae0c0a8b349e734806a837b#l87

It should be `|| !test3() || !test3r()` rather than `|| !test3() || !test4r()`

Committed as an obvious fix.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr110170.C: Fix typo.
---
 gcc/testsuite/g++.target/i386/pr110170.C | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/g++.target/i386/pr110170.C 
b/gcc/testsuite/g++.target/i386/pr110170.C
index e638b12a5ee..21cca8f3805 100644
--- a/gcc/testsuite/g++.target/i386/pr110170.C
+++ b/gcc/testsuite/g++.target/i386/pr110170.C
@@ -84,7 +84,7 @@ TEST()
   if (
   !test1() || !test1r()
   || !test2() || !test2r()
-  || !test3() || !test4r()
+  || !test3() || !test3r()
   || !test4() || !test4r()
   ) __builtin_abort();
 }
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Add peephole to eliminate redundant comparison after cmpccxadd.

2023-07-11 Thread liuhongt via Gcc-patches

Similar to what we did for CMPXCHG, but extended to all
ix86_comparison_int_operator since CMPCCXADD sets EFLAGS exactly the same
as CMP.

When operand order in CMP insn is same as that in CMPCCXADD,
CMP insn can be eliminated directly.

When operand order is swapped in CMP insn, only optimize
cmpccxadd + cmpl + jcc/setcc to cmpccxadd + jcc/setcc when FLAGS_REG is dead
after jcc/setcc plus adjusting code for jcc/setcc.

gcc/ChangeLog:

PR target/110591
* config/i386/sync.md (cmpccxadd_): Adjust the pattern
to explicitly set FLAGS_REG like *cmp_1, also add extra
3 define_peephole2 after the pattern.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110591.c: New test.
* gcc.target/i386/pr110591-2.c: New test.
---
 gcc/config/i386/sync.md| 160 -
 gcc/testsuite/gcc.target/i386/pr110591-2.c |  90 
 gcc/testsuite/gcc.target/i386/pr110591.c   |  66 +
 3 files changed, 315 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110591-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110591.c

diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md
index e1fa1504deb..e84226cf895 100644
--- a/gcc/config/i386/sync.md
+++ b/gcc/config/i386/sync.md
@@ -1093,7 +1093,9 @@ (define_insn "cmpccxadd_"
  UNSPECV_CMPCCXADD))
(set (match_dup 1)
(unspec_volatile:SWI48x [(const_int 0)] UNSPECV_CMPCCXADD))
-   (clobber (reg:CC FLAGS_REG))]
+   (set (reg:CC FLAGS_REG)
+   (compare:CC (match_dup 1)
+   (match_dup 2)))]
   "TARGET_CMPCCXADD && TARGET_64BIT"
 {
   char buf[128];
@@ -1105,3 +1107,159 @@ (define_insn "cmpccxadd_"
   output_asm_insn (buf, operands);
   return "";
 })
+
+(define_peephole2
+  [(set (match_operand:SWI48x 0 "register_operand")
+   (match_operand:SWI48x 1 "x86_64_general_operand"))
+   (parallel [(set (match_dup 0)
+  (unspec_volatile:SWI48x
+[(match_operand:SWI48x 2 "memory_operand")
+ (match_dup 0)
+ (match_operand:SWI48x 3 "register_operand")
+ (match_operand:SI 4 "const_int_operand")]
+UNSPECV_CMPCCXADD))
+ (set (match_dup 2)
+  (unspec_volatile:SWI48x [(const_int 0)] UNSPECV_CMPCCXADD))
+ (set (reg:CC FLAGS_REG)
+  (compare:CC (match_dup 2)
+  (match_dup 0)))])
+   (set (reg FLAGS_REG)
+   (compare (match_operand:SWI48x 5 "register_operand")
+(match_operand:SWI48x 6 "x86_64_general_operand")))]
+  "TARGET_CMPCCXADD && TARGET_64BIT
+   && rtx_equal_p (operands[0], operands[5])
+   && rtx_equal_p (operands[1], operands[6])"
+  [(set (match_dup 0)
+   (match_dup 1))
+   (parallel [(set (match_dup 0)
+  (unspec_volatile:SWI48x
+[(match_dup 2)
+ (match_dup 0)
+ (match_dup 3)
+ (match_dup 4)]
+UNSPECV_CMPCCXADD))
+ (set (match_dup 2)
+  (unspec_volatile:SWI48x [(const_int 0)] UNSPECV_CMPCCXADD))
+ (set (reg:CC FLAGS_REG)
+  (compare:CC (match_dup 2)
+  (match_dup 0)))])
+   (set (match_dup 7)
+   (match_op_dup 8
+ [(match_dup 9) (const_int 0)]))])
+
+(define_peephole2
+  [(set (match_operand:SWI48x 0 "register_operand")
+   (match_operand:SWI48x 1 "x86_64_general_operand"))
+   (parallel [(set (match_dup 0)
+  (unspec_volatile:SWI48x
+[(match_operand:SWI48x 2 "memory_operand")
+ (match_dup 0)
+ (match_operand:SWI48x 3 "register_operand")
+ (match_operand:SI 4 "const_int_operand")]
+UNSPECV_CMPCCXADD))
+ (set (match_dup 2)
+  (unspec_volatile:SWI48x [(const_int 0)] UNSPECV_CMPCCXADD))
+ (set (reg:CC FLAGS_REG)
+  (compare:CC (match_dup 2)
+  (match_dup 0)))])
+   (set (reg FLAGS_REG)
+   (compare (match_operand:SWI48x 5 "register_operand")
+(match_operand:SWI48x 6 "x86_64_general_operand")))
+   (set (match_operand:QI 7 "nonimmediate_operand")
+   (match_operator:QI 8 "ix86_comparison_int_operator"
+ [(reg FLAGS_REG) (const_int 0)]))]
+  "TARGET_CMPCCXADD && TARGET_64BIT
+   && rtx_equal_p (operands[0], operands[6])
+   && rtx_equal_p (operands[1], operands[5])
+   && peep2_regno_dead_p (4, FLAGS_REG)"
+  [(set (match_dup 0)
+   (match_dup 1))
+   (parallel [(set (match_dup 0)
+  (unspec_volatile:SWI48x
+[(match_dup 2)
+ (match_dup 0)
+ (match_dup 3)
+ (match_dup 4)]
+UNSPECV_CMPCCXADD))
+ (set (match_dup 2)
+

[PATCH v2] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0'

2023-07-10 Thread liuhongt via Gcc-patches

Here's updated patch.
1. use optimize_insn_for_speed_p instead of using optimize_function_for_speed_p.
2. explicitly move memory to dest register to avoid false dependence in 
one_cmpl pattern.


False dependency happens when destination is only updated by
pternlog. There is no false dependency when destination is also used
in source. So either a pxor should be inserted, or input operand
should be set with constraint '0'.

gcc/ChangeLog:

PR target/110438
PR target/110202
* config/i386/predicates.md
(int_float_vector_all_ones_operand): New predicate.
* config/i386/sse.md (*vmov_constm1_pternlog_false_dep): New
define_insn.
(*_cvtmask2_pternlog_false_dep):
Ditto.
(*_cvtmask2_pternlog_false_dep):
Ditto.
(*_cvtmask2): Adjust to
define_insn_and_split to avoid false dependence.
(*_cvtmask2): Ditto.
(one_cmpl2): Adjust constraint
of operands 1 to '0' to avoid false dependence.
(*andnot3): Ditto.
(iornot3): Ditto.
(*3): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110438.c: New test.
* gcc.target/i386/pr100711.c: Adjust testcase.
---
 gcc/config/i386/predicates.md  |   8 +-
 gcc/config/i386/sse.md | 145 ++---
 gcc/testsuite/gcc.target/i386/pr100711-6.c |   2 +-
 gcc/testsuite/gcc.target/i386/pr110438.c   |  30 +
 4 files changed, 168 insertions(+), 17 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 7ddbe01a6f9..37d20c6303a 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1192,12 +1192,18 @@ (define_predicate "float_vector_all_ones_operand"
 return false;
 })
 
-/* Return true if operand is a vector constant that is all ones. */
+/* Return true if operand is an integral vector constant that is all ones. */
 (define_predicate "vector_all_ones_operand"
   (and (match_code "const_vector")
(match_test "INTEGRAL_MODE_P (GET_MODE (op))")
(match_test "op == CONSTM1_RTX (GET_MODE (op))")))
 
+/* Return true if operand is a vector constant that is all ones. */
+(define_predicate "int_float_vector_all_ones_operand"
+  (ior (match_operand 0 "vector_all_ones_operand")
+   (match_operand 0 "float_vector_all_ones_operand")
+   (match_test "op == constm1_rtx")))
+
 /* Return true if operand is an 128/256bit all ones vector
that zero-extends to 256/512bit.  */
 (define_predicate "vector_all_ones_zero_extend_half_operand"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 418c337a775..05485b1792d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1382,6 +1382,29 @@ (define_insn "mov_internal"
  ]
  (symbol_ref "true")))])
 
+; False dependency happens on destination register which is not really
+; used when moving all ones to vector register
+(define_split
+  [(set (match_operand:VMOVE 0 "register_operand")
+   (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
+  "TARGET_AVX512F && reload_completed
+  && ( == 64 || EXT_REX_SSE_REG_P (operands[0]))
+  && optimize_insn_for_speed_p ()"
+  [(set (match_dup 0) (match_dup 2))
+   (parallel
+ [(set (match_dup 0) (match_dup 1))
+  (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[2] = CONST0_RTX (mode);")
+
+(define_insn "*vmov_constm1_pternlog_false_dep"
+  [(set (match_operand:VMOVE 0 "register_operand" "=v")
+   (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" 
""))
+   (unspec [(match_operand:VMOVE 2 "register_operand" "0")] 
UNSPEC_INSN_FALSE_DEP)]
+   "TARGET_AVX512VL ||  == 64"
+   "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix" "evex")])
+
 ;; If mem_addr points to a memory region with less than whole vector size bytes
 ;; of accessible memory and k is a mask that would prevent reading the 
inaccessible
 ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to 
vpblendd
@@ -9336,7 +9359,7 @@ (define_expand "_cvtmask2"
 operands[3] = CONST0_RTX (mode);
   }")
 
-(define_insn "*_cvtmask2"
+(define_insn_and_split "*_cvtmask2"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VI48_AVX512VL
  (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
@@ -9346,11 +9369,35 @@ (define_insn "*_cvtmask2"
   "@
vpmovm2\t{%1, %0|%0, %1}
vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, 
%0, 0x81}"
+  "&& !TARGET_AVX512DQ && reload_completed
+   && optimize_function_for_speed_p (cfun)"
+  [(set (match_dup 0) (match_dup 4))
+   (parallel
+[(set (match_dup 0)
+ (vec_merge:VI48_AVX512VL
+   (match_dup 2)
+   (match_dup 3)
+   (match_dup 1)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[4] =

[PATCH] Add peephole to eliminate redundant comparison after cmpccxadd.

2023-07-10 Thread liuhongt via Gcc-patches

Similar to what we did for cmpxchg, but extended to all
ix86_comparison_int_operator since cmpccxadd sets EFLAGS exactly the same
as CMP.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,},
Ok for trunk?

gcc/ChangeLog:

PR target/110591
* config/i386/sync.md (cmpccxadd_): Add a new
define_peephole2 after the pattern.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110591.c: New test.
---
 gcc/config/i386/sync.md  | 56 
 gcc/testsuite/gcc.target/i386/pr110591.c | 66 
 2 files changed, 122 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110591.c

diff --git a/gcc/config/i386/sync.md b/gcc/config/i386/sync.md
index e1fa1504deb..43f6421bcb8 100644
--- a/gcc/config/i386/sync.md
+++ b/gcc/config/i386/sync.md
@@ -1105,3 +1105,59 @@ (define_insn "cmpccxadd_"
   output_asm_insn (buf, operands);
   return "";
 })
+
+(define_peephole2
+  [(set (match_operand:SWI48x 0 "register_operand")
+   (match_operand:SWI48x 1 "x86_64_general_operand"))
+   (parallel [(set (match_dup 0)
+  (unspec_volatile:SWI48x
+[(match_operand:SWI48x 2 "memory_operand")
+ (match_dup 0)
+ (match_operand:SWI48x 3 "register_operand")
+ (match_operand:SI 4 "const_int_operand")]
+UNSPECV_CMPCCXADD))
+ (set (match_dup 2)
+  (unspec_volatile:SWI48x [(const_int 0)] UNSPECV_CMPCCXADD))
+ (clobber (reg:CC FLAGS_REG))])
+   (set (reg FLAGS_REG)
+   (compare (match_operand:SWI48x 5 "register_operand")
+(match_operand:SWI48x 6 "x86_64_general_operand")))
+   (set (match_operand:QI 7 "nonimmediate_operand")
+   (match_operator:QI 8 "ix86_comparison_int_operator"
+ [(reg FLAGS_REG) (const_int 0)]))]
+  "TARGET_CMPCCXADD && TARGET_64BIT
+   && ((rtx_equal_p (operands[0], operands[5])
+   && rtx_equal_p (operands[1], operands[6]))
+   || ((rtx_equal_p (operands[0], operands[6])
+   && rtx_equal_p (operands[1], operands[5]))
+  && peep2_regno_dead_p (4, FLAGS_REG)))"
+  [(set (match_dup 0)
+   (match_dup 1))
+   (parallel [(set (match_dup 0)
+  (unspec_volatile:SWI48x
+[(match_dup 2)
+ (match_dup 0)
+ (match_dup 3)
+ (match_dup 4)]
+UNSPECV_CMPCCXADD))
+ (set (match_dup 2)
+  (unspec_volatile:SWI48x [(const_int 0)] UNSPECV_CMPCCXADD))
+ (clobber (reg:CC FLAGS_REG))])
+   (set (match_dup 7)
+   (match_op_dup 8
+ [(match_dup 9) (const_int 0)]))]
+{
+  operands[9] = gen_rtx_REG (GET_MODE (XEXP (operands[8], 0)), FLAGS_REG);
+  if (rtx_equal_p (operands[0], operands[6])
+ && rtx_equal_p (operands[1], operands[5])
+ && swap_condition (GET_CODE (operands[8])) != GET_CODE (operands[8]))
+ {
+   operands[8] = shallow_copy_rtx (operands[8]);
+   enum rtx_code ccode = swap_condition (GET_CODE (operands[8]));
+   PUT_CODE (operands[8], ccode);
+   operands[9] = gen_rtx_REG (SELECT_CC_MODE (ccode,
+ operands[6],
+ operands[5]),
+  FLAGS_REG);
+ }
+})
diff --git a/gcc/testsuite/gcc.target/i386/pr110591.c 
b/gcc/testsuite/gcc.target/i386/pr110591.c
new file mode 100644
index 000..32a515b429e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110591.c
@@ -0,0 +1,66 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mcmpccxadd -O2" } */
+/* { dg-final { scan-assembler-not {cmp[lq]?[ \t]+} } } */
+/* { dg-final { scan-assembler-times {cmpoxadd[ \t]+} 12 } } */
+
+#include 
+
+_Bool foo_setg (int *ptr, int v)
+{
+return _cmpccxadd_epi32(ptr, v, 1, _CMPCCX_O) > v;
+}
+
+_Bool foo_setl (int *ptr, int v)
+{
+return _cmpccxadd_epi32(ptr, v, 1, _CMPCCX_O) < v;
+}
+
+_Bool foo_sete(int *ptr, int v)
+{
+return _cmpccxadd_epi32(ptr, v, 1, _CMPCCX_O) == v;
+}
+
+_Bool foo_setne(int *ptr, int v)
+{
+return _cmpccxadd_epi32(ptr, v, 1, _CMPCCX_O) != v;
+}
+
+_Bool foo_setge(int *ptr, int v)
+{
+return _cmpccxadd_epi32(ptr, v, 1, _CMPCCX_O) >= v;
+}
+
+_Bool foo_setle(int *ptr, int v)
+{
+return _cmpccxadd_epi32(ptr, v, 1, _CMPCCX_O) <= v;
+}
+
+_Bool fooq_setg (long long *ptr, long long v)
+{
+return _cmpccxadd_epi64(ptr, v, 1, _CMPCCX_O) > v;
+}
+
+_Bool fooq_setl (long long *ptr, long long v)
+{
+return _cmpccxadd_epi64(ptr, v, 1, _CMPCCX_O) < v;
+}
+
+_Bool fooq_sete(long long *ptr, long long v)
+{
+return _cmpccxadd_epi64(ptr, v, 1, _CMPCCX_O) == v;
+}
+
+_Bool fooq_setne(long long *ptr, long long v)
+{
+return _cmpccxadd_epi64(ptr, v, 1, _CMPCCX_O) != v;
+}
+
+_Bool fooq_setge(long long *ptr, long long v)
+{
+return _cmpccxadd_epi64(ptr, v, 1, _CMPCCX_O) >= v;
+}
+

[PATCH] Break false dependence for vpternlog by inserting vpxor or setting constraint of input operand to '0'

2023-07-09 Thread liuhongt via Gcc-patches

False dependency happens when destination is only updated by
pternlog. There is no false dependency when destination is also used
in source. So either a pxor should be inserted, or input operand
should be set with constraint '0'.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

PR target/110438
PR target/110202
* config/i386/predicates.md
(int_float_vector_all_ones_operand): New predicate.
* config/i386/sse.md (*vmov_constm1_pternlog_false_dep): New
define_insn.
(*_cvtmask2_pternlog_false_dep):
Ditto.
(*_cvtmask2_pternlog_false_dep):
Ditto.
(*_cvtmask2): Adjust to
define_insn_and_split to avoid false dependence.
(*_cvtmask2): Ditto.
(one_cmpl2): Adjust constraint
of operands 1 to '0' to avoid false dependence.
(*andnot3): Ditto.
(iornot3): Ditto.
(*3): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110438.c: New test.
---
 gcc/config/i386/predicates.md|   8 +-
 gcc/config/i386/sse.md   | 113 ---
 gcc/testsuite/gcc.target/i386/pr110438.c |  30 ++
 3 files changed, 135 insertions(+), 16 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 7ddbe01a6f9..37d20c6303a 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1192,12 +1192,18 @@ (define_predicate "float_vector_all_ones_operand"
 return false;
 })
 
-/* Return true if operand is a vector constant that is all ones. */
+/* Return true if operand is an integral vector constant that is all ones. */
 (define_predicate "vector_all_ones_operand"
   (and (match_code "const_vector")
(match_test "INTEGRAL_MODE_P (GET_MODE (op))")
(match_test "op == CONSTM1_RTX (GET_MODE (op))")))
 
+/* Return true if operand is a vector constant that is all ones. */
+(define_predicate "int_float_vector_all_ones_operand"
+  (ior (match_operand 0 "vector_all_ones_operand")
+   (match_operand 0 "float_vector_all_ones_operand")
+   (match_test "op == constm1_rtx")))
+
 /* Return true if operand is an 128/256bit all ones vector
that zero-extends to 256/512bit.  */
 (define_predicate "vector_all_ones_zero_extend_half_operand"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 418c337a775..56920a3e1d3 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1382,6 +1382,29 @@ (define_insn "mov_internal"
  ]
  (symbol_ref "true")))])
 
+; False dependency happens on destination register which is not really
+; used when moving all ones to vector register
+(define_split
+  [(set (match_operand:VMOVE 0 "register_operand")
+   (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
+  "TARGET_AVX512F && reload_completed
+  && ( == 64 || EXT_REX_SSE_REG_P (operands[0]))
+  && optimize_function_for_speed_p (cfun)"
+  [(set (match_dup 0) (match_dup 2))
+   (parallel
+ [(set (match_dup 0) (match_dup 1))
+  (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[2] = CONST0_RTX (mode);")
+
+(define_insn "*vmov_constm1_pternlog_false_dep"
+  [(set (match_operand:VMOVE 0 "register_operand" "=v")
+   (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" 
""))
+   (unspec [(match_operand:VMOVE 2 "register_operand" "0")] 
UNSPEC_INSN_FALSE_DEP)]
+   "TARGET_AVX512VL ||  == 64"
+   "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix" "evex")])
+
 ;; If mem_addr points to a memory region with less than whole vector size bytes
 ;; of accessible memory and k is a mask that would prevent reading the 
inaccessible
 ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to 
vpblendd
@@ -9336,7 +9359,7 @@ (define_expand "_cvtmask2"
 operands[3] = CONST0_RTX (mode);
   }")
 
-(define_insn "*_cvtmask2"
+(define_insn_and_split "*_cvtmask2"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VI48_AVX512VL
  (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
@@ -9346,11 +9369,35 @@ (define_insn "*_cvtmask2"
   "@
vpmovm2\t{%1, %0|%0, %1}
vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, 
%0, 0x81}"
+  "&& !TARGET_AVX512DQ && reload_completed
+   && optimize_function_for_speed_p (cfun)"
+  [(set (match_dup 0) (match_dup 4))
+   (parallel
+[(set (match_dup 0)
+ (vec_merge:VI48_AVX512VL
+   (match_dup 2)
+   (match_dup 3)
+   (match_dup 1)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[4] = CONST0_RTX (mode);"
   [(set_attr "isa" "avx512dq,*")
(set_attr "length_immediate" "0,1")
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+(define_insn "*_cvtmask2_pternlog_false_dep"
+  [(set

[PATCH V2] [x86] Add pre_reload splitter to detect fp min/max pattern.

2023-07-06 Thread liuhongt via Gcc-patches

> Please split the above pattern into two, one emitting UNSPEC_IEEE_MAX
> and the other emitting UNSPEC_IEEE_MIN.
Split.

> The test involves blendv instruction, which is SSE4.1, so it is
> pointless to test it without -msse4.1. Please add -msse4.1 instead of
> -march=x86_64 and use sse4_runtime target selector, as is the case
> with gcc.target/i386/pr90358.c.
Changed.

> Please also use -msse4.1 instead of -march here. With -mfpmath=sse,
> the test is valid also for 32bit targets, you should use -msseregparm
> additional options for ia32 (please see gcc.target/i386/pr43546.c
> testcase) in the same way as -mregparm to pass SSE arguments in
> registers.
The 32-bit target still fails to do condition elimination for DFmode due to
the code below in rtx_cost

  /* A size N times larger than UNITS_PER_WORD likely needs N times as
 many insns, taking N times as long.  */
  factor = mode_size > UNITS_PER_WORD ? mode_size / UNITS_PER_WORD : 1;

It looks like a separate issue for DFmode operation under 32-bit target.

I've enabled 32-bit for the testcase, but it currently only scans for
minss/maxss.

Here's updated patch.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

We have ix86_expand_sse_fp_minmax to detect min/max semantics, but
it requires rtx_equal_p for cmp_op0/cmp_op1 and if_true/if_false. For
the testcase in the PR, there's an extra move from cmp_op0 to if_true,
so it fails the ix86_expand_sse_fp_minmax check.

This patch adds pre_reload splitter to detect the min/max pattern.

Operands order in MINSS matters for signed zero and NANs, since the
instruction always returns second operand when any operand is NAN or
both operands are zero.

gcc/ChangeLog:

PR target/110170
* config/i386/i386.md (*ieee_max3_1): New pre_reload
splitter to detect fp max pattern.
(*ieee_min3_1): Ditto, but for fp min pattern.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr110170.C: New test.
* gcc.target/i386/pr110170.c: New test.
---
 gcc/config/i386/i386.md  | 43 +
 gcc/testsuite/g++.target/i386/pr110170.C | 78 
 gcc/testsuite/gcc.target/i386/pr110170.c | 21 +++
 3 files changed, 142 insertions(+)
 create mode 100644 gcc/testsuite/g++.target/i386/pr110170.C
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110170.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index a82cc353cfd..6f415f899ae 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -23163,6 +23163,49 @@ (define_insn "*ieee_s3"
(set_attr "type" "sseadd")
(set_attr "mode" "")])
 
+;; Operands order in min/max instruction matters for signed zero and NANs.
+(define_insn_and_split "*ieee_max3_1"
+  [(set (match_operand:MODEF 0 "register_operand")
+   (unspec:MODEF
+ [(match_operand:MODEF 1 "register_operand")
+  (match_operand:MODEF 2 "register_operand")
+  (lt:MODEF
+(match_operand:MODEF 3 "register_operand")
+(match_operand:MODEF 4 "register_operand"))]
+ UNSPEC_BLENDV))]
+  "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH
+  && (rtx_equal_p (operands[1], operands[3])
+  && rtx_equal_p (operands[2], operands[4]))
+  && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (unspec:MODEF
+ [(match_dup 2)
+  (match_dup 1)]
+UNSPEC_IEEE_MAX))])
+
+(define_insn_and_split "*ieee_min3_1"
+  [(set (match_operand:MODEF 0 "register_operand")
+   (unspec:MODEF
+ [(match_operand:MODEF 1 "register_operand")
+  (match_operand:MODEF 2 "register_operand")
+  (lt:MODEF
+(match_operand:MODEF 3 "register_operand")
+(match_operand:MODEF 4 "register_operand"))]
+ UNSPEC_BLENDV))]
+  "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH
+  && (rtx_equal_p (operands[1], operands[4])
+  && rtx_equal_p (operands[2], operands[3]))
+  && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (unspec:MODEF
+ [(match_dup 2)
+  (match_dup 1)]
+UNSPEC_IEEE_MIN))])
+
 ;; Make two stack loads independent:
 ;;   fld aa  fld aa
 ;;   fld %st(0) ->   fld bb
diff --git a/gcc/testsuite/g++.target/i386/pr110170.C 
b/gcc/testsuite/g++.target/i386/pr110170.C
new file mode 100644
index 000..5d6842270d0
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr110170.C
@@ -0,0 +1,78 @@
+/* { dg-do run } */
+/* { dg-options " -O2 -msse4.1 -mfpmath=sse -std=gnu++20" } */
+#include 
+
+void
+__attribute__((noinline))
+__cond_swap(double* __x, double* __y) {
+  bool __r = (*__x < *__y);
+  auto __tmp = __r ? *__x : *__y;
+  *__y = __r ? *__y : *__x;
+  *__x = __tmp;
+}
+
+auto test1() {
+double nan = -0.0;
+double x = 0.0;
+__cond_swap(, );
+return x == -0.0 && nan == 0.0;
+}
+
+auto test1r() {
+double nan = NAN;
+double x = 1.0;
+__cond_swap(, );
+return isnan(x) && signbit(x) == 0 && nan == 1.0;
+}
+
+auto

[PATCH 1/2] [x86] Add pre_reload splitter to detect fp min/max pattern.

2023-07-05 Thread liuhongt via Gcc-patches

We have ix86_expand_sse_fp_minmax to detect min/max semantics, but
it requires rtx_equal_p for cmp_op0/cmp_op1 and if_true/if_false. For
the testcase in the PR, there's an extra move from cmp_op0 to if_true,
so it fails the ix86_expand_sse_fp_minmax check.

This patch adds pre_reload splitter to detect the min/max pattern.

Operands order in MINSS matters for signed zero and NANs, since the
instruction always returns second operand when any operand is NAN or
both operands are zero.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/110170
* config/i386/i386.md (*ieee_minmax3_1): New pre_reload
splitter to detect fp min/max pattern.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr110170.C: New test.
* gcc.target/i386/pr110170.c: New test.
---
 gcc/config/i386/i386.md  | 30 +
 gcc/testsuite/g++.target/i386/pr110170.C | 78 
 gcc/testsuite/gcc.target/i386/pr110170.c | 18 ++
 3 files changed, 126 insertions(+)
 create mode 100644 gcc/testsuite/g++.target/i386/pr110170.C
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110170.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e6ebc461e52..353bb21993d 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -22483,6 +22483,36 @@ (define_insn "*ieee_s3"
(set_attr "type" "sseadd")
(set_attr "mode" "")])
 
+;; Operands order in min/max instruction matters for signed zero and NANs.
+(define_insn_and_split "*ieee_minmax3_1"
+  [(set (match_operand:MODEF 0 "register_operand")
+   (unspec:MODEF
+ [(match_operand:MODEF 1 "register_operand")
+  (match_operand:MODEF 2 "register_operand")
+  (lt:MODEF
+(match_operand:MODEF 3 "register_operand")
+(match_operand:MODEF 4 "register_operand"))]
+ UNSPEC_BLENDV))]
+  "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH
+  && ((rtx_equal_p (operands[1], operands[3])
+   && rtx_equal_p (operands[2], operands[4]))
+  || (rtx_equal_p (operands[1], operands[4])
+ && rtx_equal_p (operands[2], operands[3])))
+  && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  int u = (rtx_equal_p (operands[1], operands[3])
+  && rtx_equal_p (operands[2], operands[4]))
+  ? UNSPEC_IEEE_MAX : UNSPEC_IEEE_MIN;
+  emit_move_insn (operands[0],
+ gen_rtx_UNSPEC (mode,
+ gen_rtvec (2, operands[2], operands[1]),
+ u));
+  DONE;
+})
+
 ;; Make two stack loads independent:
 ;;   fld aa  fld aa
 ;;   fld %st(0) ->   fld bb
diff --git a/gcc/testsuite/g++.target/i386/pr110170.C 
b/gcc/testsuite/g++.target/i386/pr110170.C
new file mode 100644
index 000..1e9a781ca74
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr110170.C
@@ -0,0 +1,78 @@
+/* { dg-do run } */
+/* { dg-options " -O2 -march=x86-64 -mfpmath=sse -std=gnu++20" } */
+#include 
+
+void
+__attribute__((noinline))
+__cond_swap(double* __x, double* __y) {
+  bool __r = (*__x < *__y);
+  auto __tmp = __r ? *__x : *__y;
+  *__y = __r ? *__y : *__x;
+  *__x = __tmp;
+}
+
+auto test1() {
+double nan = -0.0;
+double x = 0.0;
+__cond_swap(, );
+return x == -0.0 && nan == 0.0;
+}
+
+auto test1r() {
+double nan = NAN;
+double x = 1.0;
+__cond_swap(, );
+return isnan(x) && signbit(x) == 0 && nan == 1.0;
+}
+
+auto test2() {
+double nan = NAN;
+double x = -1.0;
+__cond_swap(, );
+return isnan(x) && signbit(x) == 0 && nan == -1.0;
+}
+
+auto test2r() {
+double nan = NAN;
+double x = -1.0;
+__cond_swap(, );
+return isnan(x) && signbit(x) == 0 && nan == -1.0;
+}
+
+auto test3() {
+double nan = -NAN;
+double x = 1.0;
+__cond_swap(, );
+return isnan(x) && signbit(x) == 1 && nan == 1.0;
+}
+
+auto test3r() {
+double nan = -NAN;
+double x = 1.0;
+__cond_swap(, );
+return isnan(x) && signbit(x) == 1 && nan == 1.0;
+}
+
+auto test4() {
+double nan = -NAN;
+double x = -1.0;
+__cond_swap(, );
+return isnan(x) && signbit(x) == 1 && nan == -1.0;
+}
+
+auto test4r() {
+double nan = -NAN;
+double x = -1.0;
+__cond_swap(, );
+return isnan(x) && signbit(x) == 1 && nan == -1.0;
+}
+
+
+int main() {
+if (
+!test1() || !test1r()
+|| !test2() || !test2r()
+|| !test3() || !test4r()
+|| !test4() || !test4r()
+) __builtin_abort();
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr110170.c 
b/gcc/testsuite/gcc.target/i386/pr110170.c
new file mode 100644
index 000..0f98545cce3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110170.c
@@ -0,0 +1,18 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options " -O2 -march=x86-64-v2 -mfpmath=sse" } */
+/* { dg-final { scan-assembler-times {(?n)mins[sd]} 2 } } */
+/* { dg-final { scan-assembler-times {(?n)maxs[sd]} 2 } } */
+
+void

[PATCH 2/2] Adjust rtx_cost for DF/SFmode AND/IOR/XOR/ANDN operations.

2023-07-05 Thread liuhongt via Gcc-patches

They should have the same cost as vector mode since both generate
pand/pandn/pxor/por instructions.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.cc (ix86_rtx_costs): Adjust rtx_cost for
DF/SFmode AND/IOR/XOR/ANDN operations.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110170-2.c: New test.
---
 gcc/config/i386/i386.cc|  6 --
 gcc/testsuite/gcc.target/i386/pr110170-2.c | 16 
 2 files changed, 20 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110170-2.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d4ff56ee8dd..fe31acd7646 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21153,7 +21153,8 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
 
 case IOR:
 case XOR:
-  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+ || SSE_FLOAT_MODE_P (mode))
*total = ix86_vec_cost (mode, cost->sse_op);
   else if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
*total = cost->add * 2;
@@ -21167,7 +21168,8 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
  *total = cost->lea;
  return true;
}
-  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+  else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+  || SSE_FLOAT_MODE_P (mode))
{
  /* pandn is a single instruction.  */
  if (GET_CODE (XEXP (x, 0)) == NOT)
diff --git a/gcc/testsuite/gcc.target/i386/pr110170-2.c 
b/gcc/testsuite/gcc.target/i386/pr110170-2.c
new file mode 100644
index 000..d43e322fc49
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110170-2.c
@@ -0,0 +1,16 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-msse2 -O2 -mfpmath=sse" } */
+/* { dg-final { scan-assembler-not "comi" } }  */
+
+double
+foo (double* a, double* b, double c, double d)
+{
+  return *a < *b ? c : d;
+}
+
+float
+foo1 (float* a, float* b, float c, float d)
+{
+  return *a < *b ? c : d;
+}
+
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Disparage slightly for the alternative which move DFmode between SSE_REGS and GENERAL_REGS.

2023-07-05 Thread liuhongt via Gcc-patches

For testcase

void __cond_swap(double* __x, double* __y) {
  bool __r = (*__x < *__y);
  auto __tmp = __r ? *__x : *__y;
  *__y = __r ? *__y : *__x;
  *__x = __tmp;
}

GCC-14 with -O2 and -march=x86-64 options generates the following code:

__cond_swap(double*, double*):
movsd   xmm1, QWORD PTR [rdi]
movsd   xmm0, QWORD PTR [rsi]
comisd  xmm0, xmm1
jbe .L2
movqrax, xmm1
movapd  xmm1, xmm0
movqxmm0, rax
.L2:
movsd   QWORD PTR [rsi], xmm1
movsd   QWORD PTR [rdi], xmm0
ret

rax is used to save and restore the DFmode value. In RA, both GENERAL_REGS
and SSE_REGS cost zero since we didn't disparage the
alternative in the movdf_internal pattern, so according to the register
allocation order, GENERAL_REGS is allocated. The patch adds ? for
alternatives (r,v) and (v,r), just like we did for the movsf/hf/bf_internal
patterns; after that we get optimal RA.

__cond_swap:
.LFB0:
.cfi_startproc
movsd   (%rdi), %xmm1
movsd   (%rsi), %xmm0
comisd  %xmm1, %xmm0
jbe .L2
movapd  %xmm1, %xmm2
movapd  %xmm0, %xmm1
movapd  %xmm2, %xmm0
.L2:
movsd   %xmm1, (%rsi)
movsd   %xmm0, (%rdi)
ret

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ok for trunk?


gcc/ChangeLog:

PR target/110170
* config/i386/i386.md (movdf_internal): Disparage slightly for
2 alternatives (r,v) and (v,r) by adding constraint modifier
'?'.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110170-3.c: New test.
---
 gcc/config/i386/i386.md|  4 ++--
 gcc/testsuite/gcc.target/i386/pr110170-3.c | 11 +++
 2 files changed, 13 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110170-3.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index a82cc353cfd..e47ced1bb70 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -3915,9 +3915,9 @@ (define_split
 ;; Possible store forwarding (partial memory) stall in alternatives 4, 6 and 7.
 (define_insn "*movdf_internal"
   [(set (match_operand:DF 0 "nonimmediate_operand"
-"=Yf*f,m   ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,r ,v,r  
,o ,r  ,m")
+"=Yf*f,m   ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,?r,?v,r 
 ,o ,r  ,m")
(match_operand:DF 1 "general_operand"
-"Yf*fm,Yf*f,G   ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x,v,r 
,roF,rF,rmF,rC"))]
+"Yf*fm,Yf*f,G   ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x, v, 
r,roF,rF,rmF,rC"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))
&& (lra_in_progress || reload_completed
|| !CONST_DOUBLE_P (operands[1])
diff --git a/gcc/testsuite/gcc.target/i386/pr110170-3.c 
b/gcc/testsuite/gcc.target/i386/pr110170-3.c
new file mode 100644
index 000..70daa89e9aa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110170-3.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -fno-if-conversion -fno-if-conversion2" } */
+/* { dg-final { scan-assembler-not {(?n)movq.*r} } } */
+
+void __cond_swap(double* __x, double* __y) {
+  _Bool __r = (*__x < *__y);
+  double __tmp = __r ? *__x : *__y;
+  *__y = __r ? *__y : *__x;
+  *__x = __tmp;
+}
+
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Break false dependence for vpternlog by inserting vpxor.

2023-07-03 Thread liuhongt via Gcc-patches

vpternlog is also used for optimizations which don't need any valid
input operand. In that case, the destination is used as an input in the
instruction, and that creates a false dependence.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

PR target/110438
* config/i386/predicates.md
(int_float_vector_all_ones_operand): New predicate.
* config/i386/sse.md (*vmov_constm1_pternlog): New
define_insn.
(*_cvtmask2): Adjust to
define_insn_and_split to avoid false dependence.
(*_cvtmask2_pternlog): New
define_insn.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110438.c: New test.
---
 gcc/config/i386/predicates.md|  8 ++-
 gcc/config/i386/sse.md   | 69 +++-
 gcc/testsuite/gcc.target/i386/pr110438.c | 30 +++
 3 files changed, 94 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110438.c

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index fb07707dcba..df0d9e20def 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1192,12 +1192,18 @@ (define_predicate "float_vector_all_ones_operand"
 return false;
 })
 
-/* Return true if operand is a vector constant that is all ones. */
+/* Return true if operand is an integral vector constant that is all ones. */
 (define_predicate "vector_all_ones_operand"
   (and (match_code "const_vector")
(match_test "INTEGRAL_MODE_P (GET_MODE (op))")
(match_test "op == CONSTM1_RTX (GET_MODE (op))")))
 
+/* Return true if operand is a vector constant that is all ones. */
+(define_predicate "int_float_vector_all_ones_operand"
+  (ior (match_operand 0 "vector_all_ones_operand")
+   (match_operand 0 "float_vector_all_ones_operand")
+   (match_test "op == constm1_rtx")))
+
 /* Return true if operand is an 128/256bit all ones vector
that zero-extends to 256/512bit.  */
 (define_predicate "vector_all_ones_zero_extend_half_operand"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 812cfca4b92..93cdd844026 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1382,6 +1382,28 @@ (define_insn "mov_internal"
  ]
  (symbol_ref "true")))])
 
+; False dependency happens on destination register which is not really
+; used when moving all ones to vector register
+(define_split
+  [(set (match_operand:VMOVE 0 "register_operand")
+   (match_operand:VMOVE 1 "int_float_vector_all_ones_operand"))]
+  "TARGET_AVX512F && reload_completed
+  && ( == 64 || EXT_REX_SSE_REG_P (operands[0]))"
+  [(set (match_dup 0) (match_dup 2))
+   (parallel
+ [(set (match_dup 0) (match_dup 1))
+  (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[2] = CONST0_RTX (mode);")
+
+(define_insn "*vmov_constm1_pternlog"
+  [(set (match_operand:VMOVE 0 "register_operand" "=v")
+   (match_operand:VMOVE 1 "int_float_vector_all_ones_operand" 
""))
+   (unspec [(match_operand:VMOVE 2 "register_operand" "0")] 
UNSPEC_INSN_FALSE_DEP)]
+   "TARGET_AVX512VL ||  == 64"
+   "vpternlogd\t{$0xFF, %0, %0, %0|%0, %0, %0, 0xFF}"
+  [(set_attr "type" "sselog1")
+   (set_attr "prefix" "evex")])
+
 ;; If mem_addr points to a memory region with less than whole vector size bytes
 ;; of accessible memory and k is a mask that would prevent reading the 
inaccessible
 ;; bytes from mem_addr, add UNSPEC_MASKLOAD to prevent it to be transformed to 
vpblendd
@@ -9336,7 +9358,7 @@ (define_expand "_cvtmask2"
 operands[3] = CONST0_RTX (mode);
   }")
 
-(define_insn "*_cvtmask2"
+(define_insn_and_split "*_cvtmask2"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VI48_AVX512VL
  (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
@@ -9345,12 +9367,35 @@ (define_insn "*_cvtmask2"
   "TARGET_AVX512F"
   "@
vpmovm2\t{%1, %0|%0, %1}
-   vpternlog\t{$0x81, %0, %0, %0%{%1%}%{z%}|%0%{%1%}%{z%}, %0, 
%0, 0x81}"
+   #"
+  "&& !TARGET_AVX512DQ && reload_completed"
+  [(set (match_dup 0) (match_dup 4))
+   (parallel
+[(set (match_dup 0)
+ (vec_merge:VI48_AVX512VL
+   (match_dup 2)
+   (match_dup 3)
+   (match_dup 1)))
+ (unspec [(match_dup 0)] UNSPEC_INSN_FALSE_DEP)])]
+  "operands[4] = CONST0_RTX (mode);"
   [(set_attr "isa" "avx512dq,*")
(set_attr "length_immediate" "0,1")
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+(define_insn "*_cvtmask2_pternlog"
+  [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
+   (vec_merge:VI48_AVX512VL
+ (match_operand:VI48_AVX512VL 2 "vector_all_ones_operand")
+ (match_operand:VI48_AVX512VL 3 "const0_operand")
+ (match_operand: 1 "register_operand" "Yk")))
+   (unspec [(match_operand:VI48_AVX512VL 4 "register_operand" "0")] 
UNSPEC_INSN_FALSE_DEP)]
+  "TARGET_AVX512F && !TARGET_AVX512DQ"
+

[PATCH 1/2] Don't issue vzeroupper for vzeroupper call_insn.

2023-06-26 Thread liuhongt via Gcc-patches

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/82735
* config/i386/i386.cc (ix86_avx_u128_mode_needed): Don't emit
vzeroupper for vzeroupper call_insn.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-vzeroupper-30.c: New test.
---
 gcc/config/i386/i386.cc   |  5 +++--
 gcc/testsuite/gcc.target/i386/avx-vzeroupper-30.c | 15 +++
 2 files changed, 18 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx-vzeroupper-30.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 0761965344b..caca74d6dec 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -14489,8 +14489,9 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
 modes wider than 256 bits.  It's only safe to issue a
 vzeroupper if all SSE registers are clobbered.  */
   const function_abi  = insn_callee_abi (insn);
-  if (!hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
- abi.mode_clobbers (V4DImode)))
+  if (vzeroupper_pattern (PATTERN (insn), VOIDmode)
+ || !hard_reg_set_subset_p (reg_class_contents[SSE_REGS],
+abi.mode_clobbers (V4DImode)))
return AVX_U128_ANY;
 
   return AVX_U128_CLEAN;
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-30.c 
b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-30.c
new file mode 100644
index 000..c1c9baa8fc4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-30.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx -mvzeroupper -dp" } */
+
+#include 
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+  x = y;
+  _mm256_zeroupper ();
+}
+
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH 2/2] Make option mvzeroupper independent of optimization level.

2023-06-26 Thread liuhongt via Gcc-patches

pass_insert_vzeroupper is under condition

TARGET_AVX && TARGET_VZEROUPPER
&& flag_expensive_optimizations && !optimize_size

But the documentation of -mvzeroupper doesn't mention that the insertion
requires -O2 and above, which may confuse users when they explicitly
use -Os -mvzeroupper.


mvzeroupper
Target Mask(VZEROUPPER) Save
Generate vzeroupper instruction before a transfer of control flow out of
the function.


The patch moves flag_expensive_optimizations && !optimize_size to
ix86_option_override_internal. It makes -mvzeroupper independent of
optimization level, but still keeps the behavior of architecture
tuning(emit_vzeroupper) unchanged.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386-features.cc (pass_insert_vzeroupper:gate):
Move flag_expensive_optimizations && !optimize_size to ..
* config/i386/i386-options.cc (ix86_option_override_internal):
.. this, it makes -mvzeroupper independent of optimization
level, but still keeps the behavior of architecture
tuning(emit_vzeroupper) unchanged.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx-vzeroupper-29.c: New testcase.
---
 gcc/config/i386/i386-features.cc  |  3 +--
 gcc/config/i386/i386-options.cc   |  4 +++-
 gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c | 14 ++
 3 files changed, 18 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 4a3b07ae045..92ae08d442e 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -2489,8 +2489,7 @@ public:
   /* opt_pass methods: */
   bool gate (function *) final override
 {
-  return TARGET_AVX && TARGET_VZEROUPPER
-   && flag_expensive_optimizations && !optimize_size;
+  return TARGET_AVX && TARGET_VZEROUPPER;
 }
 
   unsigned int execute (function *) final override
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 2cb0bddcd35..f76e7c5947b 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -2727,7 +2727,9 @@ ix86_option_override_internal (bool main_args_p,
 sorry ("%<-mcall-ms2sysv-xlogues%> isn%'t currently supported with SEH");
 
   if (!(opts_set->x_target_flags & MASK_VZEROUPPER)
-  && TARGET_EMIT_VZEROUPPER)
+  && TARGET_EMIT_VZEROUPPER
+  && flag_expensive_optimizations
+  && !optimize_size)
 opts->x_target_flags |= MASK_VZEROUPPER;
   if (!(opts_set->x_target_flags & MASK_STV))
 opts->x_target_flags |= MASK_STV;
diff --git a/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c 
b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c
new file mode 100644
index 000..4af637757f7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx-vzeroupper-29.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O0 -mavx -mtune=generic -mvzeroupper -dp" } */
+
+#include 
+
+extern __m256 x, y;
+
+void
+foo ()
+{
+  x = y;
+}
+
+/* { dg-final { scan-assembler-times "avx_vzeroupper" 1 } } */
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] [x86] Refine maskstore patterns with UNSPEC_MASKMOV.

2023-06-26 Thread liuhongt via Gcc-patches

At the rtl level, we cannot guarantee that the maskstore is not optimized
to other full-memory accesses, as the current implementations are equivalent
in terms of pattern. To solve this potential problem, this patch refines
the pattern of the maskstore and the intrinsics with unspec.

One thing I'm not sure about is VCOND_EXPR: should VCOND_EXPR also expect
fault suppression for masked-out elements?

Currently we're still using vec_merge for both AVX2 and AVX512 targets.


Similar to r14-2070-gc79476da46728e.

If mem_addr points to a memory region with less than whole vector size
bytes of accessible memory and k is a mask that would prevent reading
the inaccessible bytes from mem_addr, add UNSPEC_MASKMOV to prevent
it from being transformed into any other whole memory access instructions.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

PR rtl-optimization/110237
* config/i386/sse.md (_store_mask): Refine with
UNSPEC_MASKMOV.
(maskstore_store_mask): New define_insn, it's renamed
from original _store_mask.
---
 gcc/config/i386/sse.md | 69 ++
 1 file changed, 57 insertions(+), 12 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3b50c7117f8..812cfca4b92 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1608,7 +1608,7 @@ (define_insn "_blendm"
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
-(define_insn "_store_mask"
+(define_insn "*_store_mask"
   [(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m")
(vec_merge:V48_AVX512VL
  (match_operand:V48_AVX512VL 1 "register_operand" "v")
@@ -1636,7 +1636,7 @@ (define_insn "_store_mask"
(set_attr "memory" "store")
(set_attr "mode" "")])
 
-(define_insn "_store_mask"
+(define_insn "*_store_mask"
   [(set (match_operand:VI12HFBF_AVX512VL 0 "memory_operand" "=m")
(vec_merge:VI12HFBF_AVX512VL
  (match_operand:VI12HFBF_AVX512VL 1 "register_operand" "v")
@@ -27008,21 +27008,66 @@ (define_expand "maskstore"
   "TARGET_AVX")
 
 (define_expand "maskstore"
-  [(set (match_operand:V48H_AVX512VL 0 "memory_operand")
-   (vec_merge:V48H_AVX512VL
- (match_operand:V48H_AVX512VL 1 "register_operand")
- (match_dup 0)
- (match_operand: 2 "register_operand")))]
+  [(set (match_operand:V48_AVX512VL 0 "memory_operand")
+   (unspec:V48_AVX512VL
+ [(match_operand:V48_AVX512VL 1 "register_operand")
+  (match_dup 0)
+  (match_operand: 2 "register_operand")]
+ UNSPEC_MASKMOV))]
   "TARGET_AVX512F")
 
 (define_expand "maskstore"
-  [(set (match_operand:VI12_AVX512VL 0 "memory_operand")
-   (vec_merge:VI12_AVX512VL
- (match_operand:VI12_AVX512VL 1 "register_operand")
- (match_dup 0)
- (match_operand: 2 "register_operand")))]
+  [(set (match_operand:VI12HFBF_AVX512VL 0 "memory_operand")
+   (unspec:VI12HFBF_AVX512VL
+ [(match_operand:VI12HFBF_AVX512VL 1 "register_operand")
+  (match_dup 0)
+  (match_operand: 2 "register_operand")]
+ UNSPEC_MASKMOV))]
   "TARGET_AVX512BW")
 
+(define_insn "_store_mask"
+  [(set (match_operand:V48_AVX512VL 0 "memory_operand" "=m")
+   (unspec:V48_AVX512VL
+ [(match_operand:V48_AVX512VL 1 "register_operand" "v")
+  (match_dup 0)
+  (match_operand: 2 "register_operand" "Yk")]
+ UNSPEC_MASKMOV))]
+  "TARGET_AVX512F"
+{
+  if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
+{
+  if (misaligned_operand (operands[0], mode))
+   return "vmovu\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+  else
+   return "vmova\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+}
+  else
+{
+  if (misaligned_operand (operands[0], mode))
+   return "vmovdqu\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+  else
+   return "vmovdqa\t{%1, %0%{%2%}|%0%{%2%}, %1}";
+}
+}
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "memory" "store")
+   (set_attr "mode" "")])
+
+(define_insn "_store_mask"
+  [(set (match_operand:VI12HFBF_AVX512VL 0 "memory_operand" "=m")
+   (unspec:VI12HFBF_AVX512VL
+ [(match_operand:VI12HFBF_AVX512VL 1 "register_operand" "v")
+  (match_dup 0)
+  (match_operand: 2 "register_operand" "Yk")]
+  UNSPEC_MASKMOV))]
+  "TARGET_AVX512BW"
+  "vmovdqu\t{%1, %0%{%2%}|%0%{%2%}, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "memory" "store")
+   (set_attr "mode" "")])
+
 (define_expand "cbranch4"
   [(set (reg:CC FLAGS_REG)
(compare:CC (match_operand:VI48_AVX 1 "register_operand")
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Issue a warning for conversion between short and __bf16 under TARGET_AVX512BF16.

2023-06-26 Thread liuhongt via Gcc-patches

__bfloat16 is redefined from typedef short to real __bf16 since GCC
V13. The patch issues a warning for potential silent implicit
conversion between __bf16 and short where users may only expect a
data movement.

To avoid too many false positives, the warning is only issued under
TARGET_AVX512BF16.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.cc (ix86_invalid_conversion): New function.
(TARGET_INVALID_CONVERSION): Define as
ix86_invalid_conversion.

gcc/testsuite/ChangeLog:

* gcc.target/i386/bf16_short_warn.c: New test.
---
 gcc/config/i386/i386.cc   | 32 +++
 .../gcc.target/i386/bf16_short_warn.c | 17 ++
 2 files changed, 49 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/bf16_short_warn.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 0761965344b..dc02eac6203 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22718,6 +22718,35 @@ x86_emit_floatuns (rtx operands[2])
 
   emit_label (donelab);
 }
+
+/* Return the diagnostic message string if conversion from FROMTYPE to
+   TOTYPE is not allowed, NULL otherwise.
+   Currently it's used to warn for silent implicit conversion between __bf16
+   and short, since __bfloat16 is refined as real __bf16 instead of short
+   since GCC13.  */
+
+static const char *
+ix86_invalid_conversion (const_tree fromtype, const_tree totype)
+{
+  if (element_mode (fromtype) != element_mode (totype)
+  && (TARGET_AVX512BF16 || TARGET_AVXNECONVERT))
+{
+  /* Warn for silent implicit conversion where user may expect
+a bitcast.  */
+  if ((TYPE_MODE (fromtype) == BFmode
+  && TYPE_MODE (totype) == HImode)
+ || (TYPE_MODE (totype) == BFmode
+ && TYPE_MODE (fromtype) == HImode))
+   warning (0, "%<__bfloat16%> is redefined from typedef % "
+   "to real %<__bf16%> since GCC V13, be careful of "
+"implicit conversion between %<__bf16%> and %; "
+"a explicit bitcast may be needed here");
+}
+
+  /* Conversion allowed.  */
+  return NULL;
+}
+
 
 /* Target hook for scalar_mode_supported_p.  */
 static bool
@@ -25009,6 +25038,9 @@ ix86_run_selftests (void)
 #  define TARGET_MERGE_DECL_ATTRIBUTES merge_dllimport_decl_attributes
 #endif
 
+#undef TARGET_INVALID_CONVERSION
+#define TARGET_INVALID_CONVERSION ix86_invalid_conversion
+
 #undef TARGET_COMP_TYPE_ATTRIBUTES
 #define TARGET_COMP_TYPE_ATTRIBUTES ix86_comp_type_attributes
 
diff --git a/gcc/testsuite/gcc.target/i386/bf16_short_warn.c 
b/gcc/testsuite/gcc.target/i386/bf16_short_warn.c
new file mode 100644
index 000..3e47a815200
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/bf16_short_warn.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#include
+typedef struct {
+short payload;
+} BFloat16;
+
+__attribute__((target("avx512vl,avx512bf16")))
+BFloat16 tobf16_avx512(float f)
+{
+BFloat16 r;
+__m128bh m = _mm_cvtneps_pbh(_mm_set_ss(f));
+r.payload = m[0]; /* { dg-warning " be careful of implicit conversion 
between '__bf16' and 'short'" } */
+return r;
+}
+
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH 1/3] Use cvt_op to save intermediate type operand instead of "subtle" vec_dest.

2023-06-25 Thread liuhongt via Gcc-patches

When there are multiple operands in vec_oprnds0, vec_dest will be
overwritten with vectype_out, but in the multi_step_cvt case, cvt_type is
expected. This caused an ICE in verify_gimple_in_cfg.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and aarch64-linux-gnu.
Ok for trunk?

gcc/ChangeLog:

PR tree-optimization/110371
PR tree-optimization/110018
* tree-vect-stmts.cc (vectorizable_conversion): Use cvt_op to
save intermediate type operand instead of "subtle" vec_dest
for case NONE.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/pr110371.c: New test.
---
 gcc/testsuite/gcc.target/aarch64/pr110371.c | 20 
 gcc/tree-vect-stmts.cc  | 14 ++
 2 files changed, 30 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr110371.c

diff --git a/gcc/testsuite/gcc.target/aarch64/pr110371.c 
b/gcc/testsuite/gcc.target/aarch64/pr110371.c
new file mode 100644
index 000..444e514e04f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr110371.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+typedef struct dest
+{
+  double m[3][3];
+} dest;
+
+typedef struct src
+{
+  int m[3][3];
+} src;
+
+void
+foo (dest *a, src* s)
+{
+  for (int i = 0; i != 3; i++)
+for (int j = 0; j != 3; j++)
+  a->m[i][j] = s->m[i][j];
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 85d1f3ae52c..1748555a625 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -5044,7 +5044,7 @@ vectorizable_conversion (vec_info *vinfo,
 gimple **vec_stmt, slp_tree slp_node,
 stmt_vector_for_cost *cost_vec)
 {
-  tree vec_dest;
+  tree vec_dest, cvt_op = NULL_TREE;
   tree scalar_dest;
   tree op0, op1 = NULL_TREE;
   loop_vec_info loop_vinfo = dyn_cast  (vinfo);
@@ -5568,6 +5568,13 @@ vectorizable_conversion (vec_info *vinfo,
 case NONE:
   vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
 op0, _oprnds0);
+  /* vec_dest is intermediate type operand when multi_step_cvt.  */
+  if (multi_step_cvt)
+   {
+ cvt_op = vec_dest;
+ vec_dest = vec_dsts[0];
+   }
+
   FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
{
  /* Arguments are ready, create the new vector stmt.  */
@@ -5575,12 +5582,11 @@ vectorizable_conversion (vec_info *vinfo,
  if (multi_step_cvt)
{
  gcc_assert (multi_step_cvt == 1);
- new_stmt = vect_gimple_build (vec_dest, codecvt1, vop0);
- new_temp = make_ssa_name (vec_dest, new_stmt);
+ new_stmt = vect_gimple_build (cvt_op, codecvt1, vop0);
+ new_temp = make_ssa_name (cvt_op, new_stmt);
  gimple_assign_set_lhs (new_stmt, new_temp);
  vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
  vop0 = new_temp;
- vec_dest = vec_dsts[0];
}
  new_stmt = vect_gimple_build (vec_dest, code1, vop0);
  new_temp = make_ssa_name (vec_dest, new_stmt);
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH 3/3] [aarch64] Adjust testcase to match assembly output after r14-2007.

2023-06-25 Thread liuhongt via Gcc-patches

The new assembly looks better than the original one, so I adjusted those testcases.
Ok for trunk?

gcc/testsuite/ChangeLog:

PR tree-optimization/110371
PR tree-optimization/110018
* gcc.target/aarch64/sve/unpack_fcvt_signed_1.c: Scan scvt +
sxtw instead of scvt + zip1 + zip2.
* gcc.target/aarch64/sve/unpack_fcvt_unsigned_1.c: Scan scvt +
uxtw instead of ucvtf + zip1 + zip2.
---
 gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_signed_1.c | 6 +++---
 .../gcc.target/aarch64/sve/unpack_fcvt_unsigned_1.c | 5 ++---
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_signed_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_signed_1.c
index 0f96dc2ff00..5edc288ce35 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_signed_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_signed_1.c
@@ -10,6 +10,6 @@ unpack_double_int_plus8 (double *d, int32_t *s, int size)
 d[i] = s[i] + 8;
 }
 
-/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.s, z[0-9]+\.s, 
z[0-9]+\.s\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tzip2\tz[0-9]+\.s, z[0-9]+\.s, 
z[0-9]+\.s\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.d, p[0-7]/m, 
z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.d, p[0-7]/m, 
z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tsxtw\tz[0-9]+\.d, p[0-7]/m, 
z[0-9]+\.d\n} 1 } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_unsigned_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_unsigned_1.c
index 70465f91eba..ecd72176177 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_unsigned_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/unpack_fcvt_unsigned_1.c
@@ -10,6 +10,5 @@ unpack_double_int_plus9 (double *d, uint32_t *s, int size)
 d[i] = (double) (s[i] + 9);
 }
 
-/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.s, z[0-9]+\.s, 
z[0-9]+\.s\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tzip2\tz[0-9]+\.s, z[0-9]+\.s, 
z[0-9]+\.s\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tucvtf\tz[0-9]+\.d, p[0-7]/m, 
z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.d, p[0-7]/m, 
z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tuxtw\tz[0-9]+\.d, p[0-7]/m, 
z[0-9]+\.d\n} 1 } } */
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH 2/3] Don't use intermiediate type for FIX_TRUNC_EXPR when ftrapping-math.

2023-06-25 Thread liuhongt via Gcc-patches

> > Hmm, good question.  GENERIC has a direct truncation to unsigned char
> > for example, the C standard generally says if the integral part cannot
> > be represented then the behavior is undefined.  So I think we should be
> > safe here (0x1.0p32 doesn't fit an int).
>
> We should be following Annex F (unspecified value plus "invalid" exception
> for out-of-range floating-to-integer conversions rather than undefined
> behavior).  But we don't achieve that very well at present (see bug 93806
> comments 27-29 for examples of how such conversions produce wobbly
> values).

That would mean guarding this with !flag_trapping_math would be the appropriate
thing to do.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and aarch64-linux-gnu.
Ok for trunk?

gcc/ChangeLog:

PR tree-optimization/110371
PR tree-optimization/110018
* tree-vect-stmts.cc (vectorizable_conversion): Don't use
intermiediate type for FIX_TRUNC_EXPR when ftrapping-math.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110018-1.c: Add -fno-trapping-math to dg-options.
* gcc.target/i386/pr110018-2.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/pr110018-1.c | 2 +-
 gcc/testsuite/gcc.target/i386/pr110018-2.c | 2 +-
 gcc/tree-vect-stmts.cc | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr110018-1.c 
b/gcc/testsuite/gcc.target/i386/pr110018-1.c
index b6a3be7b7a2..24eeca60f6f 100644
--- a/gcc/testsuite/gcc.target/i386/pr110018-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr110018-1.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512fp16 -mavx512vl -O2 -mavx512dq" } */
+/* { dg-options "-mavx512fp16 -mavx512vl -O2 -mavx512dq -fno-trapping-math" } 
*/
 /* { dg-final { scan-assembler-times {(?n)vcvttp[dsh]2[dqw]} 5 } } */
 /* { dg-final { scan-assembler-times {(?n)vcvt[dqw]*2p[dsh]} 5 } } */
 
diff --git a/gcc/testsuite/gcc.target/i386/pr110018-2.c 
b/gcc/testsuite/gcc.target/i386/pr110018-2.c
index a663e074698..9a2d9e17894 100644
--- a/gcc/testsuite/gcc.target/i386/pr110018-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr110018-2.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-mavx512fp16 -mavx512vl -O2 -mavx512dq" } */
+/* { dg-options "-mavx512fp16 -mavx512vl -O2 -mavx512dq -fno-trapping-math" } 
*/
 /* { dg-final { scan-assembler-times {(?n)vcvttp[dsh]2[dqw]} 5 } } */
 /* { dg-final { scan-assembler-times {(?n)vcvt[dqw]*2p[dsh]} 5 } } */
 
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 1748555a625..bf61461939b 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -5263,7 +5263,8 @@ vectorizable_conversion (vec_info *vinfo,
   if ((code == FLOAT_EXPR
   && GET_MODE_SIZE (lhs_mode) > GET_MODE_SIZE (rhs_mode))
  || (code == FIX_TRUNC_EXPR
- && GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode)))
+ && GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode)
+ && !flag_trapping_math))
{
  bool float_expr_p = code == FLOAT_EXPR;
  scalar_mode imode = float_expr_p ? rhs_mode : lhs_mode;
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Refine maskloadmn pattern with UNSPEC_MASKLOAD.

2023-06-20 Thread liuhongt via Gcc-patches

If mem_addr points to a memory region with less than whole vector size
bytes of accessible memory and k is a mask that would prevent reading
the inaccessible bytes from mem_addr, add UNSPEC_MASKLOAD to prevent
it from being transformed into vpblendd.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to master.

gcc/ChangeLog:

PR target/110309
* config/i386/sse.md (maskload):
Refine pattern with UNSPEC_MASKLOAD.
(maskload): Ditto.
(*_load_mask): Extend mode iterator to
VI12HFBF_AVX512VL.
(*_load): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110309.c: New test.
---
 gcc/config/i386/sse.md   | 32 +---
 gcc/testsuite/gcc.target/i386/pr110309.c | 10 
 2 files changed, 28 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110309.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 87570357db6..4d1f7ac8d7e 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1465,12 +1465,12 @@ (define_expand "_load_mask"
 })
 
 (define_insn "*_load_mask"
-  [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
-   (vec_merge:VI12_AVX512VL
- (unspec:VI12_AVX512VL
-   [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")]
+  [(set (match_operand:VI12HFBF_AVX512VL 0 "register_operand" "=v")
+   (vec_merge:VI12HFBF_AVX512VL
+ (unspec:VI12HFBF_AVX512VL
+   [(match_operand:VI12HFBF_AVX512VL 1 "memory_operand" "m")]
UNSPEC_MASKLOAD)
- (match_operand:VI12_AVX512VL 2 "nonimm_or_0_operand" "0C")
+ (match_operand:VI12HFBF_AVX512VL 2 "nonimm_or_0_operand" "0C")
  (match_operand: 3 "register_operand" "Yk")))]
   "TARGET_AVX512BW"
   "vmovdqu\t{%1, %0%{%3%}%N2|%0%{%3%}%N2, %1}"
@@ -1479,9 +1479,9 @@ (define_insn "*_load_mask"
(set_attr "mode" "")])
 
 (define_insn_and_split "*_load"
-  [(set (match_operand:VI12_AVX512VL 0 "register_operand" "=v")
-   (unspec:VI12_AVX512VL
- [(match_operand:VI12_AVX512VL 1 "memory_operand" "m")]
+  [(set (match_operand:VI12HFBF_AVX512VL 0 "register_operand" "=v")
+   (unspec:VI12HFBF_AVX512VL
+ [(match_operand:VI12HFBF_AVX512VL 1 "memory_operand" "m")]
  UNSPEC_MASKLOAD))]
   "TARGET_AVX512BW"
   "#"
@@ -26883,17 +26883,21 @@ (define_expand "maskload"
   "TARGET_AVX")
 
 (define_expand "maskload"
-  [(set (match_operand:V48H_AVX512VL 0 "register_operand")
-   (vec_merge:V48H_AVX512VL
- (match_operand:V48H_AVX512VL 1 "memory_operand")
+  [(set (match_operand:V48_AVX512VL 0 "register_operand")
+   (vec_merge:V48_AVX512VL
+ (unspec:V48_AVX512VL
+   [(match_operand:V48_AVX512VL 1 "memory_operand")]
+   UNSPEC_MASKLOAD)
  (match_dup 0)
  (match_operand: 2 "register_operand")))]
   "TARGET_AVX512F")
 
 (define_expand "maskload"
-  [(set (match_operand:VI12_AVX512VL 0 "register_operand")
-   (vec_merge:VI12_AVX512VL
- (match_operand:VI12_AVX512VL 1 "memory_operand")
+  [(set (match_operand:VI12HFBF_AVX512VL 0 "register_operand")
+   (vec_merge:VI12HFBF_AVX512VL
+ (unspec:VI12HFBF_AVX512VL
+   [(match_operand:VI12HFBF_AVX512VL 1 "memory_operand")]
+   UNSPEC_MASKLOAD)
  (match_dup 0)
  (match_operand: 2 "register_operand")))]
   "TARGET_AVX512BW")
diff --git a/gcc/testsuite/gcc.target/i386/pr110309.c 
b/gcc/testsuite/gcc.target/i386/pr110309.c
new file mode 100644
index 000..f6e9e9c3c61
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110309.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 --param vect-partial-vector-usage=1 -march=znver4 
-mprefer-vector-width=256" } */
+/* { dg-final { scan-assembler-not {(?n)vpblendd.*ymm} } } */
+
+
+void foo (int * __restrict a, int *b)
+{
+  for (int i = 0; i < 6; ++i)
+a[i] = b[i] + 42;
+}
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] [vect]Use intermiediate integer type for float_expr/fix_trunc_expr when direct optab is not existed.

2023-06-20 Thread liuhongt via Gcc-patches

I noticed there has been some refactoring in vectorizable_conversion
for code_helper, so I've adjusted my patch accordingly.
Here's the patch I'm going to commit.

We already use an intermediate type in the WIDEN case, but not for NONE;
this patch extends that to NONE.

gcc/ChangeLog:

PR target/110018
* tree-vect-stmts.cc (vectorizable_conversion): Use
intermediate integer type for float_expr/fix_trunc_expr when
direct optab does not exist.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110018-1.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr110018-1.c | 94 ++
 gcc/tree-vect-stmts.cc | 66 ++-
 2 files changed, 158 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110018-1.c

diff --git a/gcc/testsuite/gcc.target/i386/pr110018-1.c 
b/gcc/testsuite/gcc.target/i386/pr110018-1.c
new file mode 100644
index 000..b1baffd7af1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110018-1.c
@@ -0,0 +1,94 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -O2 -mavx512dq" } */
+/* { dg-final { scan-assembler-times {(?n)vcvttp[dsh]2[dqw]} 5 } } */
+/* { dg-final { scan-assembler-times {(?n)vcvt[dqw]*2p[dsh]} 5 } } */
+
+void
+foo (double* __restrict a, char* b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+foo1 (float* __restrict a, char* b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+}
+
+void
+foo2 (_Float16* __restrict a, char* b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+  a[4] = b[4];
+  a[5] = b[5];
+  a[6] = b[6];
+  a[7] = b[7];
+}
+
+void
+foo3 (double* __restrict a, short* b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+foo4 (float* __restrict a, char* b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+}
+
+void
+foo5 (double* __restrict b, char* a)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+foo6 (float* __restrict b, char* a)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+}
+
+void
+foo7 (_Float16* __restrict b, char* a)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+  a[4] = b[4];
+  a[5] = b[5];
+  a[6] = b[6];
+  a[7] = b[7];
+}
+
+void
+foo8 (double* __restrict b, short* a)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+foo9 (float* __restrict b, char* a)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 056a0ecb2be..ae24f3e66e6 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -5041,7 +5041,7 @@ vectorizable_conversion (vec_info *vinfo,
   tree scalar_dest;
   tree op0, op1 = NULL_TREE;
   loop_vec_info loop_vinfo = dyn_cast  (vinfo);
-  tree_code tc1;
+  tree_code tc1, tc2;
   code_helper code, code1, code2;
   code_helper codecvt1 = ERROR_MARK, codecvt2 = ERROR_MARK;
   tree new_temp;
@@ -5249,6 +5249,57 @@ vectorizable_conversion (vec_info *vinfo,
code1 = tc1;
break;
   }
+
+  /* For conversions between float and smaller integer types try whether we
+can use intermediate signed integer types to support the
+conversion.  */
+  if ((code == FLOAT_EXPR
+  && GET_MODE_SIZE (lhs_mode) > GET_MODE_SIZE (rhs_mode))
+ || (code == FIX_TRUNC_EXPR
+ && GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode)))
+   {
+ bool float_expr_p = code == FLOAT_EXPR;
+ scalar_mode imode = float_expr_p ? rhs_mode : lhs_mode;
+ fltsz = GET_MODE_SIZE (float_expr_p ? lhs_mode : rhs_mode);
+ code1 = float_expr_p ? code : NOP_EXPR;
+ codecvt1 = float_expr_p ? NOP_EXPR : code;
+ FOR_EACH_2XWIDER_MODE (rhs_mode_iter, imode)
+   {
+ imode = rhs_mode_iter.require ();
+ if (GET_MODE_SIZE (imode) > fltsz)
+   break;
+
+ cvt_type
+   = build_nonstandard_integer_type (GET_MODE_BITSIZE (imode),
+ 0);
+ cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type,
+ slp_node);
+ /* This should only happened for SLP as long as loop vectorizer
+only supports same-sized vector.  */
+ if (cvt_type == NULL_TREE
+ || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nunits_in)
+ || !supportable_convert_operation ((tree_code) code1,
+vectype_out,
+cvt_type, )
+ || !supportable_convert_operation ((tree_code) codecvt1,
+cvt_type,
+vectype_in, ))
+   continue;
+
+ found_mode = true;
+ break;
+   }
+
+ if (found_mode)
+   {
+ multi_step_cvt++;
+

[PATCH 2/2] Refined 256/512-bit vpacksswb/vpackssdw patterns.

2023-06-15 Thread liuhongt via Gcc-patches

The packing in vpacksswb/vpackssdw is not a simple concat; it's an
interweave of src1 and src2 for every 128 bits (or every 64 bits of the
ss_truncate result).

i.e.

dst[192-255] = ss_truncate (src2[128-255])
dst[128-191] = ss_truncate (src1[128-255])
dst[64-127] = ss_truncate (src2[0-127])
dst[0-63] = ss_truncate (src1[0-127])

The patch refined those patterns with an extra vec_select for the
interweave.

The patch fixes the testcase below, which started failing after
g:921b841350c4fc298d09f6c5674663e0f4208610 added constant-folding for
SS_TRUNCATE:

FAIL: gcc.target/i386/avx2-vpackssdw-2.c execution test.

Bootstrapped and regtested on x86_64-pc-linux-gnu.
Ok for trunk?

gcc/ChangeLog:

PR target/110235
* config/i386/sse.md (_packsswb): Split
to below 3 new define_insns.
(sse2_packsswb): New define_insn.
(avx2_packsswb): Ditto.
(avx512bw_packsswb): Ditto.
(_packssdw): Split to below 3 new define_insns.
(sse2_packssdw): New define_insn.
(avx2_packssdw): Ditto.
(avx512bw_packssdw): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512bw-vpackssdw-3.c: New test.
* gcc.target/i386/avx512bw-vpacksswb-3.c: New test.
---
 gcc/config/i386/sse.md| 165 --
 .../gcc.target/i386/avx512bw-vpackssdw-3.c|  55 ++
 .../gcc.target/i386/avx512bw-vpacksswb-3.c|  50 ++
 3 files changed, 252 insertions(+), 18 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-vpackssdw-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-vpacksswb-3.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 83e3f534fd2..cc4e4620257 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17762,14 +17762,14 @@ (define_expand "vec_pack_sbool_trunc_qi"
   DONE;
 })
 
-(define_insn "_packsswb"
-  [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,")
-   (vec_concat:VI1_AVX512
- (ss_truncate:
-   (match_operand: 1 "register_operand" "0,"))
- (ss_truncate:
-   (match_operand: 2 "vector_operand" "xBm,m"]
-  "TARGET_SSE2 &&  && "
+(define_insn "sse2_packsswb"
+  [(set (match_operand:V16QI 0 "register_operand" "=x,Yw")
+   (vec_concat:V16QI
+ (ss_truncate:V8QI
+   (match_operand:V8HI 1 "register_operand" "0,Yw"))
+ (ss_truncate:V8QI
+   (match_operand:V8HI 2 "vector_operand" "xBm,Ywm"]
+  "TARGET_SSE2 &&  && "
   "@
packsswb\t{%2, %0|%0, %2}
vpacksswb\t{%2, %1, %0|%0, %1, %2}"
@@ -1,16 +1,93 @@ (define_insn "_packsswb"
(set_attr "type" "sselog")
(set_attr "prefix_data16" "1,*")
(set_attr "prefix" "orig,")
-   (set_attr "mode" "")])
+   (set_attr "mode" "TI")])
 
-(define_insn "_packssdw"
-  [(set (match_operand:VI2_AVX2 0 "register_operand" "=x,")
-   (vec_concat:VI2_AVX2
- (ss_truncate:
-   (match_operand: 1 "register_operand" "0,"))
- (ss_truncate:
-   (match_operand: 2 "vector_operand" "xBm,m"]
-  "TARGET_SSE2 &&  && "
+(define_insn "avx2_packsswb"
+  [(set (match_operand:V32QI 0 "register_operand" "=Yw")
+   (vec_select:V32QI
+ (vec_concat:V32QI
+   (ss_truncate:V16QI
+ (match_operand:V16HI 1 "register_operand" "Yw"))
+   (ss_truncate:V16QI
+ (match_operand:V16HI 2 "vector_operand" "Ywm")))
+ (parallel [(const_int 0)  (const_int 1)
+(const_int 2)  (const_int 3)
+(const_int 4)  (const_int 5)
+(const_int 6)  (const_int 7)
+(const_int 16) (const_int 17)
+(const_int 18) (const_int 19)
+(const_int 20) (const_int 21)
+(const_int 22) (const_int 23)
+(const_int 8)  (const_int 9)
+(const_int 10) (const_int 11)
+(const_int 12) (const_int 13)
+(const_int 14) (const_int 15)
+(const_int 24) (const_int 25)
+(const_int 26) (const_int 27)
+(const_int 28) (const_int 29)
+(const_int 30) (const_int 31)])))]
+  "TARGET_AVX2 &&  && "
+  "vpacksswb\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "type" "sselog")
+   (set_attr "prefix" "")
+   (set_attr "mode" "OI")])
+
+(define_insn "avx512bw_packsswb"
+  [(set (match_operand:V64QI 0 "register_operand" "=v")
+   (vec_select:V64QI
+ (vec_concat:V64QI
+   (ss_truncate:V32QI
+ (match_operand:V32HI 1 "register_operand" "v"))
+   (ss_truncate:V32QI
+ (match_operand:V32HI 2 "vector_operand" "vm")))
+ (parallel [(const_int 0)  (const_int 1)
+(const_int 2)  (const_int 3)
+(const_int 4)  (const_int 5)
+(const_int 6)  (const_int 7)
+(const_int 32) (const_int 33)
+(const_int 34)

[PATCH 1/2] Reimplement packuswb/packusdw with UNSPEC_US_TRUNCATE instead of original us_truncate.

2023-06-15 Thread liuhongt via Gcc-patches

packuswb/packusdw perform unsigned saturation on a signed source, but RTL
us_truncate means unsigned saturation of an unsigned source.
So for the value -1, packuswb produces 0, but us_truncate produces
255. The patch reimplements the related patterns and functions with
UNSPEC_US_TRUNCATE instead of us_truncate.

The patch fixes the testcases below, which started failing after
g:921b841350c4fc298d09f6c5674663e0f4208610 added constant-folding for
US_TRUNCATE:

FAIL: gcc.target/i386/avx-vpackuswb-1.c execution test
FAIL: gcc.target/i386/avx2-vpackusdw-2.c execution test
FAIL: gcc.target/i386/avx2-vpackuswb-2.c execution test
FAIL: gcc.target/i386/sse2-packuswb-1.c execution test

Bootstrapped and regtested on x86_64-pc-linux-gnu.
Ok for trunk?

gcc/ChangeLog:

PR target/110235
* config/i386/i386-expand.cc (ix86_split_mmx_pack): Use
UNSPEC_US_TRUNCATE instead of original us_truncate for
packusdw/packuswb.
* config/i386/mmx.md (mmx_packswb): Split into the
2 new patterns below.
(mmx_packsswb): New reload_completed define_insn_and_split.
(mmx_packuswb): Ditto.
(mmx_packusdw): Use UNSPEC_US_TRUNCATE instead of original
us_truncate.
(s_trunsuffix): Removed.
(any_s_truncate): Removed.
* config/i386/sse.md (_packuswb): Use
UNSPEC_US_TRUNCATE instead of original us_truncate.
(_packusdw): Ditto.
* config/i386/i386.md (UNSPEC_US_TRUNCATE): New unspec_c_enum.
---
 gcc/config/i386/i386-expand.cc | 20 
 gcc/config/i386/i386.md|  4 
 gcc/config/i386/mmx.md | 43 ++
 gcc/config/i386/sse.md | 20 
 4 files changed, 57 insertions(+), 30 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index def060ab562..35e2740f9b6 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -1019,6 +1019,7 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
   rtx op0 = operands[0];
   rtx op1 = operands[1];
   rtx op2 = operands[2];
+  rtx src;
 
   machine_mode dmode = GET_MODE (op0);
   machine_mode smode = GET_MODE (op1);
@@ -1042,11 +1043,20 @@ ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
   op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
   op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
 
-  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
-  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
-  rtx insn = gen_rtx_SET (dest, gen_rtx_VEC_CONCAT (sse_dmode,
-   op1, op2));
-  emit_insn (insn);
+  /* For packusdw/packuswb, it does unsigned saturation for
+ signed source which is different for rtl US_TRUNCATE.  */
+  if (code == US_TRUNCATE)
+src = gen_rtx_UNSPEC (sse_dmode,
+ gen_rtvec (2, op1, op2),
+ UNSPEC_US_TRUNCATE);
+  else
+{
+  op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
+  op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
+  src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2);
+}
+
+  emit_move_insn (dest, src);
 
   ix86_move_vector_high_sse_to_mmx (op0);
 }
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 0929115ed4d..070a84d8af9 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -129,6 +129,10 @@ (define_c_enum "unspec" [
   UNSPEC_RSQRT
   UNSPEC_PSADBW
 
+  ;; US_TRUNCATE this is different from rtl us_truncate,
+  ;; it does unsigned truncation for signed source.
+  UNSPEC_US_TRUNCATE
+
   ;; For AVX/AVX512F support
   UNSPEC_SCALEF
   UNSPEC_PCMP
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 6fbe3909c8b..315eb4193c4 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -3337,27 +3337,41 @@ (define_split
 ;;
 ;
 
-;; Used in signed and unsigned truncations with saturation.
-(define_code_iterator any_s_truncate [ss_truncate us_truncate])
-;; Instruction suffix for truncations with saturation.
-(define_code_attr s_trunsuffix [(ss_truncate "s") (us_truncate "u")])
-
-(define_insn_and_split "mmx_packswb"
+(define_insn_and_split "mmx_packsswb"
   [(set (match_operand:V8QI 0 "register_operand" "=y,x,Yw")
(vec_concat:V8QI
- (any_s_truncate:V4QI
+ (ss_truncate:V4QI
(match_operand:V4HI 1 "register_operand" "0,0,Yw"))
- (any_s_truncate:V4QI
+ (ss_truncate:V4QI
(match_operand:V4HI 2 "register_mmxmem_operand" "ym,x,Yw"]
   "TARGET_MMX || TARGET_MMX_WITH_SSE"
   "@
-   packswb\t{%2, %0|%0, %2}
+   packsswb\t{%2, %0|%0, %2}
+   #
+   #"
+  "&& reload_completed
+   && SSE_REGNO_P (REGNO (operands[0]))"
+  [(const_int 0)]
+  "ix86_split_mmx_pack (operands, SS_TRUNCATE); DONE;"
+  [(set_attr "mmx_isa" "native,sse_noavx,avx")
+   (set_attr "type" "mmxshft,sselog,sselog")
+   (set_attr "mode" "DI,TI,TI")])
+

[PATCH] [x86] Use x instead of v for alternative 2 (v, BH) in mov_internal.

2023-06-13 Thread liuhongt via Gcc-patches

Alternative 2 must use x rather than v, since there's no EVEX version of
vpcmpeqd ymm, ymm, ymm.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk and backport to GCC13.

gcc/ChangeLog:

PR target/110227
* config/i386/sse.md (mov_internal): Use x instead of v
for alternative 2 since there's no evex version for vpcmpeqd
ymm, ymm, ymm.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110227.c: New test.
---
 gcc/config/i386/sse.md   |  2 +-
 gcc/testsuite/gcc.target/i386/pr110227.c | 11 +++
 2 files changed, 12 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110227.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 9bec09d354a..370ea6418a6 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1324,7 +1324,7 @@ (define_expand "mov"
 
 (define_insn "mov_internal"
   [(set (match_operand:VMOVE 0 "nonimmediate_operand"
-"=v,v ,v,v ,m")
+"=v,v ,x,v ,m")
(match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand"
 " C,,BH,vm,v"))]
   "TARGET_SSE
diff --git a/gcc/testsuite/gcc.target/i386/pr110227.c 
b/gcc/testsuite/gcc.target/i386/pr110227.c
new file mode 100644
index 000..9b59f5b6e49
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110227.c
@@ -0,0 +1,11 @@
+/* { dg-do assemble { target { ! ia32 } } } */
+/* { dg-options " -O2 -mavx512vl" } */
+
+#include 
+
+void f()
+{
+  __m256i mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
+  register __m256i reg asm("xmm16") = mask;
+  asm(""::"v"(reg));
+}
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH 1/2] Fold _mm{, 256, 512}_abs_{epi8, epi16, epi32, epi64} into gimple ABSU_EXPR + VCE.

2023-06-06 Thread liuhongt via Gcc-patches

r14-1145 folds the intrinsics into gimple ABS_EXPR, which has UB for
TYPE_MIN, whereas PABSB stores an unsigned result into dst. The patch
uses ABSU_EXPR + VCE instead of ABS_EXPR.

Also don't fold _mm_abs_{pi8,pi16,pi32} w/o TARGET_64BIT since 64-bit
vector absm2 is guarded with TARGET_MMX_WITH_SSE.

gcc/ChangeLog:

PR target/110108
* config/i386/i386.cc (ix86_gimple_fold_builtin): Fold
_mm{,256,512}_abs_{epi8,epi16,epi32,epi64} into gimple
ABSU_EXPR + VCE, don't fold _mm_abs_{pi8,pi16,pi32} w/o
TARGET_64BIT.
* config/i386/i386-builtin.def: Replace CODE_FOR_nothing with
real codename for __builtin_ia32_pabs{b,w,d}.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110108.c: New test.
* gcc.target/i386/pr110108-3.c: New test.
---
 gcc/config/i386/i386-builtin.def   |  6 ++---
 gcc/config/i386/i386.cc| 27 --
 gcc/testsuite/gcc.target/i386/pr109900.c   |  2 +-
 gcc/testsuite/gcc.target/i386/pr110108-3.c | 22 ++
 gcc/testsuite/gcc.target/i386/pr110108.c   | 16 +
 5 files changed, 62 insertions(+), 11 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110108-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110108.c

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 383b68a9bb8..7ba5b6a9d11 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -900,11 +900,11 @@ BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_hsubv2df3, 
"__builtin_ia32_hsubpd"
 
 /* SSSE3 */
 BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_nothing, "__builtin_ia32_pabsb128", 
IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI)
-BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, 
"__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI)
+BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, 
(int) V8QI_FTYPE_V8QI)
 BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_nothing, "__builtin_ia32_pabsw128", 
IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI)
-BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, 
"__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI)
+BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, 
(int) V4HI_FTYPE_V4HI)
 BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_nothing, "__builtin_ia32_pabsd128", 
IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI)
-BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, 
"__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI)
+BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, 
(int) V2SI_FTYPE_V2SI)
 
 BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_ssse3_phaddwv8hi3, 
"__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) 
V8HI_FTYPE_V8HI_V8HI)
 BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, 
UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI)
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d4ff56ee8dd..da20c2c49de 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -18433,6 +18433,7 @@ bool
 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 {
   gimple *stmt = gsi_stmt (*gsi), *g;
+  gimple_seq stmts = NULL;
   tree fndecl = gimple_call_fndecl (stmt);
   gcc_checking_assert (fndecl && fndecl_built_in_p (fndecl, BUILT_IN_MD));
   int n_args = gimple_call_num_args (stmt);
@@ -18555,7 +18556,6 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
{
  loc = gimple_location (stmt);
  tree type = TREE_TYPE (arg2);
- gimple_seq stmts = NULL;
  if (VECTOR_FLOAT_TYPE_P (type))
{
  tree itype = GET_MODE_INNER (TYPE_MODE (type)) == E_SFmode
@@ -18610,7 +18610,6 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
  tree zero_vec = build_zero_cst (type);
  tree minus_one_vec = build_minus_one_cst (type);
  tree cmp_type = truth_type_for (type);
- gimple_seq stmts = NULL;
  tree cmp = gimple_build (, tcode, cmp_type, arg0, arg1);
  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
  g = gimple_build_assign (gimple_call_lhs (stmt),
@@ -18904,14 +18903,18 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
   break;
 
 case IX86_BUILTIN_PABSB:
+case IX86_BUILTIN_PABSW:
+case IX86_BUILTIN_PABSD:
+  /* 64-bit vector abs2 is only supported under TARGET_MMX_WITH_SSE. 
 */
+  if (!TARGET_64BIT)
+   break;
+  /* FALLTHRU.  */
 case IX86_BUILTIN_PABSB128:
 case IX86_BUILTIN_PABSB256:
 case IX86_BUILTIN_PABSB512:
-case IX86_BUILTIN_PABSW:

[PATCH v2] Explicitly view_convert_expr mask to signed type when folding pblendvb builtins.

2023-06-06 Thread liuhongt via Gcc-patches

> I think this is a better patch and will always be correct and still
> get folded at the gimple level (correctly):
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index d4ff56ee8dd..02bf5ba93a5 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -18561,8 +18561,10 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
>   tree itype = GET_MODE_INNER (TYPE_MODE (type)) == E_SFmode
> ? intSI_type_node : intDI_type_node;
>   type = get_same_sized_vectype (itype, type);
> - arg2 = gimple_build (, VIEW_CONVERT_EXPR, type, arg2);
> }
> + else
> +   type = signed_type_for (type);
> + arg2 = gimple_build (, VIEW_CONVERT_EXPR, type, arg2);
>   tree zero_vec = build_zero_cst (type);
>   tree cmp_type = truth_type_for (type);
>   tree cmp = gimple_build (, LT_EXPR, cmp_type, arg2, zero_vec);
>
>

Yes, thanks.

Here's the updated patch:

mask < 0 is always false for vector char under -funsigned-char,
but vpblendvb needs to check the most significant bit.
The patch explicitly VCEs the mask to vector signed char.


gcc/ChangeLog:

PR target/110108
* config/i386/i386.cc (ix86_gimple_fold_builtin): Explicitly
view_convert_expr mask to signed type when folding pblendvb
builtins.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110108-2.c: New test.
---
 gcc/config/i386/i386.cc|  4 +++-
 gcc/testsuite/gcc.target/i386/pr110108-2.c | 14 ++
 2 files changed, 17 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110108-2.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index da20c2c49de..4e594a9c88e 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -18561,8 +18561,10 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
  tree itype = GET_MODE_INNER (TYPE_MODE (type)) == E_SFmode
? intSI_type_node : intDI_type_node;
  type = get_same_sized_vectype (itype, type);
- arg2 = gimple_build (, VIEW_CONVERT_EXPR, type, arg2);
}
+ else
+   type = signed_type_for (type);
+ arg2 = gimple_build (, VIEW_CONVERT_EXPR, type, arg2);
  tree zero_vec = build_zero_cst (type);
  tree cmp_type = truth_type_for (type);
  tree cmp = gimple_build (, LT_EXPR, cmp_type, arg2, zero_vec);
diff --git a/gcc/testsuite/gcc.target/i386/pr110108-2.c 
b/gcc/testsuite/gcc.target/i386/pr110108-2.c
new file mode 100644
index 000..2d1d2fd4991
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110108-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2 -funsigned-char" } */
+/* { dg-final { scan-assembler-times "vpblendvb" 2 } } */
+
+#include 
+__m128i do_stuff_128(__m128i X0, __m128i X1, __m128i X2) {
+  __m128i Result = _mm_blendv_epi8(X0, X1, X2);
+  return Result;
+}
+
+__m256i do_stuff_256(__m256i X0, __m256i X1, __m256i X2) {
+  __m256i Result = _mm256_blendv_epi8(X0, X1, X2);
+  return Result;
+}
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Don't fold _mm{, 256}_blendv_epi8 into (mask < 0 ? src1 : src2) when -funsigned-char.

2023-06-05 Thread liuhongt via Gcc-patches

mask < 0 is always false under -funsigned-char, but vpblendvb
needs to check the most significant bit, so don't fold in that case.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk and backport to GCC12/GCC13 release branch?

gcc/ChangeLog:

PR target/110108
* config/i386/i386-builtin.def (BDESC): Replace
CODE_FOR_nothing with real code name for blendvb builtins.
* config/i386/i386.cc (ix86_gimple_fold_builtin): Don't fold
_mm{,256}_blendv_epi8 into (mask < 0 ? src1 : src2) when
-funsigned-char.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110108-2.c: New test.
---
 gcc/config/i386/i386-builtin.def   |  4 ++--
 gcc/config/i386/i386.cc|  7 +++
 gcc/testsuite/gcc.target/i386/pr110108-2.c | 14 ++
 3 files changed, 23 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110108-2.c

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 7ba5b6a9d11..b4c99ff62a2 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -944,7 +944,7 @@ BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_dppd, 
"__builtin_ia32_dppd", I
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_dpps, "__builtin_ia32_dpps", 
IX86_BUILTIN_DPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_insertps_v4sf, 
"__builtin_ia32_insertps128", IX86_BUILTIN_INSERTPS128, UNKNOWN, (int) 
V4SF_FTYPE_V4SF_V4SF_INT)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_mpsadbw, 
"__builtin_ia32_mpsadbw128", IX86_BUILTIN_MPSADBW128, UNKNOWN, (int) 
V16QI_FTYPE_V16QI_V16QI_INT)
-BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_nothing, 
"__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) 
V16QI_FTYPE_V16QI_V16QI_V16QI)
+BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_pblendvb, 
"__builtin_ia32_pblendvb128", IX86_BUILTIN_PBLENDVB128, UNKNOWN, (int) 
V16QI_FTYPE_V16QI_V16QI_V16QI)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_pblendw, 
"__builtin_ia32_pblendw128", IX86_BUILTIN_PBLENDW128, UNKNOWN, (int) 
V8HI_FTYPE_V8HI_V8HI_INT)
 
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_sign_extendv8qiv8hi2, 
"__builtin_ia32_pmovsxbw128", IX86_BUILTIN_PMOVSXBW128, UNKNOWN, (int) 
V8HI_FTYPE_V16QI)
@@ -1198,7 +1198,7 @@ BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_andv4di3, 
"__builtin_ia32_andsi256", IX
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_andnotv4di3, 
"__builtin_ia32_andnotsi256", IX86_BUILTIN_ANDNOT256I, UNKNOWN, (int) 
V4DI_FTYPE_V4DI_V4DI)
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_uavgv32qi3, 
"__builtin_ia32_pavgb256",  IX86_BUILTIN_PAVGB256, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI)
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_uavgv16hi3, 
"__builtin_ia32_pavgw256",  IX86_BUILTIN_PAVGW256, UNKNOWN, (int) 
V16HI_FTYPE_V16HI_V16HI)
-BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, 
"__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI_V32QI)
+BDESC (OPTION_MASK_ISA_AVX2, 0,  CODE_FOR_avx2_pblendvb, 
"__builtin_ia32_pblendvb256", IX86_BUILTIN_PBLENDVB256, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI_V32QI)
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_pblendw, 
"__builtin_ia32_pblendw256", IX86_BUILTIN_PBLENDVW256, UNKNOWN, (int) 
V16HI_FTYPE_V16HI_V16HI_INT)
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpeqb256", 
IX86_BUILTIN_PCMPEQB256, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI)
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pcmpeqw256", 
IX86_BUILTIN_PCMPEQW256, UNKNOWN, (int) V16HI_FTYPE_V16HI_V16HI)
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index b09b3c79e99..f8f6c26c8eb 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -18548,6 +18548,13 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
   /* FALLTHRU.  */
 case IX86_BUILTIN_PBLENDVB128:
 case IX86_BUILTIN_BLENDVPS:
+  /* Don't fold PBLENDVB when funsigned-char since mask < 0
+will always be false in the gimple level.  */
+  if ((fn_code == IX86_BUILTIN_PBLENDVB128
+  || fn_code == IX86_BUILTIN_PBLENDVB256)
+ && !flag_signed_char)
+   break;
+
   gcc_assert (n_args == 3);
   arg0 = gimple_call_arg (stmt, 0);
   arg1 = gimple_call_arg (stmt, 1);
diff --git a/gcc/testsuite/gcc.target/i386/pr110108-2.c 
b/gcc/testsuite/gcc.target/i386/pr110108-2.c
new file mode 100644
index 000..2d1d2fd4991
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110108-2.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2 -funsigned-char" } */
+/* { dg-final { scan-assembler-times "vpblendvb" 2 } } */
+
+#include 
+__m128i do_stuff_128(__m128i X0, __m128i X1, __m128i X2) {
+  __m128i Result = _mm_blendv_epi8(X0, X1, X2);
+  return Result;
+}
+
+__m256i do_stuff_256(__m256i X0, __m256i X1, __m256i X2) {
+  __m256i Result = _mm256_blendv_epi8(X0, X1,

[PATCH] Fold _mm{, 256, 512}_abs_{epi8, epi16, epi32, epi64} into gimple ABSU_EXPR + VCE.

2023-06-05 Thread liuhongt via Gcc-patches

r14-1145 folds the intrinsics into gimple ABS_EXPR, which has UB for
TYPE_MIN, whereas PABSB stores an unsigned result into dst. The patch
uses ABSU_EXPR + VCE instead of ABS_EXPR.

Also don't fold _mm_abs_{pi8,pi16,pi32} w/o TARGET_64BIT since 64-bit
vector absm2 is guarded with TARGET_MMX_WITH_SSE.

Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
Ok for trunk?


gcc/ChangeLog:

PR target/110108
* config/i386/i386.cc (ix86_gimple_fold_builtin): Fold
_mm{,256,512}_abs_{epi8,epi16,epi32,epi64} into gimple
ABSU_EXPR + VCE, don't fold _mm_abs_{pi8,pi16,pi32} w/o
TARGET_64BIT.
* config/i386/i386-builtin.def: Replace CODE_FOR_nothing with
real codename for __builtin_ia32_pabs{b,w,d}.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110108.c: New test.
---
 gcc/config/i386/i386-builtin.def |  6 ++--
 gcc/config/i386/i386.cc  | 44 
 gcc/testsuite/gcc.target/i386/pr110108.c | 16 +
 3 files changed, 56 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110108.c

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 383b68a9bb8..7ba5b6a9d11 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -900,11 +900,11 @@ BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_hsubv2df3, 
"__builtin_ia32_hsubpd"
 
 /* SSSE3 */
 BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_nothing, "__builtin_ia32_pabsb128", 
IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI)
-BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, 
"__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI)
+BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, 
(int) V8QI_FTYPE_V8QI)
 BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_nothing, "__builtin_ia32_pabsw128", 
IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI)
-BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, 
"__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI)
+BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, 
(int) V4HI_FTYPE_V4HI)
 BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_nothing, "__builtin_ia32_pabsd128", 
IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI)
-BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, 
"__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI)
+BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, 
(int) V2SI_FTYPE_V2SI)
 
 BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_ssse3_phaddwv8hi3, 
"__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) 
V8HI_FTYPE_V8HI_V8HI)
 BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, 
UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI)
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d4ff56ee8dd..b09b3c79e99 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -18433,6 +18433,7 @@ bool
 ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
 {
   gimple *stmt = gsi_stmt (*gsi), *g;
+  gimple_seq stmts = NULL;
   tree fndecl = gimple_call_fndecl (stmt);
   gcc_checking_assert (fndecl && fndecl_built_in_p (fndecl, BUILT_IN_MD));
   int n_args = gimple_call_num_args (stmt);
@@ -18555,7 +18556,6 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
{
  loc = gimple_location (stmt);
  tree type = TREE_TYPE (arg2);
- gimple_seq stmts = NULL;
  if (VECTOR_FLOAT_TYPE_P (type))
{
  tree itype = GET_MODE_INNER (TYPE_MODE (type)) == E_SFmode
@@ -18610,7 +18610,6 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
  tree zero_vec = build_zero_cst (type);
  tree minus_one_vec = build_minus_one_cst (type);
  tree cmp_type = truth_type_for (type);
- gimple_seq stmts = NULL;
  tree cmp = gimple_build (, tcode, cmp_type, arg0, arg1);
  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
  g = gimple_build_assign (gimple_call_lhs (stmt),
@@ -18904,14 +18903,18 @@ ix86_gimple_fold_builtin (gimple_stmt_iterator *gsi)
   break;
 
 case IX86_BUILTIN_PABSB:
+case IX86_BUILTIN_PABSW:
+case IX86_BUILTIN_PABSD:
+  /* 64-bit vector abs2 is only supported under TARGET_MMX_WITH_SSE. 
 */
+  if (!TARGET_64BIT)
+   break;
+  /* FALLTHRU.  */
 case IX86_BUILTIN_PABSB128:
 case IX86_BUILTIN_PABSB256:
 case IX86_BUILTIN_PABSB512:
-case IX86_BUILTIN_PABSW:
 case IX86_BUILTIN_PABSW128:
 case IX86_BUILTIN_PABSW256:
 case IX86_BUILTIN_PABSW512:
-case IX86_BUILTIN_PABSD:
 case IX86_BUILTIN_PABSD128:
 case

[PATCH] [x86] Add missing vec_pack/unpacks patterns for _Float16 <-> int/float conversion.

2023-06-04 Thread liuhongt via Gcc-patches

This patch only supports vec_pack/unpacks optabs for vector modes whose
length is >= 128 bits.
For 32/64-bit vectors, they're more often handled by the BB vectorizer with
truncmn2/extendmn2/fix{,uns}_truncmn2.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

* config/i386/sse.md (vec_pack_float_): New expander.
(vec_unpack_fix_trunc_lo_): Ditto.
(vec_unpack_fix_trunc_hi_): Ditto.
(vec_unpacks_lo_): Ditto.
(vec_unpacks_hi_): Ditto.
(sse_movlhps_): New define_insn.
(ssse3_palignr_perm): Extend to V_128H.
(V_128H): New mode iterator.
(ssepackPHmode): New mode attribute.
(vunpck_extract_mode): Ditto.
(vpckfloat_concat_mode): Extend to VxSI/VxSF for _Float16.
(vpckfloat_temp_mode): Ditto.
(vpckfloat_op_mode): Ditto.
(vunpckfixt_mode): Extend to VxHF.
(vunpckfixt_model): Ditto.
(vunpckfixt_extract_mode): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/vec_pack_fp16-1.c: New test.
* gcc.target/i386/vec_pack_fp16-2.c: New test.
* gcc.target/i386/vec_pack_fp16-3.c: New test.
---
 gcc/config/i386/sse.md| 216 +-
 .../gcc.target/i386/vec_pack_fp16-1.c |  34 +++
 .../gcc.target/i386/vec_pack_fp16-2.c |   9 +
 .../gcc.target/i386/vec_pack_fp16-3.c |   8 +
 4 files changed, 258 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/vec_pack_fp16-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vec_pack_fp16-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vec_pack_fp16-3.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a92f50e96b5..1eb2dd077ff 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -291,6 +291,9 @@ (define_mode_iterator V
 (define_mode_iterator V_128
   [V16QI V8HI V4SI V2DI V4SF (V2DF "TARGET_SSE2")])
 
+(define_mode_iterator V_128H
+  [V16QI V8HI V8HF V8BF V4SI V2DI V4SF (V2DF "TARGET_SSE2")])
+
 ;; All 256bit vector modes
 (define_mode_iterator V_256
   [V32QI V16HI V8SI V4DI V8SF V4DF])
@@ -1076,6 +1079,12 @@ (define_mode_attr ssePHmodelower
(V8DI "v8hf") (V4DI "v4hf") (V2DI "v2hf")
(V8DF "v8hf") (V16SF "v16hf") (V8SF "v8hf")])
 
+
+;; Mapping of vector modes to packed vector hf modes of same sized.
+(define_mode_attr ssepackPHmode
+  [(V16SI "V32HF") (V8SI "V16HF") (V4SI "V8HF")
+   (V16SF "V32HF") (V8SF "V16HF") (V4SF "V8HF")])
+
 ;; Mapping of vector modes to packed single mode of the same size
 (define_mode_attr ssePSmode
   [(V16SI "V16SF") (V8DF "V16SF")
@@ -6918,6 +6927,61 @@ (define_mode_attr qq2phsuff
(V16SF "") (V8SF "{y}") (V4SF "{x}")
(V8DF "{z}") (V4DF "{y}") (V2DF "{x}")])
 
+(define_mode_attr vunpck_extract_mode
+  [(V32HF "v32hf") (V16HF "v16hf") (V8HF "v16hf")])
+
+(define_expand "vec_unpacks_lo_"
+  [(match_operand: 0 "register_operand")
+   (match_operand:VF_AVX512FP16VL 1 "register_operand")]
+  "TARGET_AVX512FP16"
+{
+  rtx tem = operands[1];
+  rtx (*gen) (rtx, rtx);
+  if (mode != V8HFmode)
+{
+  tem = gen_reg_rtx (mode);
+  emit_insn (gen_vec_extract_lo_ (tem,
+  operands[1]));
+  gen = gen_extend2;
+}
+  else
+gen = gen_avx512fp16_float_extend_phv4sf2;
+
+  emit_insn (gen (operands[0], tem));
+  DONE;
+})
+
+(define_expand "vec_unpacks_hi_"
+  [(match_operand: 0 "register_operand")
+   (match_operand:VF_AVX512FP16VL 1 "register_operand")]
+  "TARGET_AVX512FP16"
+{
+  rtx tem = operands[1];
+  rtx (*gen) (rtx, rtx);
+  if (mode != V8HFmode)
+{
+  tem = gen_reg_rtx (mode);
+  emit_insn (gen_vec_extract_hi_ (tem,
+  operands[1]));
+  gen = gen_extend2;
+}
+  else
+{
+  tem = gen_reg_rtx (V8HFmode);
+  rtvec tmp = rtvec_alloc (8);
+  for (int i = 0; i != 8; i++)
+   RTVEC_ELT (tmp, i) = GEN_INT((i+4)%8);
+
+  rtx selector = gen_rtx_PARALLEL (VOIDmode, tmp);
+  emit_move_insn (tem,
+gen_rtx_VEC_SELECT (V8HFmode, operands[1], selector));
+  gen = gen_avx512fp16_float_extend_phv4sf2;
+}
+
+  emit_insn (gen (operands[0], tem));
+  DONE;
+})
+
 (define_insn 
"avx512fp16_vcvtph2_"
   [(set (match_operand:VI248_AVX512VL 0 "register_operand" "=v")
 (unspec:VI248_AVX512VL
@@ -8314,11 +8378,17 @@ (define_expand "floatv2div2sf2"
 })
 
 (define_mode_attr vpckfloat_concat_mode
-  [(V8DI "v16sf") (V4DI "v8sf") (V2DI "v8sf")])
+  [(V8DI "v16sf") (V4DI "v8sf") (V2DI "v8sf")
+   (V16SI "v32hf") (V8SI "v16hf") (V4SI "v16hf")
+   (V16SF "v32hf") (V8SF "v16hf") (V4SF "v16hf")])
 (define_mode_attr vpckfloat_temp_mode
-  [(V8DI "V8SF") (V4DI "V4SF") (V2DI "V4SF")])
+  [(V8DI "V8SF") (V4DI "V4SF") (V2DI "V4SF")
+   (V16SI "V16HF") (V8SI "V8HF") (V4SI "V8HF")
+   (V16SF "V16HF") (V8SF "V8HF") (V4SF "V8HF")])
 (define_mode_attr vpckfloat_op_mode
-  [(V8DI

[PATCH] [vect]Use intermiediate integer type for float_expr/fix_trunc_expr when direct optab is not existed.

2023-06-01 Thread liuhongt via Gcc-patches

We already use an intermediate type in the WIDEN case, but not for NONE;
this patch extends that to the NONE case.

I didn't do that in pattern recog since we need to know whether the
stmt belongs to any slp_node to decide the vectype; the related optabs
are checked according to vectype_in and vectype_out. For the non-SLP case,
vec_pack/unpack are always used when the lhs has a different size from
the rhs; for the SLP case, sometimes vec_pack/unpack is used and sometimes
direct conversion is used.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/110018
* tree-vect-stmts.cc (vectorizable_conversion): Use
intermediate integer type for float_expr/fix_trunc_expr when
a direct optab does not exist.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110018-1.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr110018-1.c | 94 ++
 gcc/tree-vect-stmts.cc | 56 -
 2 files changed, 149 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110018-1.c

diff --git a/gcc/testsuite/gcc.target/i386/pr110018-1.c 
b/gcc/testsuite/gcc.target/i386/pr110018-1.c
new file mode 100644
index 000..b1baffd7af1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110018-1.c
@@ -0,0 +1,94 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512fp16 -mavx512vl -O2 -mavx512dq" } */
+/* { dg-final { scan-assembler-times {(?n)vcvttp[dsh]2[dqw]} 5 } } */
+/* { dg-final { scan-assembler-times {(?n)vcvt[dqw]*2p[dsh]} 5 } } */
+
+void
+foo (double* __restrict a, char* b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+foo1 (float* __restrict a, char* b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+}
+
+void
+foo2 (_Float16* __restrict a, char* b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+  a[4] = b[4];
+  a[5] = b[5];
+  a[6] = b[6];
+  a[7] = b[7];
+}
+
+void
+foo3 (double* __restrict a, short* b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+foo4 (float* __restrict a, char* b)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+}
+
+void
+foo5 (double* __restrict b, char* a)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+foo6 (float* __restrict b, char* a)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+}
+
+void
+foo7 (_Float16* __restrict b, char* a)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+  a[4] = b[4];
+  a[5] = b[5];
+  a[6] = b[6];
+  a[7] = b[7];
+}
+
+void
+foo8 (double* __restrict b, short* a)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+}
+
+void
+foo9 (float* __restrict b, char* a)
+{
+  a[0] = b[0];
+  a[1] = b[1];
+  a[2] = b[2];
+  a[3] = b[3];
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index bd3b07a3aa1..1118c89686d 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -5162,6 +5162,49 @@ vectorizable_conversion (vec_info *vinfo,
return false;
   if (supportable_convert_operation (code, vectype_out, vectype_in, 
))
break;
+  if ((code == FLOAT_EXPR
+  && GET_MODE_SIZE (lhs_mode) > GET_MODE_SIZE (rhs_mode))
+ || (code == FIX_TRUNC_EXPR
+ && GET_MODE_SIZE (rhs_mode) > GET_MODE_SIZE (lhs_mode)))
+   {
+ bool float_expr_p = code == FLOAT_EXPR;
+ scalar_mode imode = float_expr_p ? rhs_mode : lhs_mode;
+ fltsz = GET_MODE_SIZE (float_expr_p ? lhs_mode : rhs_mode);
+ code1 = float_expr_p ? code : NOP_EXPR;
+ codecvt1 = float_expr_p ? NOP_EXPR : code;
+ FOR_EACH_2XWIDER_MODE (rhs_mode_iter, imode)
+   {
+ imode = rhs_mode_iter.require ();
+ if (GET_MODE_SIZE (imode) > fltsz)
+   break;
+
+ cvt_type
+   = build_nonstandard_integer_type (GET_MODE_BITSIZE (imode),
+ 0);
+ cvt_type = get_vectype_for_scalar_type (vinfo, cvt_type,
+ slp_node);
+ /* This should only happened for SLP as long as loop vectorizer
+only supports same-sized vector.  */
+ if (cvt_type == NULL_TREE
+ || maybe_ne (TYPE_VECTOR_SUBPARTS (cvt_type), nunits_in)
+ || !supportable_convert_operation (code1, vectype_out,
+cvt_type, )
+ || !supportable_convert_operation (codecvt1, cvt_type,
+vectype_in, ))
+   continue;
+
+ found_mode = true;
+ break;
+   }
+
+ if (found_mode)
+   {
+ multi_step_cvt++;
+ interm_types.safe_push (cvt_type);
+ cvt_type = NULL_TREE;
+ break;
+   }
+   }
   /* FALLTHRU */
 unsupported:
   if (dump_enabled_p ())
@@ -5381,7 +5424,18 @@ vectorizable_conversion (vec_info *vinfo,
{

[PATCH] i386: Add missing vector truncate patterns [PR92658].

2023-06-01 Thread liuhongt via Gcc-patches

Add missing insn patterns for v2si -> v2hi/v2qi and v2hi-> v2qi vector
truncate.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/92658
* config/i386/mmx.md (truncv2hiv2qi2): New define_insn.
(truncv2si2): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr92658-avx512bw-trunc-2.c: New test.
---
 gcc/config/i386/mmx.md| 21 +++
 .../i386/pr92658-avx512bw-trunc-2.c   | 27 +++
 2 files changed, 48 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92658-avx512bw-trunc-2.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index dbcb850ffde..bb45098f797 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -3667,6 +3667,27 @@ (define_expand "v2qiv2hi2"
   DONE;
 })
 
+(define_insn "truncv2hiv2qi2"
+  [(set (match_operand:V2QI 0 "register_operand" "=v")
+   (truncate:V2QI
+ (match_operand:V2HI 1 "register_operand" "v")))]
+  "TARGET_AVX512VL && TARGET_AVX512BW"
+  "vpmovwb\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "TI")])
+
+(define_mode_iterator V2QI_V2HI [V2QI V2HI])
+(define_insn "truncv2si2"
+  [(set (match_operand:V2QI_V2HI 0 "register_operand" "=v")
+   (truncate:V2QI_V2HI
+ (match_operand:V2SI 1 "register_operand" "v")))]
+  "TARGET_AVX512VL && TARGET_MMX_WITH_SSE"
+  "vpmovd\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "evex")
+   (set_attr "mode" "TI")])
+
 ;; Pack/unpack vector modes
 (define_mode_attr mmxpackmode
   [(V4HI "V8QI") (V2SI "V4HI")])
diff --git a/gcc/testsuite/gcc.target/i386/pr92658-avx512bw-trunc-2.c 
b/gcc/testsuite/gcc.target/i386/pr92658-avx512bw-trunc-2.c
new file mode 100644
index 000..2f5b7dc5668
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92658-avx512bw-trunc-2.c
@@ -0,0 +1,27 @@
+/* PR target/92658 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl" } */
+/* { dg-final { scan-assembler-times "vpmovwb" 1 } } */
+/* { dg-final { scan-assembler-times "vpmovdb" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 1 { target { ! ia32 } } } } */
+
+void
+foo (int* __restrict a, char* b)
+{
+b[0] = a[0];
+b[1] = a[1];
+}
+
+void
+foo2 (short* __restrict a, char* b)
+{
+b[0] = a[0];
+b[1] = a[1];
+}
+
+void
+foo3 (int* __restrict a, short* b)
+{
+b[0] = a[0];
+b[1] = a[1];
+}
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Don't try bswap + rotate when TYPE_PRECISION(n->type) > n->range.

2023-06-01 Thread liuhongt via Gcc-patches

For the testcase in the PR, we have

  br64 = br;
  br64 = ((br64 << 16) & 0x00ffull) | (br64 & 0xff00ull);

  n->n: 0x300200.
  n->range: 32.
  n->type: uint64.

The original code assumes n->range is the same as TYPE_PRECISION (n->type),
and tries to rotate the mask from 0x30200 -> 0x20300, which is
incorrect. The patch fixes this bug by not trying bswap + rotate when
TYPE_PRECISION (n->type) is not equal to n->range.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR tree-optimization/110067
* gimple-ssa-store-merging.cc (find_bswap_or_nop): Don't try
bswap + rotate when TYPE_PRECISION(n->type) > n->range.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr110067.c: New test.
---
 gcc/gimple-ssa-store-merging.cc  |  3 +
 gcc/testsuite/gcc.target/i386/pr110067.c | 77 
 2 files changed, 80 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr110067.c

diff --git a/gcc/gimple-ssa-store-merging.cc b/gcc/gimple-ssa-store-merging.cc
index 9cb574fa315..401496a9231 100644
--- a/gcc/gimple-ssa-store-merging.cc
+++ b/gcc/gimple-ssa-store-merging.cc
@@ -1029,6 +1029,9 @@ find_bswap_or_nop (gimple *stmt, struct symbolic_number 
*n, bool *bswap,
   /* TODO, handle cast64_to_32 and big/litte_endian memory
 source when rsize < range.  */
   if (n->range == orig_range
+ /* There're case like 0x30200 for uint32->uint64 cast,
+Don't hanlde this.  */
+ && n->range == TYPE_PRECISION (n->type)
  && ((orig_range == 32
   && optab_handler (rotl_optab, SImode) != CODE_FOR_nothing)
  || (orig_range == 64
diff --git a/gcc/testsuite/gcc.target/i386/pr110067.c 
b/gcc/testsuite/gcc.target/i386/pr110067.c
new file mode 100644
index 000..c4208811628
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr110067.c
@@ -0,0 +1,77 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fno-strict-aliasing" } */
+
+#include 
+#define force_inline __inline__ __attribute__ ((__always_inline__))
+
+__attribute__((noipa))
+static void
+fetch_pixel_no_alpha_32_bug (void *out)
+{
+  uint32_t *ret = out;
+  *ret = 0xff499baf;
+}
+
+static force_inline uint32_t
+bilinear_interpolation_local (uint32_t tl, uint32_t tr,
+ uint32_t bl, uint32_t br,
+ int distx, int disty)
+{
+  uint64_t distxy, distxiy, distixy, distixiy;
+  uint64_t tl64, tr64, bl64, br64;
+  uint64_t f, r;
+
+  distx <<= 1;
+  disty <<= 1;
+
+  distxy = distx * disty;
+  distxiy = distx * (256 - disty);
+  distixy = (256 - distx) * disty;
+  distixiy = (256 - distx) * (256 - disty);
+
+  /* Alpha and Blue */
+  tl64 = tl & 0xffff;
+  tr64 = tr & 0xffff;
+  bl64 = bl & 0xffff;
+  br64 = br & 0xffff;
+
+  f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+  r = f & 0xffffull;
+
+  /* Red and Green */
+  tl64 = tl;
+  tl64 = ((tl64 << 16) & 0x00ffull) | (tl64 & 0xff00ull);
+
+  tr64 = tr;
+  tr64 = ((tr64 << 16) & 0x00ffull) | (tr64 & 0xff00ull);
+
+  bl64 = bl;
+  bl64 = ((bl64 << 16) & 0x00ffull) | (bl64 & 0xff00ull);
+
+  br64 = br;
+  br64 = ((br64 << 16) & 0x00ffull) | (br64 & 0xff00ull);
+
+  f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
+  r |= ((f >> 16) & 0x00ffull) | (f & 0xff00ull);
+
+  return (uint32_t)(r >> 16);
+}
+
+__attribute__((noipa))
+static void
+bits_image_fetch_pixel_bilinear_32_bug (void *out)
+{
+  uint32_t br;
+  uint32_t *ret = out;
+
+  fetch_pixel_no_alpha_32_bug ();
+  *ret = bilinear_interpolation_local (0, 0, 0, br, 0x41, 0x42);
+}
+
+int main() {
+  uint32_t r;
+  bits_image_fetch_pixel_bilinear_32_bug ();
+  if (r != 0x4213282d)
+__builtin_abort ();
+  return 0;
+}
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Disable avoid_false_dep_for_bmi for atom and icelake(and later) core processors.

2023-05-25 Thread liuhongt via Gcc-patches

lzcnt/tzcnt have been fixed since Skylake, and popcnt has been fixed since
Icelake. At least for Icelake and later Intel Core processors, the
errata tune is not needed. The tune isn't needed for ATOM either.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.


gcc/ChangeLog:

* config/i386/x86-tune.def (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI):
Remove ATOM and ICELAKE (and later) Core processors.
---
 gcc/config/i386/x86-tune.def | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 9d603cc84e4..e1c72cddf1f 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -335,7 +335,8 @@ DEF_TUNE (X86_TUNE_USE_BT, "use_bt",
 /* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency
for bit-manipulation instructions.  */
 DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi",
- m_SANDYBRIDGE | m_CORE_AVX2 | m_TREMONT | m_ALDERLAKE | m_CORE_ATOM
+ m_SANDYBRIDGE | m_HASWELL | m_SKYLAKE | m_SKYLAKE_AVX512
+ | m_CANNONLAKE | m_CASCADELAKE | m_COOPERLAKE
  | m_LUJIAZUI | m_GENERIC)
 
 /* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] [x86] Split notl + pbraodcast + pand to pbroadcast + pandn more modes.

2023-05-25 Thread liuhongt via Gcc-patches

r12-5595-gc39d77f252e895306ef88c1efb3eff04e4232554 added 2 splitters to
transform notl + pbroadcast + pand to pbroadcast + pandn for
VI124_AVX2, which leaves out all DI-element-size modes as
well as all 512-bit ones.
This patch extends the splitter to VI_AVX2, which will handle DImode for
AVX2, and V64QImode, V32HImode, V16SImode, V8DImode for AVX512.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

PR target/100711
* config/i386/sse.md (*andnot3): Extend below splitter
to VI_AVX2 to cover more modes.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr100711-2.c: Add v4di/v2di testcases.
* gcc.target/i386/pr100711-3.c: New test.
---
 gcc/config/i386/sse.md | 12 +++
 gcc/testsuite/gcc.target/i386/pr100711-2.c | 14 +++-
 gcc/testsuite/gcc.target/i386/pr100711-3.c | 40 ++
 3 files changed, 59 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100711-3.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 26dd0b1aa10..97f883d8083 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17116,17 +17116,17 @@ (define_split
 
 ;; PR target/100711: Split notl; vpbroadcastd; vpand as vpbroadcastd; vpandn
 (define_split
-  [(set (match_operand:VI124_AVX2 0 "register_operand")
-   (and:VI124_AVX2
- (vec_duplicate:VI124_AVX2
+  [(set (match_operand:VI_AVX2 0 "register_operand")
+   (and:VI_AVX2
+ (vec_duplicate:VI_AVX2
(not:
  (match_operand: 1 "register_operand")))
- (match_operand:VI124_AVX2 2 "vector_operand")))]
+ (match_operand:VI_AVX2 2 "vector_operand")))]
   "TARGET_AVX2"
   [(set (match_dup 3)
-   (vec_duplicate:VI124_AVX2 (match_dup 1)))
+   (vec_duplicate:VI_AVX2 (match_dup 1)))
(set (match_dup 0)
-   (and:VI124_AVX2 (not:VI124_AVX2 (match_dup 3))
+   (and:VI_AVX2 (not:VI_AVX2 (match_dup 3))
(match_dup 2)))]
   "operands[3] = gen_reg_rtx (mode);")
 
diff --git a/gcc/testsuite/gcc.target/i386/pr100711-2.c 
b/gcc/testsuite/gcc.target/i386/pr100711-2.c
index ccaf1688e19..f75914fb7fc 100644
--- a/gcc/testsuite/gcc.target/i386/pr100711-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr100711-2.c
@@ -4,10 +4,12 @@
 typedef char v16qi __attribute__ ((vector_size (16)));
 typedef short v8hi __attribute__ ((vector_size (16)));
 typedef int v4si __attribute__ ((vector_size (16)));
+typedef long long v2di __attribute__((vector_size (16)));
 
 typedef char v32qi __attribute__ ((vector_size (32)));
 typedef short v16hi __attribute__ ((vector_size (32)));
 typedef int v8si __attribute__ ((vector_size (32)));
+typedef long long v4di __attribute__((vector_size (32)));
 
 v16qi foo_v16qi (char a, v16qi b)
 {
@@ -25,6 +27,11 @@ v4si foo_v4si (int a, v4si b)
 return (__extension__ (v4si) {~a, ~a, ~a, ~a}) & b;
 }
 
+v2di foo_v2di (long long a, v2di b)
+{
+return (__extension__ (v2di) {~a, ~a}) & b;
+}
+
 v32qi foo_v32qi (char a, v32qi b)
 {
 return (__extension__ (v32qi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
@@ -44,4 +51,9 @@ v8si foo_v8si (int a, v8si b)
 return (__extension__ (v8si) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,}) & b;
 }
 
-/* { dg-final { scan-assembler-times "vpandn" 6 } } */
+v4di foo_v4di (long long a, v4di b)
+{
+return (__extension__ (v4di) {~a, ~a, ~a, ~a}) & b;
+}
+
+/* { dg-final { scan-assembler-times "vpandn" 8 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100711-3.c 
b/gcc/testsuite/gcc.target/i386/pr100711-3.c
new file mode 100644
index 000..e90f2a48d8d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100711-3.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw" } */
+
+typedef char v64qi __attribute__ ((vector_size (64)));
+typedef short v32hi __attribute__ ((vector_size (64)));
+typedef int v16si __attribute__ ((vector_size (64)));
+typedef long long v8di __attribute__((vector_size (64)));
+
+v64qi foo_v64qi (char a, v64qi b)
+{
+return (__extension__ (v64qi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) & b;
+}
+
+v32hi foo_v32hi (short a, v32hi b)
+{
+return (__extension__ (v32hi) {~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+   ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a,
+  ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a}) & b;
+}
+
+v16si foo_v16si (int a, v16si b)
+{
+return (__extension__ (v16si) {~a, ~a, ~a,

[PATCH] Fold _mm{, 256, 512}_abs_{epi8, epi16, epi32, epi64} into gimple ABS_EXPR.

2023-05-22 Thread liuhongt via Gcc-patches

Also for 64-bit vector abs intrinsics _mm_abs_{pi8,pi16,pi32}.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/109900
* config/i386/i386.cc (ix86_gimple_fold_builtin): Fold
_mm{,256,512}_abs_{epi8,epi16,epi32,epi64} and
_mm_abs_{pi8,pi16,pi32} into gimple ABS_EXPR.
(ix86_masked_all_ones): Handle 64-bit mask.
* config/i386/i386-builtin.def: Replace icode of related
non-mask simd abs builtins with CODE_FOR_nothing.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr109900.c: New test.
---
 gcc/config/i386/i386-builtin.def | 18 ++---
 gcc/config/i386/i386.cc  | 86 +++--
 gcc/testsuite/gcc.target/i386/pr109900.c | 95 
 3 files changed, 166 insertions(+), 33 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109900.c

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index f7b10a6ab1e..c91e3809c75 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -899,12 +899,12 @@ BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_hsubv4sf3, 
"__builtin_ia32_hsubps"
 BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_hsubv2df3, 
"__builtin_ia32_hsubpd", IX86_BUILTIN_HSUBPD, UNKNOWN, (int) 
V2DF_FTYPE_V2DF_V2DF)
 
 /* SSSE3 */
-BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_absv16qi2, 
"__builtin_ia32_pabsb128", IX86_BUILTIN_PABSB128, UNKNOWN, (int) 
V16QI_FTYPE_V16QI)
-BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_absv8qi2, "__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, 
(int) V8QI_FTYPE_V8QI)
-BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_absv8hi2, "__builtin_ia32_pabsw128", 
IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI)
-BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_absv4hi2, "__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, 
(int) V4HI_FTYPE_V4HI)
-BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_absv4si2, "__builtin_ia32_pabsd128", 
IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI)
-BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_absv2si2, "__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, 
(int) V2SI_FTYPE_V2SI)
+BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_nothing, "__builtin_ia32_pabsb128", 
IX86_BUILTIN_PABSB128, UNKNOWN, (int) V16QI_FTYPE_V16QI)
+BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, 
"__builtin_ia32_pabsb", IX86_BUILTIN_PABSB, UNKNOWN, (int) V8QI_FTYPE_V8QI)
+BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_nothing, "__builtin_ia32_pabsw128", 
IX86_BUILTIN_PABSW128, UNKNOWN, (int) V8HI_FTYPE_V8HI)
+BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, 
"__builtin_ia32_pabsw", IX86_BUILTIN_PABSW, UNKNOWN, (int) V4HI_FTYPE_V4HI)
+BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_nothing, "__builtin_ia32_pabsd128", 
IX86_BUILTIN_PABSD128, UNKNOWN, (int) V4SI_FTYPE_V4SI)
+BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_nothing, 
"__builtin_ia32_pabsd", IX86_BUILTIN_PABSD, UNKNOWN, (int) V2SI_FTYPE_V2SI)
 
 BDESC (OPTION_MASK_ISA_SSSE3, 0, CODE_FOR_ssse3_phaddwv8hi3, 
"__builtin_ia32_phaddw128", IX86_BUILTIN_PHADDW128, UNKNOWN, (int) 
V8HI_FTYPE_V8HI_V8HI)
 BDESC (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_MMX, 0, 
CODE_FOR_ssse3_phaddwv4hi3, "__builtin_ia32_phaddw", IX86_BUILTIN_PHADDW, 
UNKNOWN, (int) V4HI_FTYPE_V4HI_V4HI)
@@ -1178,9 +1178,9 @@ BDESC (OPTION_MASK_ISA_AVX, 0, 
CODE_FOR_vec_pack_sfix_v4df, "__builtin_ia32_vec_
 
 /* AVX2 */
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_mpsadbw, 
"__builtin_ia32_mpsadbw256", IX86_BUILTIN_MPSADBW256, UNKNOWN, (int) 
V32QI_FTYPE_V32QI_V32QI_INT)
-BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_absv32qi2, "__builtin_ia32_pabsb256", 
IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI)
-BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_absv16hi2, "__builtin_ia32_pabsw256", 
IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI)
-BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_absv8si2, "__builtin_ia32_pabsd256", 
IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI)
+BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pabsb256", 
IX86_BUILTIN_PABSB256, UNKNOWN, (int) V32QI_FTYPE_V32QI)
+BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pabsw256", 
IX86_BUILTIN_PABSW256, UNKNOWN, (int) V16HI_FTYPE_V16HI)
+BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_nothing, "__builtin_ia32_pabsd256", 
IX86_BUILTIN_PABSD256, UNKNOWN, (int) V8SI_FTYPE_V8SI)
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_packssdw, 
"__builtin_ia32_packssdw256",  IX86_BUILTIN_PACKSSDW256, UNKNOWN, (int) 
V16HI_FTYPE_V8SI_V8SI)
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_packsswb, 
"__builtin_ia32_packsswb256",  IX86_BUILTIN_PACKSSWB256, UNKNOWN, (int) 
V32QI_FTYPE_V16HI_V16HI)
 BDESC (OPTION_MASK_ISA_AVX2, 0, CODE_FOR_avx2_packusdw, 
"__builtin_ia32_packusdw256",  IX86_BUILTIN_PACKUSDW256, UNKNOWN, (int)

[PATCH] Only use NO_REGS in cost calculation when !hard_regno_mode_ok for GENERAL_REGS and mode.

2023-05-17 Thread liuhongt via Gcc-patches

r14-172-g0368d169492017 replaces GENERAL_REGS with NO_REGS in cost
calculation when the preferred register classes are not known yet.
It regressed powerpc PR109610 and PR109858; it looks too aggressive to use
NO_REGS when the mode can be allocated with GENERAL_REGS.
The patch takes a step back: it still uses GENERAL_REGS when
hard_regno_mode_ok holds for the mode and GENERAL_REGS, and otherwise
uses NO_REGS.
Kewen confirmed the patch fixed PR109858, and I verified it also fixed
PR109610.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
No big performance impact for SPEC2017 on icelake server.
Ok for trunk?

gcc/ChangeLog:

* ira-costs.cc (scan_one_insn): Only use NO_REGS in cost
calculation when !hard_regno_mode_ok for GENERAL_REGS and
mode, otherwise still use GENERAL_REGS.
---
 gcc/ira-costs.cc | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
index d2a801ab9b0..ae8304ff938 100644
--- a/gcc/ira-costs.cc
+++ b/gcc/ira-costs.cc
@@ -1572,12 +1572,16 @@ scan_one_insn (rtx_insn *insn)
   && (! ira_use_lra_p || ! pic_offset_table_rtx
  || ! contains_symbol_ref_p (XEXP (note, 0
 {
-  /* Costs for NO_REGS are used in cost calculation on the
-1st pass when the preferred register classes are not
-known yet.  In this case we take the best scenario.  */
-  enum reg_class cl = NO_REGS;
+  enum reg_class cl = GENERAL_REGS;
   rtx reg = SET_DEST (set);
   int num = COST_INDEX (REGNO (reg));
+  /* Costs for NO_REGS are used in cost calculation on the
+1st pass when the preferred register classes are not
+known yet.  In this case we take the best scenario when
+mode can't be put into GENERAL_REGS.  */
+  if (!targetm.hard_regno_mode_ok (ira_class_hard_regs[cl][0],
+  GET_MODE (reg)))
+   cl = NO_REGS;
 
   COSTS (costs, num)->mem_cost
-= ira_memory_move_cost[GET_MODE (reg)][cl][1] * frequency;
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH V2] Provide -fcf-protection=branch,return.

2023-05-13 Thread liuhongt via Gcc-patches

> I think this could be simplified if you use either EnumSet or
> EnumBitSet instead in common.opt for `-fcf-protection=`.

Use EnumSet instead of EnumBitSet since CF_FULL is not a power of 2.
Classifying the sets is a bit tricky: cf_branch and cf_return
should be in different sets, but they both "conflict" with cf_full and
cf_none, and the current EnumSet doesn't handle this well.

So in the current implementation, only cf_full and cf_none are exclusive
to each other, but they can be combined with any of cf_branch, cf_return,
cf_check. It's not perfect, but it is still an improvement over the original
one.

gcc/ChangeLog:

* common.opt: (fcf-protection=): Add EnumSet attribute to
support combination of params.

gcc/testsuite/ChangeLog:

* c-c++-common/fcf-protection-10.c: New test.
* c-c++-common/fcf-protection-11.c: New test.
* c-c++-common/fcf-protection-12.c: New test.
* c-c++-common/fcf-protection-8.c: New test.
* c-c++-common/fcf-protection-9.c: New test.
* gcc.target/i386/pr89701-1.c: New test.
* gcc.target/i386/pr89701-2.c: New test.
* gcc.target/i386/pr89701-3.c: New test.
---
 gcc/common.opt | 12 ++--
 gcc/testsuite/c-c++-common/fcf-protection-10.c |  2 ++
 gcc/testsuite/c-c++-common/fcf-protection-11.c |  2 ++
 gcc/testsuite/c-c++-common/fcf-protection-12.c |  2 ++
 gcc/testsuite/c-c++-common/fcf-protection-8.c  |  2 ++
 gcc/testsuite/c-c++-common/fcf-protection-9.c  |  2 ++
 gcc/testsuite/gcc.target/i386/pr89701-1.c  |  4 
 gcc/testsuite/gcc.target/i386/pr89701-2.c  |  4 
 gcc/testsuite/gcc.target/i386/pr89701-3.c  |  4 
 9 files changed, 28 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/fcf-protection-10.c
 create mode 100644 gcc/testsuite/c-c++-common/fcf-protection-11.c
 create mode 100644 gcc/testsuite/c-c++-common/fcf-protection-12.c
 create mode 100644 gcc/testsuite/c-c++-common/fcf-protection-8.c
 create mode 100644 gcc/testsuite/c-c++-common/fcf-protection-9.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89701-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89701-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89701-3.c

diff --git a/gcc/common.opt b/gcc/common.opt
index a28ca13385a..02f2472959a 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -1886,7 +1886,7 @@ fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
 fcf-protection=
-Common Joined RejectNegative Enum(cf_protection_level) Var(flag_cf_protection) 
Init(CF_NONE)
+Common Joined RejectNegative Enum(cf_protection_level) EnumSet 
Var(flag_cf_protection) Init(CF_NONE)
 -fcf-protection=[full|branch|return|none|check]Instrument functions 
with checks to verify jump/call/return control-flow transfer
 instructions have valid targets.
 
@@ -1894,19 +1894,19 @@ Enum
 Name(cf_protection_level) Type(enum cf_protection_level) UnknownError(unknown 
Control-Flow Protection Level %qs)
 
 EnumValue
-Enum(cf_protection_level) String(full) Value(CF_FULL)
+Enum(cf_protection_level) String(full) Value(CF_FULL) Set(1)
 
 EnumValue
-Enum(cf_protection_level) String(branch) Value(CF_BRANCH)
+Enum(cf_protection_level) String(branch) Value(CF_BRANCH) Set(2)
 
 EnumValue
-Enum(cf_protection_level) String(return) Value(CF_RETURN)
+Enum(cf_protection_level) String(return) Value(CF_RETURN) Set(3)
 
 EnumValue
-Enum(cf_protection_level) String(check) Value(CF_CHECK)
+Enum(cf_protection_level) String(check) Value(CF_CHECK) Set(4)
 
 EnumValue
-Enum(cf_protection_level) String(none) Value(CF_NONE)
+Enum(cf_protection_level) String(none) Value(CF_NONE) Set(1)
 
 finstrument-functions
 Common Var(flag_instrument_function_entry_exit,1)
diff --git a/gcc/testsuite/c-c++-common/fcf-protection-10.c 
b/gcc/testsuite/c-c++-common/fcf-protection-10.c
new file mode 100644
index 000..b271d134e52
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/fcf-protection-10.c
@@ -0,0 +1,2 @@
+/* { dg-do compile { target { "i?86-*-* x86_64-*-*" } } } */
+/* { dg-options "-fcf-protection=branch,check" } */
diff --git a/gcc/testsuite/c-c++-common/fcf-protection-11.c 
b/gcc/testsuite/c-c++-common/fcf-protection-11.c
new file mode 100644
index 000..2e566350ccd
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/fcf-protection-11.c
@@ -0,0 +1,2 @@
+/* { dg-do compile { target { "i?86-*-* x86_64-*-*" } } } */
+/* { dg-options "-fcf-protection=branch,return" } */
diff --git a/gcc/testsuite/c-c++-common/fcf-protection-12.c 
b/gcc/testsuite/c-c++-common/fcf-protection-12.c
new file mode 100644
index 000..b39c2f8e25d
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/fcf-protection-12.c
@@ -0,0 +1,2 @@
+/* { dg-do compile { target { "i?86-*-* x86_64-*-*" } } } */
+/* { dg-options "-fcf-protection=return,branch" } */
diff --git a/gcc/testsuite/c-c++-common/fcf-protection-8.c 
b/gcc/testsuite/c-c++-common/fcf-protection-8.c
new file mode 100644
index 000..3b97095a92c
--- /dev/null

[PATCH] Provide -fcf-protection=branch,return.

2023-05-11 Thread liuhongt via Gcc-patches

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/89701
* common.opt: Refactor -fcf-protection= to support combination
of param.
* lto-wrapper.cc (merge_and_complain): Adjusted.
* opts.cc (parse_cf_protection_options): New.
(common_handle_option): Decode argument for -fcf-protection=.
* opts.h (parse_cf_protection_options): Declare.

gcc/testsuite/ChangeLog:

PR target/89701
* c-c++-common/fcf-protection-8.c: New test.
* c-c++-common/fcf-protection-9.c: New test.
* c-c++-common/fcf-protection-10.c: New test.
* gcc.target/i386/pr89701-1.c: New test.
* gcc.target/i386/pr89701-2.c: New test.
* gcc.target/i386/pr89701-3.c: New test.
* gcc.target/i386/pr89701-4.c: New test.
---
 gcc/common.opt| 24 ++
 gcc/lto-wrapper.cc| 21 +++--
 gcc/opts.cc   | 79 +++
 gcc/opts.h|  1 +
 .../c-c++-common/fcf-protection-10.c  |  3 +
 .../c-c++-common/fcf-protection-11.c  |  2 +
 .../c-c++-common/fcf-protection-12.c  |  2 +
 gcc/testsuite/c-c++-common/fcf-protection-8.c |  3 +
 gcc/testsuite/c-c++-common/fcf-protection-9.c |  3 +
 gcc/testsuite/gcc.target/i386/pr89701-1.c |  4 +
 gcc/testsuite/gcc.target/i386/pr89701-2.c |  4 +
 gcc/testsuite/gcc.target/i386/pr89701-3.c |  5 ++
 gcc/testsuite/gcc.target/i386/pr89701-4.c |  5 ++
 13 files changed, 130 insertions(+), 26 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/fcf-protection-10.c
 create mode 100644 gcc/testsuite/c-c++-common/fcf-protection-11.c
 create mode 100644 gcc/testsuite/c-c++-common/fcf-protection-12.c
 create mode 100644 gcc/testsuite/c-c++-common/fcf-protection-8.c
 create mode 100644 gcc/testsuite/c-c++-common/fcf-protection-9.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89701-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89701-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89701-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr89701-4.c

diff --git a/gcc/common.opt b/gcc/common.opt
index a28ca13385a..ac12da52733 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -229,6 +229,10 @@ bool dump_base_name_prefixed = false
 Variable
 unsigned int flag_zero_call_used_regs
 
+;; What the CF check should instrument
+Variable
+unsigned int flag_cf_protection = 0
+
 ###
 Driver
 
@@ -1886,28 +1890,10 @@ fcf-protection
 Common RejectNegative Alias(fcf-protection=,full)
 
 fcf-protection=
-Common Joined RejectNegative Enum(cf_protection_level) Var(flag_cf_protection) 
Init(CF_NONE)
+Common Joined
 -fcf-protection=[full|branch|return|none|check]Instrument functions 
with checks to verify jump/call/return control-flow transfer
 instructions have valid targets.
 
-Enum
-Name(cf_protection_level) Type(enum cf_protection_level) UnknownError(unknown 
Control-Flow Protection Level %qs)
-
-EnumValue
-Enum(cf_protection_level) String(full) Value(CF_FULL)
-
-EnumValue
-Enum(cf_protection_level) String(branch) Value(CF_BRANCH)
-
-EnumValue
-Enum(cf_protection_level) String(return) Value(CF_RETURN)
-
-EnumValue
-Enum(cf_protection_level) String(check) Value(CF_CHECK)
-
-EnumValue
-Enum(cf_protection_level) String(none) Value(CF_NONE)
-
 finstrument-functions
 Common Var(flag_instrument_function_entry_exit,1)
 Instrument function entry and exit with profiling calls.
diff --git a/gcc/lto-wrapper.cc b/gcc/lto-wrapper.cc
index 5186d040ce0..568c8af659d 100644
--- a/gcc/lto-wrapper.cc
+++ b/gcc/lto-wrapper.cc
@@ -359,26 +359,33 @@ merge_and_complain (vec 
_options,
case OPT_fcf_protection_:
  /* Default to link-time option, else append or check identical.  */
  if (!cf_protection_option
- || cf_protection_option->value == CF_CHECK)
+ || !memcmp (cf_protection_option->arg, "check", 5))
{
+ const char* parg = decoded_options[existing_opt].arg;
  if (existing_opt == -1)
decoded_options.safe_push (*foption);
- else if (decoded_options[existing_opt].value != foption->value)
+ else if ((strlen (parg) != strlen (foption->arg))
+  || memcmp (parg, foption->arg, strlen (foption->arg)))
{
  if (cf_protection_option
- && cf_protection_option->value == CF_CHECK)
+ && !memcmp (cf_protection_option->arg, "check", 5))
fatal_error (input_location,
 "option %qs with mismatching values"
 " (%s, %s)",
 "-fcf-protection",
-decoded_options[existing_opt].arg,
+parg,
 foption->arg);

[PATCH] x86: Add a new option -mdaz-ftz to enable FTZ and DAZ flags in MXCSR.

2023-05-10 Thread liuhongt via Gcc-patches

> The quoted patch shows -shared in context and  you didn't post a
> backport version
> to look at.  But yes, we shouldn't change -shared behavior on a
> branch, even less so make it
> inconsistent between targets.
Here's the patch.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for GCC 11/12 backport?

if (mdaz-ftz)
  link crtfastmath.o
else if ((Ofast || ffast-math || funsafe-math-optimizations)
 && !mno-daz-ftz)
  link crtfastmath.o
else
  Don't link crtfastmath.o

gcc/ChangeLog:

* config/i386/cygwin.h (ENDFILE_SPEC): Link crtfastmath.o
whenever -mdaz-ftz is specified. Don't link crtfastmath.o
when -mno-daz-ftz is specified.
* config/i386/darwin.h (ENDFILE_SPEC): Ditto.
* config/i386/gnu-user-common.h
(GNU_USER_TARGET_MATHFILE_SPEC): Ditto.
* config/i386/mingw32.h (ENDFILE_SPEC): Ditto.
* config/i386/i386.opt (mdaz-ftz): New option.
* doc/invoke.texi (x86 options): Document mdaz-ftz.
---
 gcc/config/i386/cygwin.h  |  2 +-
 gcc/config/i386/darwin.h  |  4 ++--
 gcc/config/i386/gnu-user-common.h |  2 +-
 gcc/config/i386/i386.opt  |  4 
 gcc/config/i386/mingw32.h |  2 +-
 gcc/doc/invoke.texi   | 11 ++-
 6 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/cygwin.h b/gcc/config/i386/cygwin.h
index d06eda369cf..5412c5d4479 100644
--- a/gcc/config/i386/cygwin.h
+++ b/gcc/config/i386/cygwin.h
@@ -57,7 +57,7 @@ along with GCC; see the file COPYING3.  If not see
 
 #undef ENDFILE_SPEC
 #define ENDFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}\
+  
"%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}}
 \
%{!shared:%:if-exists(default-manifest.o%s)}\
%{fvtable-verify=none:%s; \
 fvtable-verify=preinit:vtv_end.o%s; \
diff --git a/gcc/config/i386/darwin.h b/gcc/config/i386/darwin.h
index a55f6b2b874..2f773924d6e 100644
--- a/gcc/config/i386/darwin.h
+++ b/gcc/config/i386/darwin.h
@@ -109,8 +109,8 @@ along with GCC; see the file COPYING3.  If not see
 "%{!force_cpusubtype_ALL:-force_cpusubtype_ALL} "
 
 #undef ENDFILE_SPEC
-#define ENDFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+#define ENDFILE_SPEC
+\  
"%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}}
 \
%{mpc32:crtprec32.o%s} \
%{mpc64:crtprec64.o%s} \
%{mpc80:crtprec80.o%s}" TM_DESTRUCTOR
diff --git a/gcc/config/i386/gnu-user-common.h 
b/gcc/config/i386/gnu-user-common.h
index 23b54c5be52..3d2a33f1714 100644
--- a/gcc/config/i386/gnu-user-common.h
+++ b/gcc/config/i386/gnu-user-common.h
@@ -47,7 +47,7 @@ along with GCC; see the file COPYING3.  If not see
 
 /* Similar to standard GNU userspace, but adding -ffast-math support.  */
 #define GNU_USER_TARGET_MATHFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+  
"%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}}
 \
%{mpc32:crtprec32.o%s} \
%{mpc64:crtprec64.o%s} \
%{mpc80:crtprec80.o%s}"
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index a3675e515bc..5cfb7cdcbc2 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -420,6 +420,10 @@ mpc80
 Target RejectNegative
 Set 80387 floating-point precision to 80-bit.
 
+mdaz-ftz
+Target
+Set the FTZ and DAZ Flags.
+
 mpreferred-stack-boundary=
 Target RejectNegative Joined UInteger Var(ix86_preferred_stack_boundary_arg)
 Attempt to keep stack aligned to this power of 2.
diff --git a/gcc/config/i386/mingw32.h b/gcc/config/i386/mingw32.h
index d3ca0cd0279..ddbe6a4054b 100644
--- a/gcc/config/i386/mingw32.h
+++ b/gcc/config/i386/mingw32.h
@@ -197,7 +197,7 @@ along with GCC; see the file COPYING3.  If not see
 
 #undef ENDFILE_SPEC
 #define ENDFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+  
"%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!mno-daz-ftz:crtfastmath.o%s}}
 \
%{!shared:%:if-exists(default-manifest.o%s)}\
%{fvtable-verify=none:%s; \
 fvtable-verify=preinit:vtv_end.o%s; \
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index cb83dd8a1cc..87eedfffa6c 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1434,7 +1434,7 @@ See RS/6000 and PowerPC Options.
 -m96bit-long-double  -mlong-double-64  -mlong-double-80  -mlong-double-128 @gol
 -mregparm=@var{num}  -msseregparm @gol
 -mveclibabi=@var{type}  -mvect8-ret-in-mem @gol
--mpc32  -mpc64  -mpc80  -mstackrealign @gol
+-mpc32  -mpc64  -mpc80 -mdaz-ftz -mstackrealign @gol
 -momit-leaf-frame-pointer  -mno-red-zone  -mno-tls-direct-seg-refs @gol
 -mcmodel=@var{code-model}  -mabi=@var{name}  -maddress-mode=@var{mode} @gol
 -m32  -m64  -mx32  -m16  -miamcu  -mlarge-data-threshold=@var{num} @gol
@@ -32078,6

[PATCH] Detect bswap + rotate for byte permutation in pass_bswap.

2023-05-09 Thread liuhongt via Gcc-patches

The patch doesn't handle:
  1. cast64_to_32,
  2. memory source with rsize < range.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR middle-end/108938
* gimple-ssa-store-merging.cc (is_bswap_or_nop_p): New
function, cut from original find_bswap_or_nop function.
(find_bswap_or_nop): Add a new parameter, detect bswap +
rotate and save rotate result in the new parameter.
(bswap_replace): Add a new parameter to indicate rotate and
generate rotate stmt if needed.
(maybe_optimize_vector_constructor): Adjust for new rotate
parameter in the upper 2 functions.
(pass_optimize_bswap::execute): Ditto.
(imm_store_chain_info::output_merged_store): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr108938-1.c: New test.
* gcc.target/i386/pr108938-2.c: New test.
* gcc.target/i386/pr108938-3.c: New test.
* gcc.target/i386/pr108938-load-1.c: New test.
* gcc.target/i386/pr108938-load-2.c: New test.
---
 gcc/gimple-ssa-store-merging.cc   | 130 ++
 gcc/testsuite/gcc.target/i386/pr108938-1.c|  79 +++
 gcc/testsuite/gcc.target/i386/pr108938-2.c|  35 +
 gcc/testsuite/gcc.target/i386/pr108938-3.c|  26 
 .../gcc.target/i386/pr108938-load-1.c |  69 ++
 .../gcc.target/i386/pr108938-load-2.c |  30 
 6 files changed, 342 insertions(+), 27 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-load-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-load-2.c

diff --git a/gcc/gimple-ssa-store-merging.cc b/gcc/gimple-ssa-store-merging.cc
index df7afd2fd78..9cb574fa315 100644
--- a/gcc/gimple-ssa-store-merging.cc
+++ b/gcc/gimple-ssa-store-merging.cc
@@ -893,6 +893,37 @@ find_bswap_or_nop_finalize (struct symbolic_number *n, 
uint64_t *cmpxchg,
   n->range *= BITS_PER_UNIT;
 }
 
+/* Helper function for find_bswap_or_nop,
+   Return true if N is a swap or nop with MASK.  */
+static bool
+is_bswap_or_nop_p (uint64_t n, uint64_t cmpxchg,
+  uint64_t cmpnop, uint64_t* mask,
+  bool* bswap)
+{
+  *mask = ~(uint64_t) 0;
+  if (n == cmpnop)
+*bswap = false;
+  else if (n == cmpxchg)
+*bswap = true;
+  else
+{
+  int set = 0;
+  for (uint64_t msk = MARKER_MASK; msk; msk <<= BITS_PER_MARKER)
+   if ((n & msk) == 0)
+ *mask &= ~msk;
+   else if ((n & msk) == (cmpxchg & msk))
+ set++;
+   else
+ return false;
+
+  if (set < 2)
+   return false;
+  *bswap = true;
+}
+  return true;
+}
+
+
 /* Check if STMT completes a bswap implementation or a read in a given
endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
accordingly.  It also sets N to represent the kind of operations
@@ -903,7 +934,7 @@ find_bswap_or_nop_finalize (struct symbolic_number *n, 
uint64_t *cmpxchg,
 
 gimple *
 find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap,
-  bool *cast64_to_32, uint64_t *mask)
+  bool *cast64_to_32, uint64_t *mask, uint64_t* l_rotate)
 {
   tree type_size = TYPE_SIZE_UNIT (TREE_TYPE (gimple_get_lhs (stmt)));
   if (!tree_fits_uhwi_p (type_size))
@@ -984,29 +1015,57 @@ find_bswap_or_nop (gimple *stmt, struct symbolic_number 
*n, bool *bswap,
 }
 
   uint64_t cmpxchg, cmpnop;
+  uint64_t orig_range = n->range * BITS_PER_UNIT;
   find_bswap_or_nop_finalize (n, , , cast64_to_32);
 
   /* A complete byte swap should make the symbolic number to start with
  the largest digit in the highest order byte. Unchanged symbolic
  number indicates a read with same endianness as target architecture.  */
-  *mask = ~(uint64_t) 0;
-  if (n->n == cmpnop)
-*bswap = false;
-  else if (n->n == cmpxchg)
-*bswap = true;
-  else
+  *l_rotate = 0;
+  uint64_t tmp_n = n->n;
+  if (!is_bswap_or_nop_p (tmp_n, cmpxchg, cmpnop, mask, bswap))
 {
-  int set = 0;
-  for (uint64_t msk = MARKER_MASK; msk; msk <<= BITS_PER_MARKER)
-   if ((n->n & msk) == 0)
- *mask &= ~msk;
-   else if ((n->n & msk) == (cmpxchg & msk))
- set++;
-   else
- return NULL;
-  if (set < 2)
+  /* Try bswap + lrotate.  */
+  /* TODO, handle cast64_to_32 and big/litte_endian memory
+source when rsize < range.  */
+  if (n->range == orig_range
+ && ((orig_range == 32
+  && optab_handler (rotl_optab, SImode) != CODE_FOR_nothing)
+ || (orig_range == 64
+ && optab_handler (rotl_optab, DImode) != CODE_FOR_nothing))
+ && (tmp_n & MARKER_MASK) < orig_range / BITS_PER_UNIT)
+   {
+ uint64_t range = (orig_range / BITS_PER_UNIT) *

[PATCH V2] [vect]Enhance NARROW FLOAT_EXPR vectorization by truncating integer to lower precision.

2023-05-07 Thread liuhongt via Gcc-patches

> > @@ -4799,7 +4800,8 @@ vect_create_vectorized_demotion_stmts (vec_info 
> > *vinfo, vec *vec_oprnds,
> >stmt_vec_info stmt_info,
> >vec _dsts,
> >gimple_stmt_iterator *gsi,
> > -  slp_tree slp_node, enum tree_code 
> > code)
> > +  slp_tree slp_node, enum tree_code 
> > code,
> > +  bool last_stmt_p)
>
> Can you please document this new parameter?
>
Changed.

>
> I understand what you are doing, but somehow it looks a bit awkward?
> Maybe we should split the NARROW case into NARROW_SRC and NARROW_DST?
> The case of narrowing the source because we know its range isn't a
> good fit for the
> flow.
Changed.

Here's updated patch.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

Similar to WIDEN FLOAT_EXPR, when direct_optab does not exist, try an
intermediate integer type whenever gimple ranger can tell it's safe.

i.e.
When there's no direct optab for vector long long -> vector float, but
the value range of the integer can be represented as int, try vector int
-> vector float if available.

gcc/ChangeLog:

PR tree-optimization/108804
* tree-vect-patterns.cc (vect_get_range_info): Remove static.
* tree-vect-stmts.cc (vect_create_vectorized_demotion_stmts):
Add new parameter narrow_src_p.
(vectorizable_conversion): Enhance NARROW FLOAT_EXPR
vectorization by truncating to lower precision.
* tree-vectorizer.h (vect_get_range_info): New declare.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr108804.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr108804.c |  15 +++
 gcc/tree-vect-patterns.cc|   2 +-
 gcc/tree-vect-stmts.cc   | 135 +--
 gcc/tree-vectorizer.h|   1 +
 4 files changed, 121 insertions(+), 32 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr108804.c

diff --git a/gcc/testsuite/gcc.target/i386/pr108804.c 
b/gcc/testsuite/gcc.target/i386/pr108804.c
new file mode 100644
index 000..2a43c1e1848
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr108804.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -Ofast -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 1 "vect" } } */
+
+typedef unsigned long long uint64_t;
+uint64_t d[512];
+float f[1024];
+
+void foo() {
+for (int i=0; i<512; ++i) {
+uint64_t k = d[i];
+f[i]=(k & 0x3F30);
+}
+}
+
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index a49b0953977..dd546b488a4 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -61,7 +61,7 @@ along with GCC; see the file COPYING3.  If not see
 /* Return true if we have a useful VR_RANGE range for VAR, storing it
in *MIN_VALUE and *MAX_VALUE if so.  Note the range in the dump files.  */
 
-static bool
+bool
 vect_get_range_info (tree var, wide_int *min_value, wide_int *max_value)
 {
   value_range vr;
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 6b7dbfd4a23..3da89a8402d 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -51,6 +51,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "internal-fn.h"
 #include "tree-vector-builder.h"
 #include "vec-perm-indices.h"
+#include "gimple-range.h"
 #include "tree-ssa-loop-niter.h"
 #include "gimple-fold.h"
 #include "regs.h"
@@ -4791,7 +4792,9 @@ vect_gen_widened_results_half (vec_info *vinfo, enum 
tree_code code,
 
 /* Create vectorized demotion statements for vector operands from VEC_OPRNDS.
For multi-step conversions store the resulting vectors and call the function
-   recursively.  */
+   recursively. When NARROW_SRC_P is true, there's still a conversion after
+   narrowing, don't store the vectors in the SLP_NODE or in vector info of
+   the scalar statement(or in STMT_VINFO_RELATED_STMT chain).  */
 
 static void
 vect_create_vectorized_demotion_stmts (vec_info *vinfo, vec *vec_oprnds,
@@ -4799,7 +4802,8 @@ vect_create_vectorized_demotion_stmts (vec_info *vinfo, 
vec *vec_oprnds,
   stmt_vec_info stmt_info,
   vec _dsts,
   gimple_stmt_iterator *gsi,
-  slp_tree slp_node, enum tree_code code)
+  slp_tree slp_node, enum tree_code code,
+  bool narrow_src_p)
 {
   unsigned int i;
   tree vop0, vop1, new_tmp, vec_dest;
@@ -4815,9 +4819,9 @@ vect_create_vectorized_demotion_stmts (vec_info *vinfo, 
vec *vec_oprnds,
   new_tmp = make_ssa_name (vec_dest, new_stmt);
   gimple_assign_set_lhs (new_stmt, new_tmp);
   vect_finish_stmt_generation (vinfo, stmt_info,

[PATCH] [powerpc] Add a peephole2 to eliminate redundant move from VSX_REGS to GENERAL_REGS when it's from memory.

2023-05-03 Thread liuhongt via Gcc-patches

r14-172-g0368d169492017 uses NO_REGS instead of GENERAL_REGS in memory cost
calculation when the preferred register class is unknown.
+  /* Costs for NO_REGS are used in cost calculation on the
+1st pass when the preferred register classes are not
+known yet.  In this case we take the best scenario.  */

It regressed gcc.target/powerpc/dform-3.c, which has inline asm that explicitly
puts a vector mode into a general register, which then creates an extra move.
RA doesn't allocate GENERAL_REGS for it because the backend pattern
explicitly disparages the alternatives (<??r>, r) and (??r, Y), which move
from GENERAL_REGS/MEM to GENERAL_REGS.

(define_insn "vsx_mov_64bit"
  [(set (match_operand:VSX_M 0 "nonimmediate_operand"
   "=ZwO,  wa,wa,r, we,?wQ,
?,   ??r,   ??Y,   , wa,v,
wa,wa,
?wa,   v, , wZ,v")

(match_operand:VSX_M 1 "input_operand" 
   "wa,ZwO,   wa,we,r, r,
wQ,Y, r, r, wE,jwM,
eQ,eP,
?jwM,  W, ,  v, wZ"))]

  "TARGET_POWERPC64 && VECTOR_MEM_VSX_P (mode)
   && (register_operand (operands[0], mode) 
   || register_operand (operands[1], mode))"
{
  return rs6000_output_move_128bit (operands);
}

Normally the extra move can be eliminated by pass_reload when src and
dest have the same reg_class, but in this case src and dest have
different reg_classes.

The patch adds a peephole2 to eliminate the extra move.

Bootstrapped and regtested on powerpc64le-linux-gnu.
Ok for trunk?


gcc/ChangeLog:

PR target/109610
* config/rs6000/vsx.md (define_peephole2): New peephole2 to
catch memory loads to VSX_REGS and then moves to GENERAL_REGS.
---
 gcc/config/rs6000/vsx.md | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index 7d845df5c2d..a0808ccff9a 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -1075,6 +1075,16 @@ (define_peephole2
&& peep2_reg_dead_p (2, operands[0])"
[(set (match_dup 2) (match_dup 1))])
 
+;; Peephole to catch memory loads to VSX_REG and then moves to GENERAL_REGS.
+(define_peephole2
+  [(set (match_operand:VSX_M 0 "vsx_register_operand")
+   (match_operand:VSX_M 1 "memory_operand"))
+   (set (match_operand:VSX_M 2 "int_reg_operand")
+   (match_dup 0))]
+  "TARGET_POWERPC64 && VECTOR_MEM_VSX_P (mode)
+  && peep2_reg_dead_p (2, operands[0])"
+  [(set (match_dup 2) (match_dup 1))])
+
 ;; Peephole to catch memory to memory transfers for TImode if TImode landed in
 ;; VSX registers on a little endian system.  The vector types and IEEE 128-bit
 ;; floating point are handled by the more generic swap elimination pass.
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH v2] Canonicalize vec_merge when mask is constant.

2023-05-03 Thread liuhongt via Gcc-patches

Here's the updated patch with documentation in md.texi.
Ok for trunk?

--
Use swap_commutative_operands_p for canonicalization. When both operands
have the same operand precedence value, the first bit in the mask should
select the first operand.

The canonicalization should help backends with pattern matching, e.g. the x86
backend has lots of vec_merge patterns; combine can create either form
of vec_merge (mask or inverted mask), so the backend needs to add 2
patterns to match exactly 1 instruction. The canonicalization can
reduce the 2 patterns to 1.

gcc/ChangeLog:

* combine.cc (maybe_swap_commutative_operands): Canonicalize
vec_merge when mask is constant.
* doc/md.texi: Document vec_merge canonicalization.
---
 gcc/combine.cc  | 22 ++
 gcc/doc/md.texi |  7 +++
 2 files changed, 29 insertions(+)

diff --git a/gcc/combine.cc b/gcc/combine.cc
index 0106092e456..5aa0ec5c45a 100644
--- a/gcc/combine.cc
+++ b/gcc/combine.cc
@@ -5631,6 +5631,28 @@ maybe_swap_commutative_operands (rtx x)
   SUBST (XEXP (x, 0), XEXP (x, 1));
   SUBST (XEXP (x, 1), temp);
 }
+
+  unsigned n_elts = 0;
+  if (GET_CODE (x) == VEC_MERGE
+  && CONST_INT_P (XEXP (x, 2))
+  && GET_MODE_NUNITS (GET_MODE (x)).is_constant (_elts)
+  && (swap_commutative_operands_p (XEXP (x, 0), XEXP (x, 1))
+ /* Two operands have same precedence, then
+first bit of mask select first operand.  */
+ || (!swap_commutative_operands_p (XEXP (x, 1), XEXP (x, 0))
+ && !(UINTVAL (XEXP (x, 2)) & 1
+{
+  rtx temp = XEXP (x, 0);
+  unsigned HOST_WIDE_INT sel = UINTVAL (XEXP (x, 2));
+  unsigned HOST_WIDE_INT mask = HOST_WIDE_INT_1U;
+  if (n_elts == HOST_BITS_PER_WIDE_INT)
+   mask = -1;
+  else
+   mask = (HOST_WIDE_INT_1U << n_elts) - 1;
+  SUBST (XEXP (x, 0), XEXP (x, 1));
+  SUBST (XEXP (x, 1), temp);
+  SUBST (XEXP (x, 2), GEN_INT (~sel & mask));
+}
 }
 
 /* Simplify X, a piece of RTL.  We just operate on the expression at the
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 07bf8bdebff..aff9b7348ce 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -8215,6 +8215,13 @@ second operand.  If a machine only supports a constant 
as the second
 operand, only patterns that match a constant in the second operand need
 be supplied.
 
+@cindex @code{vec_merge}, canonicalization of
+@item
+For the @code{vec_merge} with constant mask(the third operand), the first
+and the second operand can be exchanged by inverting the mask. In such cases,
+a constant is always made the second operand, otherwise the least significant
+bit of the mask is always set(select the first operand first).
+
 @item
 For associative operators, a sequence of operators will always chain
 to the left; for instance, only the left operand of an integer @code{plus}
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] [vect]Enhance NARROW FLOAT_EXPR vectorization by truncating integer to lower precision.

2023-04-26 Thread liuhongt via Gcc-patches

Similar to WIDEN FLOAT_EXPR, when direct_optab does not exist, try an
intermediate integer type whenever gimple ranger can tell it's safe.

i.e.
When there's no direct optab for vector long long -> vector float, but
the value range of the integer can be represented as int, try vector int
-> vector float if available.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR tree-optimization/108804
* tree-vect-patterns.cc (vect_get_range_info): Remove static.
* tree-vect-stmts.cc (vect_create_vectorized_demotion_stmts):
Add new parameter last_stmt_p.
(vectorizable_conversion): Enhance NARROW FLOAT_EXPR
vectorization by truncating to lower precision.
* tree-vectorizer.h (vect_get_range_info): New declare.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr108804.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr108804.c |  15 
 gcc/tree-vect-patterns.cc|   2 +-
 gcc/tree-vect-stmts.cc   | 106 ++-
 gcc/tree-vectorizer.h|   1 +
 4 files changed, 100 insertions(+), 24 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr108804.c

diff --git a/gcc/testsuite/gcc.target/i386/pr108804.c 
b/gcc/testsuite/gcc.target/i386/pr108804.c
new file mode 100644
index 000..2a43c1e1848
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr108804.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -Ofast -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 1 "vect" } } */
+
+typedef unsigned long long uint64_t;
+uint64_t d[512];
+float f[1024];
+
+void foo() {
+for (int i=0; i<512; ++i) {
+uint64_t k = d[i];
+f[i]=(k & 0x3F30);
+}
+}
+
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index a49b0953977..dd546b488a4 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -61,7 +61,7 @@ along with GCC; see the file COPYING3.  If not see
 /* Return true if we have a useful VR_RANGE range for VAR, storing it
in *MIN_VALUE and *MAX_VALUE if so.  Note the range in the dump files.  */
 
-static bool
+bool
 vect_get_range_info (tree var, wide_int *min_value, wide_int *max_value)
 {
   value_range vr;
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 6b7dbfd4a23..d79a1409d24 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -51,6 +51,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "internal-fn.h"
 #include "tree-vector-builder.h"
 #include "vec-perm-indices.h"
+#include "gimple-range.h"
 #include "tree-ssa-loop-niter.h"
 #include "gimple-fold.h"
 #include "regs.h"
@@ -4799,7 +4800,8 @@ vect_create_vectorized_demotion_stmts (vec_info *vinfo, 
vec *vec_oprnds,
   stmt_vec_info stmt_info,
   vec _dsts,
   gimple_stmt_iterator *gsi,
-  slp_tree slp_node, enum tree_code code)
+  slp_tree slp_node, enum tree_code code,
+  bool last_stmt_p)
 {
   unsigned int i;
   tree vop0, vop1, new_tmp, vec_dest;
@@ -4815,9 +4817,9 @@ vect_create_vectorized_demotion_stmts (vec_info *vinfo, 
vec *vec_oprnds,
   new_tmp = make_ssa_name (vec_dest, new_stmt);
   gimple_assign_set_lhs (new_stmt, new_tmp);
   vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
-
-  if (multi_step_cvt)
-   /* Store the resulting vector for next recursive call.  */
+  if (multi_step_cvt || !last_stmt_p)
+   /* Store the resulting vector for next recursive call,
+  or return the resulting vector_tmp for NARROW FLOAT_EXPR.  */
(*vec_oprnds)[i/2] = new_tmp;
   else
{
@@ -4843,7 +4845,8 @@ vect_create_vectorized_demotion_stmts (vec_info *vinfo, 
vec *vec_oprnds,
   vect_create_vectorized_demotion_stmts (vinfo, vec_oprnds,
 multi_step_cvt - 1,
 stmt_info, vec_dsts, gsi,
-slp_node, VEC_PACK_TRUNC_EXPR);
+slp_node, VEC_PACK_TRUNC_EXPR,
+last_stmt_p);
 }
 
   vec_dsts.quick_push (vec_dest);
@@ -5248,22 +5251,53 @@ vectorizable_conversion (vec_info *vinfo,
   _types))
break;
 
-  if (code != FIX_TRUNC_EXPR
- || GET_MODE_SIZE (lhs_mode) >= GET_MODE_SIZE (rhs_mode))
+  if (GET_MODE_SIZE (lhs_mode) >= GET_MODE_SIZE (rhs_mode))
goto unsupported;
 
-  cvt_type
-   = build_nonstandard_integer_type (GET_MODE_BITSIZE (rhs_mode), 0);
-  cvt_type = get_same_sized_vectype (cvt_type, vectype_in);
-  if (cvt_type == NULL_TREE)
-   goto unsupported;
-

[PATCH] Add testcases for ffs/ctz vectorization.

2023-04-22 Thread liuhongt via Gcc-patches

Ready push to trunk.

gcc/testsuite/ChangeLog:

PR tree-optimization/109011
* gcc.target/i386/pr109011-b1.c: New test.
* gcc.target/i386/pr109011-b2.c: New test.
* gcc.target/i386/pr109011-d1.c: New test.
* gcc.target/i386/pr109011-d2.c: New test.
* gcc.target/i386/pr109011-q1.c: New test.
* gcc.target/i386/pr109011-q2.c: New test.
* gcc.target/i386/pr109011-w1.c: New test.
* gcc.target/i386/pr109011-w2.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr109011-b1.c  |  53 +
 gcc/testsuite/gcc.target/i386/pr109011-b2.c  | 104 
 gcc/testsuite/gcc.target/i386/pr109011-d1.c  |  46 
 gcc/testsuite/gcc.target/i386/pr109011-d2.c  | 118 +++
 gcc/testsuite/gcc.target/i386/pr109011-dq1.c |  46 
 gcc/testsuite/gcc.target/i386/pr109011-dq2.c | 104 
 gcc/testsuite/gcc.target/i386/pr109011-q1.c  |  46 
 gcc/testsuite/gcc.target/i386/pr109011-q2.c  | 118 +++
 gcc/testsuite/gcc.target/i386/pr109011-w1.c  |  47 
 gcc/testsuite/gcc.target/i386/pr109011-w2.c  | 104 
 10 files changed, 786 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-b1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-b2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-d1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-d2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-dq1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-dq2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-q1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-q2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-w1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-w2.c

diff --git a/gcc/testsuite/gcc.target/i386/pr109011-b1.c 
b/gcc/testsuite/gcc.target/i386/pr109011-b1.c
new file mode 100644
index 000..9833d3526f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-b1.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-march=icelake-server -O3" } */
+/* { dg-final { scan-assembler-times "vpopcntb\[ \t\]+" 4 } } */
+/* 4 vplzcntd come from function clzw, the other 4 come from function clzb0.  
*/
+/* { dg-final { scan-assembler-times "vplzcntd\[ \t\]+" 8 } } */
+
+void
+__attribute__((noipa))
+popcntb (unsigned char *p, unsigned char *q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+p[i] = __builtin_popcount (q[i]);
+}
+
+void
+__attribute__((noipa))
+clzb (unsigned char *p, unsigned char* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+p[i] = __builtin_clz (q[i]);
+}
+
+void
+__attribute__((noipa))
+ffsb (unsigned char *p, unsigned char* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+p[i] = __builtin_ffs (q[i]);
+}
+
+void
+__attribute__((noipa))
+ctzb (unsigned char *p, unsigned char* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+p[i] = __builtin_ctz (q[i]);
+}
+
+void
+__attribute__((noipa))
+clzb0 (unsigned char *p, unsigned char* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+p[i] = q[i] ? __builtin_clz (q[i]) : 8;
+}
+
+void
+__attribute__((noipa))
+ctzb0 (unsigned char *p, unsigned char* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+p[i] = q[i] ? __builtin_ctz (q[i]) : 8;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr109011-b2.c 
b/gcc/testsuite/gcc.target/i386/pr109011-b2.c
new file mode 100644
index 000..7f2042645d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-b2.c
@@ -0,0 +1,104 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -mbmi -mlzcnt -mavx512vl -mavx512cd -mavx512bitalg 
-mavx512vpopcntdq -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-require-effective-target avx512cd } */
+/* { dg-require-effective-target avx512bitalg } */
+/* { dg-require-effective-target avx512vpopcntdq } */
+
+#define AVX512F
+#define AVX512VL
+#define AVX512CD
+#define AVX512BITALG
+#define AVX512VPOPCNTDQ
+
+#include "avx512f-helper.h"
+#include "pr109011-b1.c"
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+popcntb_scalar (unsigned char *p, unsigned char *q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+p[i] = __builtin_popcount (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzb_scalar (unsigned char *p, unsigned char* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+p[i] = __builtin_clz (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ffsb_scalar (unsigned char *p, unsigned char* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+p[i] = __builtin_ffs (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzb0_scalar (unsigned char *p, unsigned char* __restrict q)
+{
+  for (unsigned int i = 0; i < 2048; ++i)
+p[i] = q[i] ?

[PATCH 1/2] [i386] Support type _Float16/__bf16 independent of SSE2.

2023-04-21 Thread liuhongt via Gcc-patches

> > +  if (!TARGET_SSE2)
> > +{
> > +  if (c_dialect_cxx ()
> > +   && cxx_dialect > cxx20)
>
> Formatting, both conditions are short, so just put them on one line.
Changed.

> But for the C++23 macros, more importantly I think we really should
> also in ix86_target_macros_internal add
>   if (c_dialect_cxx ()
>   && cxx_dialect > cxx20
>   && (isa_flag & OPTION_MASK_ISA_SSE2))
> {
>   def_or_undef (parse_in, "__STDCPP_FLOAT16_T__");
>   def_or_undef (parse_in, "__STDCPP_BFLOAT16_T__");
> }
> plus associated libstdc++ changes.  It can be done incrementally though.
Added in PATCH 2/2

> > +      if (flag_building_libgcc)
> > +     {
> > +       /* libbid uses __LIBGCC_HAS_HF_MODE__ and __LIBGCC_HAS_BF_MODE__
> > +          to check backend support of _Float16 and __bf16 type.  */
>
> That is actually the case only for HFmode, but not for BFmode right now.
> So, we need further work.  One is to add the BFmode support in there,
> and another one is make sure the _Float16 <-> _Decimal* and __bf16 <->
> _Decimal* conversions are compiled in also if not -msse2 by default.
> One way to do that is wrap the HF and BF mode related functions on x86
> #ifndef __SSE2__ into the pragmas like intrin headers use (but then
> perhaps we don't need to undef this stuff here), another is not provide
> the hf/bf support in that case from the TUs where they are provided now,
> but from a different one which would be compiled with -msse2.
Add CFLAGS-_hf_to_sd.c += -msse2, similar for other files in libbid, just like
we did before for HFtype softfp. Then no need to undef libgcc macros.

> >/* We allowed the user to turn off SSE for kernel mode.  Don't crash if
> >   some less clueful developer tries to use floating-point anyway.  */
> > -  if (needed_sseregs && !TARGET_SSE)
> > +  if (needed_sseregs
> > +  && (!TARGET_SSE
> > +   || (VALID_SSE2_TYPE_MODE (mode)
> > +   && !TARGET_SSE2)))
>
> Formatting, no need to split this up that much.
>   if (needed_sseregs
>   && (!TARGET_SSE
>   || (VALID_SSE2_TYPE_MODE (mode) && !TARGET_SSE2)))
> or even better
>   if (needed_sseregs
>   && (!TARGET_SSE || (VALID_SSE2_TYPE_MODE (mode) && !TARGET_SSE2)))
> will do it.
Changed.

> Instead of this, just use
>   if (!float16_type_node)
> {
>   float16_type_node = ix86_float16_type_node;
>   callback (float16_type_node);
>   float16_type_node = NULL_TREE;
> }
>   if (!bfloat16_type_node)
> {
>   bfloat16_type_node = ix86_bf16_type_node;
>   callback (bfloat16_type_node);
>   bfloat16_type_node = NULL_TREE;
> }
Changed.


> > +static const char *
> > +ix86_invalid_conversion (const_tree fromtype, const_tree totype)
> > +{
> > +  if (element_mode (fromtype) != element_mode (totype))
> > +    {
> > +      /* Do no allow conversions to/from BFmode/HFmode scalar types
> > +      when TARGET_SSE2 is not available.  */
> > +      if ((TYPE_MODE (fromtype) == BFmode
> > +        || TYPE_MODE (fromtype) == HFmode)
> > +       && !TARGET_SSE2)
>
> First of all, not really sure if this should be purely about scalar
> modes, not also complex and vector modes involving those inner modes.
> Because complex or vector modes with BF/HF elements will be without
> TARGET_SSE2 for sure lowered into scalar code and that can't be handled
> either.
> So if (!TARGET_SSE2 && GET_MODE_INNER (TYPE_MODE (fromtype)) == BFmode)
> or even better
> if (!TARGET_SSE2 && element_mode (fromtype) == BFmode)
> ?
> Or even better remember the 2 modes above into machine_mode temporaries
> and just use those in the != comparison and for the checks?
>
> Also, I think it is weird to tell user %<__bf16%> or %<_Float16%> when
> we know which one it is.  Just return separate messages?
Changed.

> > +  /* Reject all single-operand operations on BFmode/HFmode except for &
> > +     when TARGET_SSE2 is not available.  */
> > +  if ((element_mode (type) == BFmode || element_mode (type) == HFmode)
> > +      && !TARGET_SSE2 && op != ADDR_EXPR)
> > +    return N_("operation not permitted on type %<__bf16%> "
> > +           "or %<_Float16%> without option %<-msse2%>");
>
> Similarly.  Also, check !TARGET_SSE2 first as inexpensive one.
Changed.


Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Successfully cross-build i686-linux-gnu.
Ok for trunk?

Enable _Float16 and __bf16 all the time but issue errors when the
types are used in conversion, unary operation, binary operation,
parameter passing or value return when TARGET_SSE2 is not available.

Also undef macros which are used by libgcc/libstdc++ to check the
backend support of the _Float16/__bf16 types when TARGET_SSE2 is not
available.

gcc/ChangeLog:

PR target/109504
* config/i386/i386-builtins.cc
(ix86_register_float16_builtin_type): Remove TARGET_SSE2.
(ix86_register_bf16_builtin_type): Ditto.
* config/i386/i386-c.cc

[PATCH 2/2] [i386] def_or_undef __STDCPP_FLOAT16_T__ and __STDCPP_BFLOAT16_T__ for target attribute/pragmas.

2023-04-21 Thread liuhongt via Gcc-patches

> But for the C++23 macros, more importantly I think we really should
> also in ix86_target_macros_internal add
>   if (c_dialect_cxx ()
>   && cxx_dialect > cxx20
>   && (isa_flag & OPTION_MASK_ISA_SSE2))
> {
>   def_or_undef (parse_in, "__STDCPP_FLOAT16_T__");
>   def_or_undef (parse_in, "__STDCPP_BFLOAT16_T__");
> }
> plus associated libstdc++ changes.  It can be done incrementally though.
Changed except for one place in libsupc++/compare: it's inside a function,
where the pragma cannot be added. Not sure if this inconsistency will cause any
issue.

#ifdef __STDCPP_BFLOAT16_T__
  if constexpr (__is_same(_Tp, decltype(0.0bf16)))
return _Bfloat16;
#endif

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Successfully cross-build i686-linux-gnu.
Ok for trunk?

def_or_undef target macros based on the currently active ISA in pragmas,
and also do that for __STDCPP_FLOAT16_T__ and __STDCPP_BFLOAT16_T__ for
C++. Change libstdc++ so that, for x86, it wraps the std::float16_t/std::bfloat16_t
code in #pragma GCC target("sse2"), similarly to the x86 intrin headers.

gcc/ChangeLog:

PR target/109504
* config/i386/i386-c.cc (ix86_target_macros_internal):
def_or_undef __STDCPP_FLOAT16_T__ and __STDCPP_BFLOAT16_T__.

libstdc++-v3/ChangeLog:

* include/bits/c++config: Add #pragma GCC target("sse2") for
_Float16 and bfloat16_t when __SSE2__ is not available.
* include/bits/cpp_type_traits.h: Ditto.
* include/bits/std_abs.h: Ditto.
* include/c_global/cmath: Ditto.
* include/ext/type_traits.h: Ditto.
* include/std/atomic: Ditto.
* include/std/charconv: Ditto.
* include/std/complex: Ditto.
* include/std/istream: Ditto.
* include/std/limits: Ditto.
* include/std/numbers: Ditto.
* include/std/ostream: Ditto.
* include/std/stdfloat: Ditto.
* include/std/type_traits: Ditto.
---
 gcc/config/i386/i386-c.cc   |   9 +-
 libstdc++-v3/include/bits/c++config |  11 +
 libstdc++-v3/include/bits/cpp_type_traits.h |  27 +-
 libstdc++-v3/include/bits/std_abs.h |  23 +-
 libstdc++-v3/include/c_global/cmath | 733 +++-
 libstdc++-v3/include/ext/type_traits.h  |  23 +-
 libstdc++-v3/include/std/atomic |  43 +-
 libstdc++-v3/include/std/charconv   |  90 ++-
 libstdc++-v3/include/std/complex| 227 +++---
 libstdc++-v3/include/std/istream|  61 +-
 libstdc++-v3/include/std/limits |  37 +-
 libstdc++-v3/include/std/numbers|  11 +
 libstdc++-v3/include/std/ostream|  29 +-
 libstdc++-v3/include/std/stdfloat   |  19 +-
 libstdc++-v3/include/std/type_traits|  23 +-
 15 files changed, 809 insertions(+), 557 deletions(-)

diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc
index 2f83c9981e1..bcc17263e28 100644
--- a/gcc/config/i386/i386-c.cc
+++ b/gcc/config/i386/i386-c.cc
@@ -492,7 +492,14 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag,
   if (isa_flag & OPTION_MASK_ISA_SSE)
 def_or_undef (parse_in, "__SSE__");
   if (isa_flag & OPTION_MASK_ISA_SSE2)
-def_or_undef (parse_in, "__SSE2__");
+{
+  def_or_undef (parse_in, "__SSE2__");
+  if (c_dialect_cxx () && cxx_dialect > cxx20)
+   {
+ def_or_undef (parse_in, "__STDCPP_FLOAT16_T__");
+ def_or_undef (parse_in, "__STDCPP_BFLOAT16_T__");
+   }
+}
   if (isa_flag & OPTION_MASK_ISA_SSE3)
 def_or_undef (parse_in, "__SSE3__");
   if (isa_flag & OPTION_MASK_ISA_SSSE3)
diff --git a/libstdc++-v3/include/bits/c++config 
b/libstdc++-v3/include/bits/c++config
index 13892787e09..c858497fc6e 100644
--- a/libstdc++-v3/include/bits/c++config
+++ b/libstdc++-v3/include/bits/c++config
@@ -820,6 +820,12 @@ namespace std
 # define _GLIBCXX_LDOUBLE_IS_IEEE_BINARY128 1
 #endif
 
+#ifndef __SSE2__
+#pragma GCC push_options
+#pragma GCC target("sse2")
+#define __DISABLE_STDCPP_SSE2__
+#endif
+
 #ifdef __STDCPP_BFLOAT16_T__
 namespace __gnu_cxx
 {
@@ -827,6 +833,11 @@ namespace __gnu_cxx
 }
 #endif
 
+#ifdef __DISABLE_STDCPP_SSE2__
+#undef __DISABLE_STDCPP_SSE2__
+#pragma GCC pop_options
+#endif
+
 #ifdef __has_builtin
 # ifdef __is_identifier
 // Intel and older Clang require !__is_identifier for some built-ins:
diff --git a/libstdc++-v3/include/bits/cpp_type_traits.h 
b/libstdc++-v3/include/bits/cpp_type_traits.h
index 4312f32a4e0..cadd5ca4fde 100644
--- a/libstdc++-v3/include/bits/cpp_type_traits.h
+++ b/libstdc++-v3/include/bits/cpp_type_traits.h
@@ -315,6 +315,12 @@ __INT_N(__GLIBCXX_TYPE_INT_N_3)
   typedef __true_type __type;
 };
 
+#ifndef __SSE2__
+#pragma GCC push_options
+#pragma GCC target("sse2")
+#define __DISABLE_STDCPP_SSE2__
+#endif
+
 #ifdef __STDCPP_FLOAT16_T__
   template<>
 struct __is_floating<_Float16>
@@ -324,36 +330,41 @@ __INT_N(__GLIBCXX_TYPE_INT_N_3)
 };
 #endif
 
-#ifdef __STDCPP_FLOAT32_T__

[PATCH] Canonicalize vec_merge when mask is constant.

2023-04-19 Thread liuhongt via Gcc-patches

Use swap_commutative_operands_p for canonicalization. When both operands
have the same precedence value, the first bit in the mask should
select the first operand.

The canonicalization should help backends with pattern matching, e.g. the x86
backend has lots of vec_merge patterns; combine can create either form
of vec_merge (mask or inverted mask), so the backend needs to add 2
patterns to match exactly 1 instruction. The canonicalization can
simplify 2 patterns to 1.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}, aarch64-linux-gnu.
Ok for trunk?

gcc/ChangeLog:

* combine.cc (maybe_swap_commutative_operands): Canonicalize
vec_merge when mask is constant.
---
 gcc/combine.cc | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/gcc/combine.cc b/gcc/combine.cc
index 0106092e456..5aa0ec5c45a 100644
--- a/gcc/combine.cc
+++ b/gcc/combine.cc
@@ -5631,6 +5631,28 @@ maybe_swap_commutative_operands (rtx x)
   SUBST (XEXP (x, 0), XEXP (x, 1));
   SUBST (XEXP (x, 1), temp);
 }
+
+  unsigned n_elts = 0;
+  if (GET_CODE (x) == VEC_MERGE
+  && CONST_INT_P (XEXP (x, 2))
+  && GET_MODE_NUNITS (GET_MODE (x)).is_constant (_elts)
+  && (swap_commutative_operands_p (XEXP (x, 0), XEXP (x, 1))
+ /* Two operands have same precedence, then
+first bit of mask select first operand.  */
+ || (!swap_commutative_operands_p (XEXP (x, 1), XEXP (x, 0))
+ && !(UINTVAL (XEXP (x, 2)) & 1
+{
+  rtx temp = XEXP (x, 0);
+  unsigned HOST_WIDE_INT sel = UINTVAL (XEXP (x, 2));
+  unsigned HOST_WIDE_INT mask = HOST_WIDE_INT_1U;
+  if (n_elts == HOST_BITS_PER_WIDE_INT)
+   mask = -1;
+  else
+   mask = (HOST_WIDE_INT_1U << n_elts) - 1;
+  SUBST (XEXP (x, 0), XEXP (x, 1));
+  SUBST (XEXP (x, 1), temp);
+  SUBST (XEXP (x, 2), GEN_INT (~sel & mask));
+}
 }
 
 /* Simplify X, a piece of RTL.  We just operate on the expression at the
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH 2/2] Adjust testcases after better RA decision.

2023-04-19 Thread liuhongt via Gcc-patches

After the optimization for RA, a memory operand is no longer propagated into
multiple (>1) instructions, which makes the testcases stop generating vxorps,
since the memory is loaded into the dest and the dest is never unused now.

So rewrite testcases to make the codegen more stable.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx2-dest-false-dep-for-glc.c: Rewrite
testcase to make the codegen more stable.
* gcc.target/i386/avx512dq-dest-false-dep-for-glc.c: Ditto
* gcc.target/i386/avx512f-dest-false-dep-for-glc.c: Ditto.
* gcc.target/i386/avx512fp16-dest-false-dep-for-glc.c: Ditto.
* gcc.target/i386/avx512vl-dest-false-dep-for-glc.c: Ditto.
---
 .../i386/avx2-dest-false-dep-for-glc.c|  28 +-
 .../i386/avx512dq-dest-false-dep-for-glc.c| 257 ++---
 .../i386/avx512f-dest-false-dep-for-glc.c | 348 ++
 .../i386/avx512fp16-dest-false-dep-for-glc.c  | 118 --
 .../i386/avx512vl-dest-false-dep-for-glc.c| 243 +---
 gcc/testsuite/gcc.target/i386/pr108707.c  |   2 +-
 6 files changed, 791 insertions(+), 205 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx2-dest-false-dep-for-glc.c 
b/gcc/testsuite/gcc.target/i386/avx2-dest-false-dep-for-glc.c
index fe331fe5e2c..e260888627f 100644
--- a/gcc/testsuite/gcc.target/i386/avx2-dest-false-dep-for-glc.c
+++ b/gcc/testsuite/gcc.target/i386/avx2-dest-false-dep-for-glc.c
@@ -5,16 +5,28 @@
 
 #include 
 
-extern __m256i i1, i2, i3, i4;
-extern __m256d d1, d2;
-extern __m256 f1, f2;
+__m256i
+foo0 (__m256i i3, __m256i i1, __m256i i2)
+{
+  return _mm256_permutevar8x32_epi32 (i1, i2);
+}
+
+__m256i
+foo1 (__m256i i2, __m256i i1)
+{
+  return _mm256_permute4x64_epi64 (i1, 12);
+}
+
+__m256d
+foo2 (__m256d d2, __m256d d1)
+{
+  return _mm256_permute4x64_pd (d1, 12);
+}
 
-void vperm_test (void)
+__m256
+foo3 (__m256 f2, __m256i i2, __m256 f1)
 {
-  i3 = _mm256_permutevar8x32_epi32 (i1, i2);
-  i4 = _mm256_permute4x64_epi64 (i1, 12);
-  d2 = _mm256_permute4x64_pd (d1, 12);
-  f2 = _mm256_permutevar8x32_ps (f1, i2);
+  return _mm256_permutevar8x32_ps (f1, i2);
 }
 
 /* { dg-final { scan-assembler-times "vxorps" 4 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dep-for-glc.c 
b/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dep-for-glc.c
index b334b88194b..b615b8d 100644
--- a/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dep-for-glc.c
+++ b/gcc/testsuite/gcc.target/i386/avx512dq-dest-false-dep-for-glc.c
@@ -13,56 +13,219 @@ extern __m512 f1, f11;
 extern __m256 f2;
 extern __m128 f3, f33;
 
-__mmask32 m32;
 __mmask16 m16;
 __mmask8 m8;
 
-void mullo_test (void)
-{
-  i1 = _mm512_mullo_epi64 (i1, i1);
-  i1 = _mm512_mask_mullo_epi64 (i1, m8, i1, i1);
-  i1 = _mm512_maskz_mullo_epi64 (m8, i1, i1);
-  i2 = _mm256_mullo_epi64 (i2, i2);
-  i2 = _mm256_mask_mullo_epi64 (i2, m8, i2, i2);
-  i2 = _mm256_maskz_mullo_epi64 (m8, i2, i2);
-  i3 = _mm_mullo_epi64 (i3, i3);
-  i3 = _mm_mask_mullo_epi64 (i3, m8, i3, i3);
-  i3 = _mm_maskz_mullo_epi64 (m8, i3, i3);
-}
-
-void range_test (void)
-{
-  d1 = _mm512_range_pd (d1, d11, 15);
-  d11 = _mm512_range_round_pd (d11, d1, 15, 8);
-  d1 = _mm512_mask_range_pd (d1, m8, d11, d11, 15);
-  d11 = _mm512_mask_range_round_pd (d11, m8, d1, d1, 15, 8);
-  d1 = _mm512_maskz_range_pd (m8, d11, d11, 15);
-  d11 = _mm512_maskz_range_round_pd (m8, d1, d1, 15, 8);
-  d2 = _mm256_range_pd (d2, d2, 15);
-  d2 = _mm256_mask_range_pd (d2, m8, d2, d2, 15);
-  d2 = _mm256_maskz_range_pd (m8, d2, d2, 15);
-  d3 = _mm_range_pd (d3, d3, 15);
-  d3 = _mm_mask_range_pd (d3, m8, d3, d3, 15);
-  d3 = _mm_maskz_range_pd (m8, d3, d3, 15);
-  d33 = _mm_range_sd (d33, d33, 15);
-  d33 = _mm_mask_range_sd (d33, m8, d33, d33, 15);
-  d33 = _mm_maskz_range_sd (m8, d33, d33, 15);
-
-  f1 = _mm512_range_ps (f1, f11, 15);
-  f11 = _mm512_range_round_ps (f11, f1, 15, 8);
-  f1 = _mm512_mask_range_ps (f1, m16, f11, f11, 15);
-  f11 = _mm512_mask_range_round_ps (f11, m16, f1, f1, 15, 8);
-  f1 = _mm512_maskz_range_ps (m16, f11, f11, 15);
-  f11 = _mm512_maskz_range_round_ps (m16, f1, f1, 15, 8);
-  f2 = _mm256_range_ps (f2, f2, 15);
-  f2 = _mm256_mask_range_ps (f2, m8, f2, f2, 15);
-  f2 = _mm256_maskz_range_ps (m8, f2, f2, 15);
-  f3 = _mm_range_ps (f3, f3, 15);
-  f3 = _mm_mask_range_ps (f3, m8, f3, f3, 15);
-  f3 = _mm_maskz_range_ps (m8, f3, f3, 15);
-  f33 = _mm_range_ss (f33, f33, 15);
-  f33 = _mm_mask_range_ss (f33, m8, f33, f33, 15);
-  f33 = _mm_maskz_range_ss (m8, f33, f33, 15);
+#define MULLO(func, type)  \
+  type \
+  mullo##type (type i2, type i1)   \
+  {\
+return func (i1, i1);  \
+  }
+
+#define MULLO_MASK(func, type) \
+  type \
+  mullo_mask##type (type i2, type i1)  \
+  {\
+return func (i1, m8, i1,

[PATCH 1/2] Use NO_REGS in cost calculation when the preferred register class are not known yet.

2023-04-19 Thread liuhongt via Gcc-patches

1547  /* If this insn loads a parameter from its stack slot, then it
1548 represents a savings, rather than a cost, if the parameter is
1549 stored in memory.  Record this fact.
1550
1551 Similarly if we're loading other constants from memory (constant
1552 pool, TOC references, small data areas, etc) and this is the only
1553 assignment to the destination pseudo.

At that time, preferred regclass is unknown, and GENERAL_REGS is used to
record memory move cost, but it's not accurate especially for large vector
modes, i.e. 512-bit vector in x86 which would most probably allocate with
SSE_REGS instead of GENERAL_REGS. Using GENERAL_REGS here will overestimate
the cost of this load and make RA propagate the memory operand into many
consuming instructions, which causes worse performance.

Fortunately, NO_REGS is used to record the best scenario, so the patch uses
NO_REGS instead of GENERAL_REGS here, it could help RA in PR108707.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
and aarch64-linux-gnu.
Ok for trunk?

gcc/ChangeLog:

PR rtl-optimization/108707
* ira-costs.cc (scan_one_insn): Use NO_REGS instead of
GENERAL_REGS when preferred reg_class is not known.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr108707.c: New test.
---
 gcc/ira-costs.cc |  5 -
 gcc/testsuite/gcc.target/i386/pr108707.c | 16 
 2 files changed, 20 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr108707.c

diff --git a/gcc/ira-costs.cc b/gcc/ira-costs.cc
index c0fdef807dd..d2a801ab9b0 100644
--- a/gcc/ira-costs.cc
+++ b/gcc/ira-costs.cc
@@ -1572,7 +1572,10 @@ scan_one_insn (rtx_insn *insn)
   && (! ira_use_lra_p || ! pic_offset_table_rtx
  || ! contains_symbol_ref_p (XEXP (note, 0
 {
-  enum reg_class cl = GENERAL_REGS;
+  /* Costs for NO_REGS are used in cost calculation on the
+1st pass when the preferred register classes are not
+known yet.  In this case we take the best scenario.  */
+  enum reg_class cl = NO_REGS;
   rtx reg = SET_DEST (set);
   int num = COST_INDEX (REGNO (reg));
 
diff --git a/gcc/testsuite/gcc.target/i386/pr108707.c 
b/gcc/testsuite/gcc.target/i386/pr108707.c
new file mode 100644
index 000..bc1a476f551
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr108707.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512f -O2" } */
+/* { dg-final { scan-assembler-not {(?n)vfmadd[1-3]*ps.*\(} } } */
+/* { dg-final { scan-assembler-times {(?n)vfmadd[1-3]*ps[ \t]*} 3 } } */
+
+#include
+
+void
+foo (__m512 pv, __m512 a, __m512 b, __m512 c,
+ __m512* pdest, __m512* p1)
+{
+  __m512 t = *p1;
+pdest[0] = _mm512_fmadd_ps (t, pv, a);
+pdest[1] = _mm512_fmadd_ps (t, pv, b);
+pdest[2] = _mm512_fmadd_ps (t, pv, c);
+}
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] [i386] Support type _Float16/__bf16 independent of SSE2.

2023-04-19 Thread liuhongt via Gcc-patches

-Jakub's comments--
That said, these fundamental types whose presence/absence depends on ISA flags
are quite problematic IMHO, as they are incompatible with the target
attribute/pragmas. Whether they are available or not available depends on
whether in this case SSE2 is enabled during compiler initialization (aka after
parsing command line options) and then they are available or unavailable to
everything else based on that.
-comments end--

Enable _Float16 and __bf16 all the time but issue errors when the
types are used in conversion, unary operation, binary operation,
parameter passing or value return when TARGET_SSE2 is not available.

Also undef macros which are used by libgcc/libstdc++ to check the
backend support of the _Float16/__bf16 types when TARGET_SSE2 is not
available.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Also successfully cross-built target i686-linux-gnu.
Ok for trunk?

gcc/ChangeLog:

PR target/109054
* config/i386/i386-builtins.cc
(ix86_register_float16_builtin_type): Remove TARGET_SSE2.
(ix86_register_bf16_builtin_type): Ditto.
* config/i386/i386-c.cc (ix86_target_macros): When TARGET_SSE2
isn't available, undef the macros which are used to check the
backend support of the _Float16/__bf16 types when building
libstdc++ and libgcc.
* config/i386/i386.cc (construct_container): Issue errors for
HFmode/BFmode when TARGET_SSE2 is not available.
(function_value_32): Ditto.
(ix86_scalar_mode_supported_p): Remove TARGET_SSE2 for HFmode/BFmode.
(ix86_libgcc_floating_mode_supported_p): Ditto.
(ix86_emit_support_tinfos): Adjust codes.
(ix86_invalid_conversion): New function.
(ix86_invalid_unary_op): Ditto.
(ix86_invalid_binary_op): Ditto.
(TARGET_INVALID_CONVERSION): Defined.
(TARGET_INVALID_UNARY_OP): Defined.
(TARGET_INVALID_BINARY_OP): Defined.
* config/i386/immintrin.h: Remove #ifdef __SSE2__ for fp16/bf16
related intrinsics header files.
* config/i386/i386.h (VALID_SSE2_TYPE_MODE): New macro.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr109054.c: New test.
* gcc.target/i386/sse2-bfloat16-1.c: Adjust error info.
* gcc.target/i386/sse2-float16-1.c: Ditto.
* gcc.target/i386/sse2-float16-4.c: New test.
* gcc.target/i386/sse2-float16-5.c: New test.
* g++.target/i386/float16-1.C: Adjust error info.
---
 gcc/config/i386/i386-builtins.cc  |   4 +-
 gcc/config/i386/i386-c.cc |  37 ++
 gcc/config/i386/i386.cc   | 117 --
 gcc/config/i386/i386.h|   4 +
 gcc/config/i386/immintrin.h   |   4 -
 gcc/testsuite/g++.target/i386/float16-1.C |   8 +-
 gcc/testsuite/gcc.target/i386/pr109054.c  |   6 +
 .../gcc.target/i386/sse2-bfloat16-1.c |   8 +-
 .../gcc.target/i386/sse2-float16-1.c  |   8 +-
 .../gcc.target/i386/sse2-float16-4.c  |  25 
 .../gcc.target/i386/sse2-float16-5.c  |  24 
 11 files changed, 217 insertions(+), 28 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109054.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-float16-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-float16-5.c

diff --git a/gcc/config/i386/i386-builtins.cc b/gcc/config/i386/i386-builtins.cc
index fc0c82b156e..1cdabfd3a0a 100644
--- a/gcc/config/i386/i386-builtins.cc
+++ b/gcc/config/i386/i386-builtins.cc
@@ -1367,7 +1367,7 @@ ix86_register_float16_builtin_type (void)
   else
 ix86_float16_type_node = float16_type_node;
 
-  if (!maybe_get_identifier ("_Float16") && TARGET_SSE2)
+  if (!maybe_get_identifier ("_Float16"))
 lang_hooks.types.register_builtin_type (ix86_float16_type_node,
"_Float16");
 }
@@ -1385,7 +1385,7 @@ ix86_register_bf16_builtin_type (void)
   else
 ix86_bf16_type_node = bfloat16_type_node;
 
-  if (!maybe_get_identifier ("__bf16") && TARGET_SSE2)
+  if (!maybe_get_identifier ("__bf16"))
 lang_hooks.types.register_builtin_type (ix86_bf16_type_node, "__bf16");
 }
 
diff --git a/gcc/config/i386/i386-c.cc b/gcc/config/i386/i386-c.cc
index e7bd7cc706c..eb77d0af226 100644
--- a/gcc/config/i386/i386-c.cc
+++ b/gcc/config/i386/i386-c.cc
@@ -817,6 +817,43 @@ ix86_target_macros (void)
   if (!TARGET_80387)
 cpp_define (parse_in, "_SOFT_FLOAT");
 
+  /* HFmode/BFmode is supported without depending any isa
+ in scalar_mode_supported_p and libgcc_floating_mode_supported_p,
+ but according to psABI, they're really supported w/ SSE2 and above.
+ Since libstdc++ uses __STDCPP_FLOAT16_T__ and __STDCPP_BFLOAT16_T__
+ for backend support of the types, undef the macros to avoid
+ build failure, see PR109504.  */
+  if (!TARGET_SSE2)
+{
+  if (c_dialect_cxx ()
+ &&

[PATCH] Check hard_regno_mode_ok before setting lowest memory move cost for the mode with different reg classes.

2023-04-03 Thread liuhongt via Gcc-patches

There's a potential performance issue when the backend returns some
unreasonable value for a mode which can never be allocated with the
reg class.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk(or GCC14 stage1)?

gcc/ChangeLog:

PR rtl-optimization/109351
* ira.cc (setup_class_subset_and_memory_move_costs): Check
hard_regno_mode_ok before setting lowest memory move cost for
the mode with different reg classes.
---
 gcc/ira.cc | 4 
 1 file changed, 4 insertions(+)

diff --git a/gcc/ira.cc b/gcc/ira.cc
index 6c7f4901e4c..02dea5d49ee 100644
--- a/gcc/ira.cc
+++ b/gcc/ira.cc
@@ -588,6 +588,10 @@ setup_class_subset_and_memory_move_costs (void)
/* Costs for NO_REGS are used in cost calculation on the
   1st pass when the preferred register classes are not
   known yet.  In this case we take the best scenario.  */
+   if (!targetm.hard_regno_mode_ok (ira_class_hard_regs[cl][0],
+(machine_mode) mode))
+ continue;
+
if (ira_memory_move_cost[mode][NO_REGS][0]
> ira_memory_move_cost[mode][cl][0])
  ira_max_memory_move_cost[mode][NO_REGS][0]
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Document signbitm2.

2023-03-31 Thread liuhongt via Gcc-patches

Looking through all backends which define signbitm2:
1. When m is a scalar mode, the dest is SImode.
2. When m is a vector mode, the dest mode is the vector integer
mode that has the same size and number of elements as m.

Ok for trunk?

gcc/ChangeLog:

* doc/md.texi: Document signbitm2.
---
 gcc/doc/md.texi | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 8e3113599fd..edfa51e867a 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -6030,6 +6030,17 @@ floating-point mode.
 
 This pattern is not allowed to @code{FAIL}.
 
+@cindex @code{signbit@var{m}2} instruction pattern
+@item @samp{signbit@var{m}2}
+Store the sign bit of floating-point operand 1 in operand 0.
+@var{m} is either a scalar or vector mode.  When it is a scalar,
+operand 1 has mode @var{m} but operand 0 must have mode @code{SImode}.
+When @var{m} is a vector, operand 1 has the mode @var{m}.
+operand 0's mode should be an vector integer mode which has
+the same number of elements and the same size as mode @var{m}.
+
+This pattern is not allowed to @code{FAIL}.
+
 @cindex @code{significand@var{m}2} instruction pattern
 @item @samp{significand@var{m}2}
 Store the significand of floating-point operand 1 in operand 0.
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Adjust memory_move_cost for MASK_REGS when MODE_SIZE > 8.

2023-03-30 Thread liuhongt via Gcc-patches

RA sometimes will use the lowest cost of the mode across all the different regclasses
w/o checking whether it's hard_regno_mode_ok.
It's impossible to put modes whose size > 8 into MASK_REGS, so adjust the cost to
avoid a potential performance issue.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.cc (inline_memory_move_cost): Return 100
for MASK_REGS when MODE_SIZE > 8.
---
 gcc/config/i386/i386.cc | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 2cc8e9548a9..2581b800a06 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -19847,9 +19847,12 @@ inline_memory_move_cost (machine_mode mode, enum 
reg_class regclass, int in)
  index = 1;
  break;
/* DImode loads and stores assumed to cost the same as SImode.  */
-   default:
+   case 4:
+   case 8:
  index = 2;
  break;
+   default:
+ return 100;
}
 
   if (in == 2)
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH V2] Rename ufix_trunc/ufloat* patterns to fixuns_trunc/floatuns* to align with standard pattern name.

2023-03-30 Thread liuhongt via Gcc-patches

> > Just rename the instruction and fix all its call sites. The name of
> > the insn pattern is internal to the compiler and can be renamed at
> > will.
>
> Ideally, we should standardize all the names to a standard name, so
> e.g. ufix_  -> fixuns_ and ufloat -> floatuns.

Updated.

There's a typo in the standard pattern names for unsigned_{float,fix}:
they should be floatunsmn2/fixuns_truncmn2, not ufloatmn2/ufix_truncmn2
as in current trunk. The patch fixes the typo and also changes all those
ufix_trunc/ufloat patterns.

Also vcvttps2udq is available under AVX512VL, so it can be generated
directly instead of being emulated via vcvttps2dq.

gcc/ChangeLog:

PR target/85048
* config/i386/i386-builtin.def (BDESC): Adjust icode name from 
ufloat/ufix to floatuns/fixuns.
* config/i386/i386-expand.cc (ix86_expand_vector_convert_uns_vsivsf): 
Adjust comments.
* config/i386/sse.md 
(ufloat2):
Renamed to ..

(floatuns2):.. 
this.

(_ufix_notrunc):
Renamed to ..

(_fixuns_notrunc):
.. this.
(fix_truncv16sfv16si2):
Renamed to ..
(fix_truncv16sfv16si2):.. 
this.
(ufloat2): Renamed to ..
(floatuns2): .. this.
(ufloatv2siv2df2): Renamed to ..
(floatunsv2siv2df2): .. this.
(ufix_notrunc2):
Renamed to ..
(fixuns_notrunc2):
.. this.
(ufix_notruncv2dfv2si2): Renamed to ..
(fixuns_notruncv2dfv2si2):.. this.
(ufix_notruncv2dfv2si2_mask): Renamed to ..
(fixuns_notruncv2dfv2si2_mask): .. this.
(*ufix_notruncv2dfv2si2_mask_1): Renamed to ..
(*fixuns_notruncv2dfv2si2_mask_1): .. this.
(ufix_truncv2dfv2si2): Renamed to ..
(*fixuns_truncv2dfv2si2): .. this.
(ufix_truncv2dfv2si2_mask): Renamed to ..
(fixuns_truncv2dfv2si2_mask): .. this.
(*ufix_truncv2dfv2si2_mask_1): Renamed to ..
(*fixuns_truncv2dfv2si2_mask_1): .. this.
(ufix_truncv4dfv4si2): Renamed to ..
(fixuns_truncv4dfv4si2): .. this.
(ufix_notrunc2):
Renamed to ..
(fixuns_notrunc2):
.. this.
(ufix_trunc2): Renamed to ..
(fixuns_trunc2):
.. this.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr85048.C: New test.
---
 gcc/config/i386/i386-builtin.def| 40 +++---
 gcc/config/i386/i386-expand.cc  |  2 +-
 gcc/config/i386/sse.md  | 69 +++--
 gcc/testsuite/g++.target/i386/pr85048.C | 33 
 4 files changed, 84 insertions(+), 60 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/pr85048.C

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 17dfe40fac7..6dae6972d81 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -1384,7 +1384,7 @@ BDESC (OPTION_MASK_ISA_AVX512F, 0, 
CODE_FOR_avx512f_compressv8df_mask, "__builti
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_compressv16sf_mask, 
"__builtin_ia32_compresssf512_mask", IX86_BUILTIN_COMPRESSPS512, UNKNOWN, (int) 
V16SF_FTYPE_V16SF_V16SF_UHI)
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_floatv8siv8df2_mask, 
"__builtin_ia32_cvtdq2pd512_mask", IX86_BUILTIN_CVTDQ2PD512, UNKNOWN, (int) 
V8DF_FTYPE_V8SI_V8DF_UQI)
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_vcvtps2ph512_mask_sae,  
"__builtin_ia32_vcvtps2ph512_mask", IX86_BUILTIN_CVTPS2PH512, UNKNOWN, (int) 
V16HI_FTYPE_V16SF_INT_V16HI_UHI)
-BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_ufloatv8siv8df2_mask, 
"__builtin_ia32_cvtudq2pd512_mask", IX86_BUILTIN_CVTUDQ2PD512, UNKNOWN, (int) 
V8DF_FTYPE_V8SI_V8DF_UQI)
+BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_floatunsv8siv8df2_mask, 
"__builtin_ia32_cvtudq2pd512_mask", IX86_BUILTIN_CVTUDQ2PD512, UNKNOWN, (int) 
V8DF_FTYPE_V8SI_V8DF_UQI)
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_cvtusi2sd32, 
"__builtin_ia32_cvtusi2sd32", IX86_BUILTIN_CVTUSI2SD32, UNKNOWN, (int) 
V2DF_FTYPE_V2DF_UINT)
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_expandv8df_mask, 
"__builtin_ia32_expanddf512_mask", IX86_BUILTIN_EXPANDPD512, UNKNOWN, (int) 
V8DF_FTYPE_V8DF_V8DF_UQI)
 BDESC (OPTION_MASK_ISA_AVX512F, 0, CODE_FOR_avx512f_expandv8df_maskz, 
"__builtin_ia32_expanddf512_maskz", IX86_BUILTIN_EXPANDPD512Z, UNKNOWN, (int) 
V8DF_FTYPE_V8DF_V8DF_UQI)
@@ -1719,32 +1719,32 @@ BDESC (OPTION_MASK_ISA_AVX512DQ | 
OPTION_MASK_ISA_AVX512VL, 0, CODE_FOR_fixuns_t
 BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, 
CODE_FOR_fixuns_truncv2dfv2di2_mask, "__builtin_ia32_cvttpd2uqq128_mask", 
IX86_BUILTIN_CVTTPD2UQQ128, UNKNOWN, (int) V2DI_FTYPE_V2DF_V2DI_UQI)
 BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, 
CODE_FOR_fix_notruncv4dfv4di2_mask, "__builtin_ia32_cvtpd2qq256_mask", 
IX86_BUILTIN_CVTPD2QQ256, UNKNOWN, (int) V4DI_FTYPE_V4DF_V4DI_UQI)
 BDESC (OPTION_MASK_ISA_AVX512DQ | OPTION_MASK_ISA_AVX512VL, 0, 
CODE_FOR_fix_notruncv2dfv2di2_mask,

[PATCH] Support vector conversion for AVX512 vcvtudq2pd/vcvttps2udq/vcvttpd2udq.

2023-03-29 Thread liuhongt via Gcc-patches

There's a typo in the standard pattern names for unsigned_{float,fix}:
they should be floatunsmn2/fixuns_truncmn2, not ufloatmn2/ufix_truncmn2
as in current trunk. The patch fixes the typo.

Also vcvttps2udq is available under AVX512VL, so it can be generated
directly instead of being emulated via vcvttps2dq.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ok for GCC14 stage1 (or maybe for trunk)?

gcc/ChangeLog:

PR target/85048
* config/i386/sse.md (floatuns2):
Generate vcvtudq2ps under AVX512VL.
(fixuns_truncv4dfv4si2): New expander.
(floatuns2): New expander.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr85048.C: New test.
---
 gcc/config/i386/sse.md  | 18 --
 gcc/testsuite/g++.target/i386/pr85048.C | 33 +
 2 files changed, 49 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/pr85048.C

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 172ec3bea4f..9c2bd468c65 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -8014,8 +8014,9 @@ (define_expand "fixuns_trunc2"
(match_operand:VF1 1 "register_operand")]
   "TARGET_SSE2"
 {
-  if (mode == V16SFmode)
-emit_insn (gen_ufix_truncv16sfv16si2 (operands[0],
+  /* AVX512 support vcvttps2udq for all 128/256/512-bit vectors.  */
+  if (mode == V16SFmode || TARGET_AVX512VL)
+emit_insn (gen_ufix_trunc2 (operands[0],
  operands[1]));
   else
 {
@@ -8413,6 +8414,12 @@ (define_insn "*floatv2div2sf2_mask_1"
(set_attr "prefix" "evex")
(set_attr "mode" "V4SF")])
 
+(define_expand "floatuns2"
+  [(set (match_operand:VF2_512_256VL 0 "register_operand")
+   (unsigned_float:VF2_512_256VL
+ (match_operand: 1 "nonimmediate_operand")))]
+   "TARGET_AVX512F")
+
 (define_insn "ufloat2"
   [(set (match_operand:VF2_512_256VL 0 "register_operand" "=v")
(unsigned_float:VF2_512_256VL
@@ -8694,6 +8701,13 @@ (define_insn "fix_truncv4dfv4si2"
(set_attr "prefix" "maybe_evex")
(set_attr "mode" "OI")])
 
+
+/* The standard pattern name is fixuns_truncmn2.  */
+(define_expand "fixuns_truncv4dfv4si2"
+  [(set (match_operand:V4SI 0 "register_operand")
+   (unsigned_fix:V4SI (match_operand:V4DF 1 "nonimmediate_operand")))]
+  "TARGET_AVX512VL && TARGET_AVX512F")
+
 (define_insn "ufix_truncv4dfv4si2"
   [(set (match_operand:V4SI 0 "register_operand" "=v")
(unsigned_fix:V4SI (match_operand:V4DF 1 "nonimmediate_operand" "vm")))]
diff --git a/gcc/testsuite/g++.target/i386/pr85048.C 
b/gcc/testsuite/g++.target/i386/pr85048.C
new file mode 100644
index 000..52973c18ebd
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr85048.C
@@ -0,0 +1,33 @@
+/* PR target/85048 */
+/* { dg-do compile }  */
+/* { dg-options "-std=c++17 -O2 -mavx512vl -mavx512dq 
-mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler-times {(?n)vcvtudq2pd[ \t]+} 2 } } */
+/* { dg-final { scan-assembler-times {(?n)vcvttps2udq[ \t]+} 2 } } */
+/* { dg-final { scan-assembler-times {(?n)vcvttpd2udqy?[ \t]+} 1 } } */
+
+#include 
+
+template 
+using V [[gnu::vector_size(Size)]] = T;
+
+template  V cvt4(V x) {
+return V{To(x[0]), To(x[1]), To(x[2]), To(x[3])};
+}
+template  V cvt8(V x) {
+return V{
+To(x[0]), To(x[1]), To(x[2]), To(x[3]),
+To(x[4]), To(x[5]), To(x[6]), To(x[7])
+};
+}
+
+#define _(name, from, to, size) \
+auto name(V x) { return cvt##size(x); }
+// integral -> double
+_(vcvtudq2pd, uint32_t, double, 4)
+_(vcvtudq2pd, uint32_t, double, 8)
+
+_( cvttps2udq, float, uint32_t,  4)
+_(vcvttps2udq, float, uint32_t,  8)
+
+// double -> integral
+_(vcvttpd2udq, double, uint32_t, 4)
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Generate vpblendd instead of vpblendw for V4SI under AVX2.

2023-03-29 Thread liuhongt via Gcc-patches

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ok for GCC14 stage-1(or maybe trunk)?

gcc/ChangeLog:

* config/i386/i386-expand.cc (expand_vec_perm_blend): Generate
vpblendd instead of vpblendw for V4SI under avx2.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr88828-0.c: Adjust testcase.
---
 gcc/config/i386/i386-expand.cc| 18 ++
 gcc/testsuite/gcc.target/i386/pr88828-0.c |  2 +-
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index c1300dc4e26..1c436262ee5 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -19069,10 +19069,20 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
   goto do_subreg;
 
 case E_V4SImode:
-  for (i = 0; i < 4; ++i)
-   mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
-  vmode = V8HImode;
-  goto do_subreg;
+  if (TARGET_AVX2)
+   {
+ /* Use vpblendd instead of vpblendw.  */
+ for (i = 0; i < nelt; ++i)
+   mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
+ break;
+   }
+  else
+   {
+ for (i = 0; i < 4; ++i)
+   mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
+ vmode = V8HImode;
+ goto do_subreg;
+   }
 
 case E_V16QImode:
   /* See if bytes move in pairs so we can use pblendw with
diff --git a/gcc/testsuite/gcc.target/i386/pr88828-0.c 
b/gcc/testsuite/gcc.target/i386/pr88828-0.c
index 3ddb2d13526..441c441b51d 100644
--- a/gcc/testsuite/gcc.target/i386/pr88828-0.c
+++ b/gcc/testsuite/gcc.target/i386/pr88828-0.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -msse4.2" } */
+/* { dg-options "-O2 -msse4.2 -mno-avx2" } */
 
 typedef int v4si __attribute__((vector_size(16)));
 typedef float v4sf __attribute__((vector_size(16)));
-- 
2.39.1.388.g2fc9e9ca3c

[PATCH] Remove TARGET_GEN_MEMSET_SCRATCH_RTX since it's not used anymore.

2023-03-21 Thread liuhongt via Gcc-patches

The target hook is only used by i386, and the current definition is
same as default gen_reg_rtx. So there's no need for this target hook.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk(or GCC14)?

gcc/ChangeLog:

* builtins.cc (builtin_memset_read_str): Replace
targetm.gen_memset_scratch_rtx with gen_reg_rtx.
(builtin_memset_gen_str): Ditto.
* config/i386/i386-expand.cc
(ix86_convert_const_wide_int_to_broadcast): Replace
ix86_gen_scratch_sse_rtx with gen_reg_rtx.
(ix86_expand_vector_move): Ditto.
* config/i386/i386-protos.h (ix86_gen_scratch_sse_rtx):
Removed.
* config/i386/i386.cc (ix86_gen_scratch_sse_rtx): Removed.
(TARGET_GEN_MEMSET_SCRATCH_RTX): Removed.
* doc/tm.texi: Remove TARGET_GEN_MEMSET_SCRATCH_RTX.
* doc/tm.texi.in: Ditto.
* target.def: Ditto.
---
 gcc/builtins.cc|  4 ++--
 gcc/config/i386/i386-expand.cc |  6 +++---
 gcc/config/i386/i386-protos.h  |  2 --
 gcc/config/i386/i386.cc| 12 
 gcc/doc/tm.texi|  7 ---
 gcc/doc/tm.texi.in |  2 --
 gcc/target.def |  9 -
 7 files changed, 5 insertions(+), 37 deletions(-)

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 90246e214d6..8026e2001b7 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -4212,7 +4212,7 @@ builtin_memset_read_str (void *data, void *prev,
return const_vec;
 
   /* Use the move expander with CONST_VECTOR.  */
-  target = targetm.gen_memset_scratch_rtx (mode);
+  target = gen_reg_rtx (mode);
   emit_move_insn (target, const_vec);
   return target;
 }
@@ -4256,7 +4256,7 @@ builtin_memset_gen_str (void *data, void *prev,
 the memset expander.  */
   insn_code icode = optab_handler (vec_duplicate_optab, mode);
 
-  target = targetm.gen_memset_scratch_rtx (mode);
+  target = gen_reg_rtx (mode);
   class expand_operand ops[2];
   create_output_operand ([0], target, mode);
   create_input_operand ([1], (rtx) data, QImode);
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index c1300dc4e26..1e3ce4b7c3f 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -338,7 +338,7 @@ ix86_convert_const_wide_int_to_broadcast (machine_mode 
mode, rtx op)
   machine_mode vector_mode;
   if (!mode_for_vector (broadcast_mode, nunits).exists (_mode))
 gcc_unreachable ();
-  rtx target = ix86_gen_scratch_sse_rtx (vector_mode);
+  rtx target = gen_reg_rtx (vector_mode);
   bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
   target,
   GEN_INT (val_broadcast));
@@ -686,7 +686,7 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
   if (!register_operand (op0, mode)
  && !register_operand (op1, mode))
{
- rtx scratch = ix86_gen_scratch_sse_rtx (mode);
+ rtx scratch = gen_reg_rtx (mode);
  emit_move_insn (scratch, op1);
  op1 = scratch;
}
@@ -728,7 +728,7 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
   && !register_operand (op0, mode)
   && !register_operand (op1, mode))
 {
-  rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0));
+  rtx tmp = gen_reg_rtx (GET_MODE (op0));
   emit_move_insn (tmp, op1);
   emit_move_insn (op0, tmp);
   return;
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index bfb2198265a..71ae95ffef7 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -50,8 +50,6 @@ extern void ix86_reset_previous_fndecl (void);
 
 extern bool ix86_using_red_zone (void);
 
-extern rtx ix86_gen_scratch_sse_rtx (machine_mode);
-
 extern unsigned int ix86_regmode_natural_size (machine_mode);
 extern bool ix86_check_builtin_isa_match (unsigned int fcode);
 #ifdef RTX_CODE
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 5d0e4739a84..6a8734c2346 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24197,15 +24197,6 @@ ix86_optab_supported_p (int op, machine_mode mode1, 
machine_mode,
 }
 }
 
-/* Implement the TARGET_GEN_MEMSET_SCRATCH_RTX hook.  Return a scratch
-   register in MODE for vector load and store.  */
-
-rtx
-ix86_gen_scratch_sse_rtx (machine_mode mode)
-{
-  return gen_reg_rtx (mode);
-}
-
 /* Address space support.
 
This is not "far pointers" in the 16-bit sense, but an easy way
@@ -25253,9 +25244,6 @@ static bool ix86_libc_has_fast_function (int fcode 
ATTRIBUTE_UNUSED)
 #undef TARGET_LIBC_HAS_FAST_FUNCTION
 #define TARGET_LIBC_HAS_FAST_FUNCTION ix86_libc_has_fast_function
 
-#undef TARGET_GEN_MEMSET_SCRATCH_RTX
-#define TARGET_GEN_MEMSET_SCRATCH_RTX ix86_gen_scratch_sse_rtx
-
 #if CHECKING_P
 #undef TARGET_RUN_TARGET_SELFTESTS
 #define TARGET_RUN_TARGET_SELFTESTS

[PATCH] [vect] Don't peel nonlinear iv(mult or shift) for epilog when vf is not constant.

2023-02-01 Thread liuhongt via Gcc-patches

Normally when vf is not constant, it will be prevented by
vectorizable_nonlinear_inductions, but for this case, it failed going
into

if (STMT_VINFO_RELEVANT_P (stmt_info))
  {
need_to_vectorize = true;
if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
   && ! PURE_SLP_STMT (stmt_info))
  ok = vectorizable_induction (loop_vinfo,
   stmt_info, NULL, NULL,
   _vec);

since the iv is never used outside of the loop and will be DCE'd later, so the
vectorizer doesn't bother checking whether it's vectorizable. That's
fine, but it hits the gcc_assert in vect_can_peel_nonlinear_iv_p when vf is not
constant. One solution is to ignore the nonlinear iv peeling if it's
!STMT_VINFO_RELEVANT_P (stmt_info), just like the code above; the other
solution is to return false earlier in
vect_can_peel_nonlinear_iv_p when vf is not constant. The patch chooses
the second in case there are other cases using vect_can_advance_ivs_p, which
calls vect_can_peel_nonlinear_iv_p.

Also remove vect_can_peel_nonlinear_iv_p from vectorizable_nonlinear_inductions.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and 
aarch64-linux-gnu{-m32,}
Ok for trunk?

gcc/ChangeLog:

PR tree-optimization/108601
* tree-vectorizer.h (vect_can_peel_nonlinear_iv_p): Remove declare.
* tree-vect-loop.cc
(vectorizable_nonlinear_induction): Remove
vect_can_peel_nonlinear_iv_p.
(vect_can_peel_nonlinear_iv_p): Don't peel
nonlinear iv(mult or shift) for epilog when vf is not
constant and moved the definition to ..
* tree-vect-loop-manip.cc (vect_can_peel_nonlinear_iv_p):
.. Here.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/pr108601.c: New test.
---
 gcc/testsuite/gcc.target/aarch64/pr108601.c | 11 +
 gcc/tree-vect-loop-manip.cc | 44 
 gcc/tree-vect-loop.cc   | 46 -
 gcc/tree-vectorizer.h   |  3 --
 4 files changed, 55 insertions(+), 49 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr108601.c

diff --git a/gcc/testsuite/gcc.target/aarch64/pr108601.c 
b/gcc/testsuite/gcc.target/aarch64/pr108601.c
new file mode 100644
index 000..deb8b3061d8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr108601.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fprofile-generate -mcpu=neoverse-v1" } */
+
+int
+foo() {
+  int flag = 1;
+  for (; flag <= 1 << 21; flag <<= 1)
+;
+  return 0;
+}
+
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index b5c5f859144..c04fcf40c44 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -1390,6 +1390,50 @@ iv_phi_p (stmt_vec_info stmt_info)
   return true;
 }
 
+/* Return true if vectorizer can peel for nonlinear iv.  */
+static bool
+vect_can_peel_nonlinear_iv_p (loop_vec_info loop_vinfo,
+ enum vect_induction_op_type induction_type)
+{
+  tree niters_skip;
+  /* Init_expr will be update by vect_update_ivs_after_vectorizer,
+ if niters or vf is unkown:
+ For shift, when shift mount >= precision, there would be UD.
+ For mult, don't known how to generate
+ init_expr * pow (step, niters) for variable niters.
+ For neg, it should be ok, since niters of vectorized main loop
+ will always be multiple of 2.  */
+  if ((!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ())
+  && induction_type != vect_step_op_neg)
+{
+  if (dump_enabled_p ())
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+"Peeling for epilogue is not supported"
+" for nonlinear induction except neg"
+" when iteration count is unknown.\n");
+  return false;
+}
+
+  /* Also doens't support peel for neg when niter is variable.
+ ??? generate something like niter_expr & 1 ? init_expr : -init_expr?  */
+  niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+  if ((niters_skip != NULL_TREE
+   && TREE_CODE (niters_skip) != INTEGER_CST)
+  || (!vect_use_loop_mask_for_alignment_p (loop_vinfo)
+ && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0))
+{
+  if (dump_enabled_p ())
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+"Peeling for alignement is not supported"
+" for nonlinear induction when niters_skip"
+" is not constant.\n");
+  return false;
+}
+
+  return true;
+}
+
 /* Function vect_can_advance_ivs_p
 
In case the number of iterations that LOOP iterates is unknown at compile
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index f0801c23671..01b60a8de33 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -8808,49 +8808,6 @@

[PATCH] Change AVX512FP16 to AVX512-FP16 which is official name.

2023-01-28 Thread liuhongt via Gcc-patches

Ready to push to trunk.

---
 htdocs/gcc-12/changes.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/htdocs/gcc-12/changes.html b/htdocs/gcc-12/changes.html
index 30fa4d6e..49055ffe 100644
--- a/htdocs/gcc-12/changes.html
+++ b/htdocs/gcc-12/changes.html
@@ -754,7 +754,7 @@ function Multiply (S1, S2 : Sign) return Sign is
 IA-32/x86-64
 
   New ISA extension support for Intel AVX512-FP16 was added.
-  AVX512FP16 intrinsics are available via the -mavx512fp16
+  AVX512-FP16 intrinsics are available via the -mavx512fp16
   compiler switch.
   
   For both C and C++ the _Float16 type is supported on
-- 
2.31.1

[PATCH] Change AVX512FP16 to AVX512-FP16 in the document.

2023-01-28 Thread liuhongt via Gcc-patches

The official name is AVX512-FP16.

Ready to push to trunk.

gcc/ChangeLog:

* config/i386/i386.opt: Change AVX512FP16 to AVX512-FP16.
* doc/invoke.texi: Ditto.
---
 gcc/config/i386/i386.opt | 2 +-
 gcc/doc/invoke.texi  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index feeb7dee9cc..7d57f617d65 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1209,7 +1209,7 @@ Support MWAIT and MONITOR built-in functions and code 
generation.
 
 mavx512fp16
 Target Mask(ISA2_AVX512FP16) Var(ix86_isa_flags2) Save
-Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F and 
AVX512FP16 built-in functions and code generation.
+Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F and 
AVX512-FP16 built-in functions and code generation.
 
 mdirect-extern-access
 Target Var(ix86_direct_extern_access) Init(1)
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index a371cd91ef8..3d059467690 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -32336,7 +32336,7 @@ AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, 
AVX512VL, AVX512BW, AVX512DQ,
 AVX512CD, PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, 
AVX512VBMI2,
 VPCLMULQDQ, AVX512BITALG, RDPID, AVX512VPOPCNTDQ, PCONFIG, WBNOINVD, CLWB,
 MOVDIRI, MOVDIR64B, ENQCMD, CLDEMOTE, PTWRITE, WAITPKG, SERIALIZE, TSXLDTRK,
-UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512FP16 and AVX512BF16
+UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512-FP16 and AVX512BF16
 instruction set support.
 
 @item alderlake
@@ -32363,7 +32363,7 @@ AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, 
AVX512VL, AVX512BW, AVX512DQ,
 AVX512CD, PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, 
AVX512VBMI2,
 VPCLMULQDQ, AVX512BITALG, RDPID, AVX512VPOPCNTDQ, PCONFIG, WBNOINVD, CLWB,
 MOVDIRI, MOVDIR64B, AVX512VP2INTERSECT, ENQCMD, CLDEMOTE, PTWRITE, WAITPKG,
-SERIALIZE, TSXLDTRK, UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512FP16,
+SERIALIZE, TSXLDTRK, UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, 
AVX512-FP16,
 AVX512BF16, AMX-FP16 and PREFETCHI instruction set support.
 
 @item k6
@@ -33229,7 +33229,7 @@ WBNOINVD, FMA4, PREFETCHW, RDPID, PREFETCHWT1, RDSEED, 
SGX, XOP, LWP,
 XSAVEOPT, XSAVEC, XSAVES, RTM, HLE, TBM, MWAITX, CLZERO, PKU, AVX512VBMI2,
 GFNI, VAES, WAITPKG, VPCLMULQDQ, AVX512BITALG, MOVDIRI, MOVDIR64B, AVX512BF16,
 ENQCMD, AVX512VPOPCNTDQ, AVX5124FMAPS, AVX512VNNI, AVX5124VNNIW, SERIALIZE,
-UINTR, HRESET, AMXTILE, AMXINT8, AMXBF16, KL, WIDEKL, AVXVNNI, AVX512FP16,
+UINTR, HRESET, AMXTILE, AMXINT8, AMXBF16, KL, WIDEKL, AVXVNNI, AVX512-FP16,
 AVXIFMA, AVXVNNIINT8, AVXNECONVERT, CMPCCXADD, AMX-FP16, PREFETCHI, RAOINT or
 CLDEMOTE extended instruction sets. Each has a corresponding @option{-mno-}
 option to disable use of these instructions.
-- 
2.31.1

[PATCH] Don't add crtfastmath.o for -shared.

2023-01-13 Thread liuhongt via Gcc-patches

Patches [1] and [2] fixed PR55522 for x86-linux but left all other x86
targets unfixed (x86-cygwin, x86-darwin and x86-mingw32).
This patch applies a similar change to other specs using crtfastmath.o.

Ok for trunk?

[1] https://gcc.gnu.org/pipermail/gcc-patches/2022-December/608528.html
[2] https://gcc.gnu.org/pipermail/gcc-patches/2022-December/608529.html

gcc/ChangeLog:

PR target/55522
* config/i386/cygwin.h (ENDFILE_SPEC): Link crtfastmath.o
whenever -mdaz-ftz is specified. Don't link crtfastmath.o when
-shared or -mno-daz-ftz is specified.
* config/i386/darwin.h (ENDFILE_SPEC): Ditto.
* config/i386/mingw32.h (ENDFILE_SPEC): Ditto.
---
 gcc/config/i386/cygwin.h  | 2 +-
 gcc/config/i386/darwin.h  | 2 +-
 gcc/config/i386/mingw32.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/config/i386/cygwin.h b/gcc/config/i386/cygwin.h
index 0a604d65b32..d795ee1e3c5 100644
--- a/gcc/config/i386/cygwin.h
+++ b/gcc/config/i386/cygwin.h
@@ -48,7 +48,7 @@ along with GCC; see the file COPYING3.  If not see
 
 #undef ENDFILE_SPEC
 #define ENDFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s}\
+  
"%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!shared:%{!mno-daz-ftz:crtfastmath.o%s}}}
 \
%{!shared:%:if-exists(default-manifest.o%s)}\
%{fvtable-verify=none:%s; \
 fvtable-verify=preinit:vtv_end.o%s; \
diff --git a/gcc/config/i386/darwin.h b/gcc/config/i386/darwin.h
index 5bcb714..ac198db0d9c 100644
--- a/gcc/config/i386/darwin.h
+++ b/gcc/config/i386/darwin.h
@@ -110,7 +110,7 @@ along with GCC; see the file COPYING3.  If not see
 
 #undef ENDFILE_SPEC
 #define ENDFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+  
"%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!shared:%{!mno-daz-ftz:crtfastmath.o%s}}}
 \
%{mpc32:crtprec32.o%s} \
%{mpc64:crtprec64.o%s} \
%{mpc80:crtprec80.o%s}" TM_DESTRUCTOR
diff --git a/gcc/config/i386/mingw32.h b/gcc/config/i386/mingw32.h
index 19a98c3d995..4e5b486a3da 100644
--- a/gcc/config/i386/mingw32.h
+++ b/gcc/config/i386/mingw32.h
@@ -196,7 +196,7 @@ along with GCC; see the file COPYING3.  If not see
 
 #undef ENDFILE_SPEC
 #define ENDFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+  
"%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!shared:%{!mno-daz-ftz:crtfastmath.o%s}}}
 \
%{!shared:%:if-exists(default-manifest.o%s)}\
%{fvtable-verify=none:%s; \
 fvtable-verify=preinit:vtv_end.o%s; \
-- 
2.31.1

[PATCH V2 2/2] [x86] x86: Add a new option -mdaz-ftz to enable FTZ and DAZ flags in MXCSR.

2022-12-14 Thread liuhongt via Gcc-patches

Update in v2:
1. Support -mno-daz-ftz, and make the option effectively three-state:

if (mdaz-ftz)
  link crtfastmath.o
else if ((Ofast || ffast-math || funsafe-math-optimizations)
 && !shared && !mno-daz-ftz)
  link crtfastmath.o
else
  Don't link crtfastmath.o

2. Still make the option Target since
   a. cc1: error: command-line option ‘-mdaz-ftz’ is valid for the driver but 
not for C
   b. Since there's no real variable specified by mdaz-ftz, I saw in 
options.h, it's marked as
   #ifndef GENERATOR_FILE
  int x_VAR_mdaz_ftz;
  #define x_VAR_mdaz_ftz do_not_use
  #endif

and is not saved and restored in cl_target_option_save and 
cl_target_option_restore (am I missing something?)

3. Capitalize the first letter and add more description about -mdaz-ftz and 
-shared.

gcc/ChangeLog:

PR target/55522
PR target/36821
* config/i386/gnu-user-common.h (GNU_USER_TARGET_MATHFILE_SPEC):
Link crtfastmath.o whenever -mdaz-ftz is specified. Don't link
crtfastmath.o when -shared or -mno-daz-ftz is specified.
* config/i386/i386.opt (mdaz-ftz): New option.
* doc/invoke.texi (x86 options): Document -mdaz-ftz.
---
 gcc/config/i386/gnu-user-common.h |  2 +-
 gcc/config/i386/i386.opt  |  4 
 gcc/doc/invoke.texi   | 12 +++-
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/gnu-user-common.h 
b/gcc/config/i386/gnu-user-common.h
index 9910cd64363..f910524a6c3 100644
--- a/gcc/config/i386/gnu-user-common.h
+++ b/gcc/config/i386/gnu-user-common.h
@@ -47,7 +47,7 @@ along with GCC; see the file COPYING3.  If not see
 
 /* Similar to standard GNU userspace, but adding -ffast-math support.  */
 #define GNU_USER_TARGET_MATHFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}} \
+  
"%{mdaz-ftz:crtfastmath.o%s;Ofast|ffast-math|funsafe-math-optimizations:%{!shared:%{!mno-daz-ftz:crtfastmath.o%s}}}
 \
%{mpc32:crtprec32.o%s} \
%{mpc64:crtprec64.o%s} \
%{mpc80:crtprec80.o%s}"
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index fb4e57ada7c..0b7df429734 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -420,6 +420,10 @@ mpc80
 Target RejectNegative
 Set 80387 floating-point precision to 80-bit.
 
+mdaz-ftz
+Target
+Set the FTZ and DAZ Flags.
+
 mpreferred-stack-boundary=
 Target RejectNegative Joined UInteger Var(ix86_preferred_stack_boundary_arg)
 Attempt to keep stack aligned to this power of 2.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index cba4f19f4f4..7f1d002f228 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1433,7 +1433,7 @@ See RS/6000 and PowerPC Options.
 -m96bit-long-double  -mlong-double-64  -mlong-double-80  -mlong-double-128 @gol
 -mregparm=@var{num}  -msseregparm @gol
 -mveclibabi=@var{type}  -mvect8-ret-in-mem @gol
--mpc32  -mpc64  -mpc80  -mstackrealign @gol
+-mpc32  -mpc64  -mpc80  -mdaz-ftz -mstackrealign @gol
 -momit-leaf-frame-pointer  -mno-red-zone  -mno-tls-direct-seg-refs @gol
 -mcmodel=@var{code-model}  -mabi=@var{name}  -maddress-mode=@var{mode} @gol
 -m32  -m64  -mx32  -m16  -miamcu  -mlarge-data-threshold=@var{num} @gol
@@ -32753,6 +32753,16 @@ are enabled by default; routines in such libraries 
could suffer significant
 loss of accuracy, typically through so-called ``catastrophic cancellation'',
 when this option is used to set the precision to less than extended precision.
 
+@item -mdaz-ftz
+@opindex mdaz-ftz
+
+The flush-to-zero (FTZ) and denormals-are-zero (DAZ) flags in the MXCSR 
register
+are used to control floating-point calculations.SSE and AVX instructions
+including scalar and vector instructions could benefit from enabling the FTZ
+and DAZ flags when @option{-mdaz-ftz} is specified. Don't set FTZ/DAZ flags
+when @option{-mno-daz-ftz} or @option{-shared} is specified, @option{-mdaz-ftz}
+will set FTZ/DAZ flags even with @option{-shared}.
+
 @item -mstackrealign
 @opindex mstackrealign
 Realign the stack at entry.  On the x86, the @option{-mstackrealign}
-- 
2.27.0

[PATCH V2 1/2] x86: Don't add crtfastmath.o for -shared

2022-12-14 Thread liuhongt via Gcc-patches

Update in V2:
Split -shared change into a separate commit and add some documentation
for it.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

Don't add crtfastmath.o for -shared to avoid changing the MXCSR register
when loading a shared library.  crtfastmath.o will be used only when
building executables.

 PR target/55522
* config/i386/gnu-user-common.h (GNU_USER_TARGET_MATHFILE_SPEC):
Don't add crtfastmath.o for -shared.
* doc/invoke.texi (-shared): Add related documentation.
---
 gcc/config/i386/gnu-user-common.h | 2 +-
 gcc/doc/invoke.texi   | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/gnu-user-common.h 
b/gcc/config/i386/gnu-user-common.h
index cab9be2bfb7..9910cd64363 100644
--- a/gcc/config/i386/gnu-user-common.h
+++ b/gcc/config/i386/gnu-user-common.h
@@ -47,7 +47,7 @@ along with GCC; see the file COPYING3.  If not see
 
 /* Similar to standard GNU userspace, but adding -ffast-math support.  */
 #define GNU_USER_TARGET_MATHFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+  "%{Ofast|ffast-math|funsafe-math-optimizations:%{!shared:crtfastmath.o%s}} \
%{mpc32:crtprec32.o%s} \
%{mpc64:crtprec64.o%s} \
%{mpc80:crtprec80.o%s}"
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index cb40b38b73a..cba4f19f4f4 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -17656,7 +17656,8 @@ needs to build supplementary stub code for constructors 
to work.  On
 multi-libbed systems, @samp{gcc -shared} must select the correct support
 libraries to link against.  Failing to supply the correct flags may lead
 to subtle defects.  Supplying them in cases where they are not necessary
-is innocuous.}
+is innocuous. For x86, crtfastmath.o will not be added when
+@option{-shared} is specified. }
 
 @item -shared-libgcc
 @itemx -static-libgcc
-- 
2.27.0

[PATCH] [x86] x86: Don't add crtfastmath.o for -shared and add a new option -mdaz-ftz to enable FTZ and DAZ flags in MXCSR.

2022-12-13 Thread liuhongt via Gcc-patches

Don't add crtfastmath.o for -shared to avoid changing the MXCSR
register when loading a shared library.  crtfastmath.o will be used
only when building executables.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/55522
PR target/36821
* config/i386/gnu-user-common.h (GNU_USER_TARGET_MATHFILE_SPEC):
Link crtfastmath.o when -mdaz-ftz is specified; do not link it
when -shared is specified.
* config/i386/i386.opt (mdaz-ftz): New option.
* doc/invoke.texi (x86 options): Document -mdaz-ftz.
---
 gcc/config/i386/gnu-user-common.h |  2 +-
 gcc/config/i386/i386.opt  |  4 
 gcc/doc/invoke.texi   | 10 +-
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/gnu-user-common.h 
b/gcc/config/i386/gnu-user-common.h
index cab9be2bfb7..02e4a2192a4 100644
--- a/gcc/config/i386/gnu-user-common.h
+++ b/gcc/config/i386/gnu-user-common.h
@@ -47,7 +47,7 @@ along with GCC; see the file COPYING3.  If not see
 
 /* Similar to standard GNU userspace, but adding -ffast-math support.  */
 #define GNU_USER_TARGET_MATHFILE_SPEC \
-  "%{Ofast|ffast-math|funsafe-math-optimizations:crtfastmath.o%s} \
+  
"%{Ofast|ffast-math|funsafe-math-optimizations|mdaz-ftz:%{!shared:crtfastmath.o%s}}
 \
%{mpc32:crtprec32.o%s} \
%{mpc64:crtprec64.o%s} \
%{mpc80:crtprec80.o%s}"
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index fb4e57ada7c..8fd222db857 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -420,6 +420,10 @@ mpc80
 Target RejectNegative
 Set 80387 floating-point precision to 80-bit.
 
+mdaz-ftz
+Target RejectNegative
+Set the FTZ and DAZ Flags.
+
 mpreferred-stack-boundary=
 Target RejectNegative Joined UInteger Var(ix86_preferred_stack_boundary_arg)
 Attempt to keep stack aligned to this power of 2.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index cb40b38b73a..670e3767fbd 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1433,7 +1433,7 @@ See RS/6000 and PowerPC Options.
 -m96bit-long-double  -mlong-double-64  -mlong-double-80  -mlong-double-128 @gol
 -mregparm=@var{num}  -msseregparm @gol
 -mveclibabi=@var{type}  -mvect8-ret-in-mem @gol
--mpc32  -mpc64  -mpc80  -mstackrealign @gol
+-mpc32  -mpc64  -mpc80  -mdaz-ftz -mstackrealign @gol
 -momit-leaf-frame-pointer  -mno-red-zone  -mno-tls-direct-seg-refs @gol
 -mcmodel=@var{code-model}  -mabi=@var{name}  -maddress-mode=@var{mode} @gol
 -m32  -m64  -mx32  -m16  -miamcu  -mlarge-data-threshold=@var{num} @gol
@@ -32752,6 +32752,14 @@ are enabled by default; routines in such libraries 
could suffer significant
 loss of accuracy, typically through so-called ``catastrophic cancellation'',
 when this option is used to set the precision to less than extended precision.
 
+@item -mdaz-ftz
+@opindex mdaz-ftz
+
+The flush-to-zero (FTZ) and denormals-are-zero (DAZ) flags in the MXCSR 
register
+are used to control floating-point calculations.  SSE and AVX instructions
+including scalar and vector instructions could benefit from enabling the FTZ
+and DAZ flags when @option{-mdaz-ftz} is specified.
+
 @item -mstackrealign
 @opindex mstackrealign
 Realign the stack at entry.  On the x86, the @option{-mstackrealign}
-- 
2.27.0

[PATCH] [x86] Fix ICE due to condition mismatch between expander and define_insn.

2022-12-06 Thread liuhongt via Gcc-patches

ice.i:7:1: error: unrecognizable insn:
7 | }
  | ^
(insn 7 6 8 2 (set (reg:V2SF 84 [ vect__3.8 ])
(unspec:V2SF [
(reg:V2SF 86 [ vect__1.7 ])
(const_int 11 [0xb])
] UNSPEC_ROUND)) "ice.i":5:14 -1
 (nil))
during RTL pass: vregs

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
I checked that the other round patterns are OK; there is just a typo in this one.
Ready to push to trunk as an obvious patch.

gcc/ChangeLog:

PR target/107970
* config/i386/mmx.md (btruncv2sf2): Add TARGET_MMX_WITH_SSE to
the condition.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr107970.c: New test.
---
 gcc/config/i386/mmx.md   |  3 ++-
 gcc/testsuite/gcc.target/i386/pr107970.c | 10 ++
 2 files changed, 12 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107970.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 63aff287795..c3afc6b5846 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -1709,7 +1709,8 @@ (define_expand "btruncv2sf2"
  [(match_operand:V2SF 1 "register_operand")
   (match_dup 2)]
  UNSPEC_ROUND))]
-  "TARGET_SSE4_1 && !flag_trapping_math"
+  "TARGET_SSE4_1 && !flag_trapping_math
+  && TARGET_MMX_WITH_SSE"
   "operands[2] = GEN_INT (ROUND_TRUNC | ROUND_NO_EXC);")
 
 (define_insn "*mmx_roundv2sf2"
diff --git a/gcc/testsuite/gcc.target/i386/pr107970.c 
b/gcc/testsuite/gcc.target/i386/pr107970.c
new file mode 100644
index 000..1fbbb14ee72
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107970.c
@@ -0,0 +1,10 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-Ofast -m3dnow -msse4.1" } */
+
+float *foo_p;
+
+void
+foo(float *__restrict q) {
+  foo_p[0] = __builtin_truncf(q[0]);
+  foo_p[1] = __builtin_truncf(q[1]);
+}
-- 
2.27.0

[PATCH] [x86] Improve ix86_expand_fast_convert_bf_to_sf with new extendbfsf2_1.

2022-12-01 Thread liuhongt via Gcc-patches

After supporting extendbfsf2_1, ix86_expand_fast_convert_bf_to_sf can
be improved with pslld as well.
CONST_INT_P is not handled since constant shift can be optimized off.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386-expand.cc
(ix86_expand_fast_convert_bf_to_sf): Optimized with
extendbfsf2_1 for non-CONST_INT_P operand.

gcc/testsuite/ChangeLog:

* gcc.target/i386/cbranchbf4.c: New test.
---
 gcc/config/i386/i386-expand.cc | 13 ++---
 gcc/testsuite/gcc.target/i386/cbranchbf4.c | 15 +++
 2 files changed, 21 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/cbranchbf4.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index d26e7e41445..0bc80c4b178 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -24155,14 +24155,13 @@ ix86_expand_fast_convert_bf_to_sf (rtx val)
   /* FLOAT_EXTEND simplification will fail if VAL is a sNaN.  */
   ret = gen_reg_rtx (SImode);
   emit_move_insn (ret, GEN_INT (INTVAL (op) & 0xffff));
+  emit_insn (gen_ashlsi3 (ret, ret, GEN_INT (16)));
+  return gen_lowpart (SFmode, ret);
 }
-  else
-{
-  ret = gen_reg_rtx (SImode);
-  emit_insn (gen_zero_extendhisi2 (ret, op));
-}
-  emit_insn (gen_ashlsi3 (ret, ret, GEN_INT (16)));
-  return gen_lowpart (SFmode, ret);
+
+  ret = gen_reg_rtx (SFmode);
+  emit_insn (gen_extendbfsf2_1 (ret, force_reg (BFmode, val)));
+  return ret;
 }
 
 #include "gt-i386-expand.h"
diff --git a/gcc/testsuite/gcc.target/i386/cbranchbf4.c 
b/gcc/testsuite/gcc.target/i386/cbranchbf4.c
new file mode 100644
index 000..8241a0c2165
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/cbranchbf4.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-fexcess-precision=16 -O -msse2 -mfpmath=sse" } */
+/* { dg-final { scan-assembler-times "pslld" 4 } } */
+
+char
+foo (__bf16 a, __bf16 b)
+{
+  return a > b;
+}
+
+float
+foo1 (__bf16 a, __bf16 b, float c, float d)
+{
+  return a > b ? c : d;
+}
-- 
2.27.0

[PATCH] [x86] Fix ICE due to incorrect insn type.

2022-11-30 Thread liuhongt via Gcc-patches

;; if reg/mem op
(define_insn_reservation  "slm_sseishft_3" 2
  (and (eq_attr "cpu" "slm")
   (and (eq_attr "type" "sseishft")
(not (match_operand 2 "immediate_operand"))))
  "slm-complex, slm-all-eu")

In slm.md, this reservation checks operands[2] for insns of type sseishft, but
extendbfsf2_1 has no second operand, which caused the ICE.
The patch changes the type from sseishft to sseishft1 to fix the issue.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push as an obvious patch.

gcc/ChangeLog:

PR target/107934
* config/i386/i386.md (extendbfsf2_1): Change type from
sseishft to sseishft1.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr107934.c: New test.
---
 gcc/config/i386/i386.md  | 2 +-
 gcc/testsuite/gcc.target/i386/pr107934.c | 8 
 2 files changed, 9 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107934.c

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 9451883396c..9e1d9eec862 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -4981,7 +4981,7 @@ (define_insn "extendbfsf2_1"
   pslld\t{$16, %0|%0, 16}
   vpslld\t{$16, %1, %0|%0, %1, 16}"
   [(set_attr "isa" "noavx,avx")
-   (set_attr "type" "sseishft")
+   (set_attr "type" "sseishft1")
(set_attr "length_immediate" "1")
(set_attr "prefix_data16" "1,*")
(set_attr "prefix" "orig,vex")
diff --git a/gcc/testsuite/gcc.target/i386/pr107934.c 
b/gcc/testsuite/gcc.target/i386/pr107934.c
new file mode 100644
index 000..59106b29159
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107934.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=knl -ffinite-math-only -msse2" } */
+
+int
+foo (__bf16 bf)
+{
+  return bf;
+}
-- 
2.27.0

[PATCH 1/2 V2] Implement hwasan target_hook.

2022-11-29 Thread liuhongt via Gcc-patches

Update in V2:
Add documentation for -mlam={none,u48,u57} to x86 options in invoke.texi.

gcc/ChangeLog:

* doc/invoke.texi (x86 options): Document
-mlam={none,u48,u57}.
* config/i386/i386-opts.h (enum lam_type): New enum.
* config/i386/i386.c (ix86_memtag_can_tag_addresses): New.
(ix86_memtag_set_tag): Ditto.
(ix86_memtag_extract_tag): Ditto.
(ix86_memtag_add_tag): Ditto.
(ix86_memtag_tag_size): Ditto.
(ix86_memtag_untagged_pointer): Ditto.
(TARGET_MEMTAG_CAN_TAG_ADDRESSES): New.
(TARGET_MEMTAG_ADD_TAG): Ditto.
(TARGET_MEMTAG_SET_TAG): Ditto.
(TARGET_MEMTAG_EXTRACT_TAG): Ditto.
(TARGET_MEMTAG_UNTAGGED_POINTER): Ditto.
(TARGET_MEMTAG_TAG_SIZE): Ditto.
(IX86_HWASAN_SHIFT): Ditto.
(IX86_HWASAN_TAG_SIZE): Ditto.
* config/i386/i386-expand.c (ix86_expand_call): Untag code
pointer.
* config/i386/i386-options.c (ix86_option_override_internal):
Error when enable -mlam=[u48|u57] for 32-bit code.
* config/i386/i386.opt: Add -mlam=[none|u48|u57].
* config/i386/i386-protos.h (ix86_memtag_untagged_pointer):
Declare.
(ix86_memtag_can_tag_addresses): Ditto.
---
 gcc/config/i386/i386-expand.cc  |  12 
 gcc/config/i386/i386-options.cc |   3 +
 gcc/config/i386/i386-opts.h |   6 ++
 gcc/config/i386/i386-protos.h   |   2 +
 gcc/config/i386/i386.cc | 123 
 gcc/config/i386/i386.opt|  16 +
 gcc/doc/invoke.texi |   9 ++-
 7 files changed, 170 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index d26e7e41445..0e94782165a 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -92,6 +92,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "i386-options.h"
 #include "i386-builtins.h"
 #include "i386-expand.h"
+#include "asan.h"
 
 /* Split one or more double-mode RTL references into pairs of half-mode
references.  The RTL can be REG, offsettable MEM, integer constant, or
@@ -9438,6 +9439,17 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
   fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
 }
 
+  /* PR100665: Hwasan may tag code pointer which is not supported by LAM,
+ mask off code pointers here.
+ TODO: also need to handle indirect jump.  */
+  if (ix86_memtag_can_tag_addresses () && !fndecl
+  && sanitize_flags_p (SANITIZE_HWADDRESS))
+{
+  rtx untagged_addr = ix86_memtag_untagged_pointer (XEXP (fnaddr, 0),
+   NULL_RTX);
+  fnaddr = gen_rtx_MEM (QImode, untagged_addr);
+}
+
   call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
 
   if (retval)
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 44dcccb0a73..25f21ac2a49 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -2033,6 +2033,9 @@ ix86_option_override_internal (bool main_args_p,
   if (TARGET_UINTR && !TARGET_64BIT)
 error ("%<-muintr%> not supported for 32-bit code");
 
+  if (ix86_lam_type && !TARGET_LP64)
+error ("%<-mlam=%> option: [u48|u57] not supported for 32-bit code");
+
   if (!opts->x_ix86_arch_string)
 opts->x_ix86_arch_string
   = TARGET_64BIT_P (opts->x_ix86_isa_flags)
diff --git a/gcc/config/i386/i386-opts.h b/gcc/config/i386/i386-opts.h
index 8f71e89fa9a..d3bfeed0af2 100644
--- a/gcc/config/i386/i386-opts.h
+++ b/gcc/config/i386/i386-opts.h
@@ -128,4 +128,10 @@ enum harden_sls {
   harden_sls_all = harden_sls_return | harden_sls_indirect_jmp
 };
 
+enum lam_type {
+  lam_none = 0,
+  lam_u48 = 1,
+  lam_u57
+};
+
 #endif
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index e136f6ec175..abd123c9efc 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -228,6 +228,8 @@ extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, 
rtx, enum rtx_code,
 extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx,
  bool, rtx_code_label *);
 extern rtx ix86_expand_fast_convert_bf_to_sf (rtx);
+extern rtx ix86_memtag_untagged_pointer (rtx, rtx);
+extern bool ix86_memtag_can_tag_addresses (void);
 
 #ifdef TREE_CODE
 extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 95babd93c9d..518cc9ffd1f 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24274,6 +24274,111 @@ ix86_push_rounding (poly_int64 bytes)
   return ROUND_UP (bytes, UNITS_PER_WORD);
 }
 
+/* Use 8 bits metadata start from bit48 for LAM_U48,
+   6 bits metadata start from bit57 for LAM_U57.  */
+#define IX86_HWASAN_SHIFT (ix86_lam_type == lam_u48\
+  ? 48 \
+

[PATCH] [x86] Fix unrecognizable insn due to illegal immediate_operand (const_int 255) of QImode.

2022-11-28 Thread liuhongt via Gcc-patches

For __builtin_ia32_vec_set_v16qi (a, -1, 2) with
!flag_signed_char, it's transformed to
__builtin_ia32_vec_set_v16qi (_4, 255, 2) in the gimple,
and expanded to (const_int 255) in the rtl. But for immediate_operand,
it expects (const_int 255) to be sign-extended to
(const_int -1). The mismatch caused an unrecognizable insn error.

expand_expr_real_1 generates (const_int 255) without considering the target 
mode.
I guess it's on purpose, so I'll leave that alone and only change the expander
in the backend. After applying convert_modes to (const_int 255),
it's transformed to (const_int -1), which fixes the issue.

Bootstrapped and regtested x86_64-pc-linux-gnu{-m32,}.
Ok for trunk(and backport to GCC-10/11/12 release branches)?

gcc/ChangeLog:

PR target/107863
* config/i386/i386-expand.cc (ix86_expand_vec_set_builtin):
Convert op1 to target mode whenever mode mismatch.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr107863.c: New test.
---
 gcc/config/i386/i386-expand.cc   | 2 +-
 gcc/testsuite/gcc.target/i386/pr107863.c | 8 
 2 files changed, 9 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107863.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 0373c3614a4..c639ee3a9f7 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -12475,7 +12475,7 @@ ix86_expand_vec_set_builtin (tree exp)
   op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
   elt = get_element_number (TREE_TYPE (arg0), arg2);
 
-  if (GET_MODE (op1) != mode1 && GET_MODE (op1) != VOIDmode)
+  if (GET_MODE (op1) != mode1)
 op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
 
   op0 = force_reg (tmode, op0);
diff --git a/gcc/testsuite/gcc.target/i386/pr107863.c 
b/gcc/testsuite/gcc.target/i386/pr107863.c
new file mode 100644
index 000..99fd85d9765
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107863.c
@@ -0,0 +1,8 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O" } */
+
+typedef char v16qi __attribute__((vector_size(16)));
+
+v16qi foo(v16qi a){
+  return __builtin_ia32_vec_set_v16qi (a, -1, 2);
+}
-- 
2.27.0

[PATCH V3] [x86] Fix incorrect _mm_cvtsbh_ss.

2022-11-24 Thread liuhongt via Gcc-patches

Update in V3:
Remove !flag_signaling_nans since there's already HONOR_NANS (BFmode).

Here's the patch:

After supporting real __bf16, the implementation of _mm_cvtsbh_ss went
wrong.

The patch adds a builtin to generate pslld for the intrinsic; also,
extendbfsf2 is supported with pslld when !HONOR_NANS (BFmode).

truncsfbf2 is supported with vcvtneps2bf16 when
!HONOR_NANS (BFmode) && flag_unsafe_math_optimizations.

gcc/ChangeLog:

PR target/107748
* config/i386/avx512bf16intrin.h (_mm_cvtsbh_ss): Refined.
* config/i386/i386-builtin-types.def (FLOAT_FTYPE_BFLOAT16):
New function type.
* config/i386/i386-builtin.def (BDESC): New builtin.
* config/i386/i386-expand.cc (ix86_expand_args_builtin):
Handle the builtin.
* config/i386/i386.md (extendbfsf2): New expander.
(extendbfsf2_1): New define_insn.
(truncsfbf2): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512bf16-cvtsbh2ss-1.c: Scan pslld.
* gcc.target/i386/extendbfsf.c: New test.
---
 gcc/config/i386/avx512bf16intrin.h|  4 +-
 gcc/config/i386/i386-builtin-types.def|  1 +
 gcc/config/i386/i386-builtin.def  |  2 +
 gcc/config/i386/i386-expand.cc|  1 +
 gcc/config/i386/i386.md   | 40 ++-
 .../gcc.target/i386/avx512bf16-cvtsbh2ss-1.c  |  3 +-
 gcc/testsuite/gcc.target/i386/extendbfsf.c| 16 
 7 files changed, 61 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/extendbfsf.c

diff --git a/gcc/config/i386/avx512bf16intrin.h 
b/gcc/config/i386/avx512bf16intrin.h
index ea1d0125b3f..75378af5584 100644
--- a/gcc/config/i386/avx512bf16intrin.h
+++ b/gcc/config/i386/avx512bf16intrin.h
@@ -46,9 +46,7 @@ extern __inline float
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtsbh_ss (__bf16 __A)
 {
-  union{ float a; unsigned int b;} __tmp;
-  __tmp.b = ((unsigned int)(__A)) << 16;
-  return __tmp.a;
+  return __builtin_ia32_cvtbf2sf (__A);
 }
 
 /* vcvtne2ps2bf16 */
diff --git a/gcc/config/i386/i386-builtin-types.def 
b/gcc/config/i386/i386-builtin-types.def
index d10de32643f..65fe070e37f 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -1281,6 +1281,7 @@ DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, UHI)
 DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, UHI)
 
 # BF16 builtins
+DEF_FUNCTION_TYPE (FLOAT, BFLOAT16)
 DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF)
 DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF, V32BF, USI)
 DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF, USI)
diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 5e0461acc00..d85b1753039 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -2838,6 +2838,8 @@ BDESC (0, OPTION_MASK_ISA2_AVX512BF16, 
CODE_FOR_avx512f_dpbf16ps_v8sf_maskz, "__
 BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, 
"__builtin_ia32_dpbf16ps_v4sf", IX86_BUILTIN_DPBF16PS_V4SF, UNKNOWN, (int) 
V4SF_FTYPE_V4SF_V8BF_V8BF)
 BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, 
"__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPBF16PS_V4SF_MASK, UNKNOWN, 
(int) V4SF_FTYPE_V4SF_V8BF_V8BF_UQI)
 BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, 
"__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPBF16PS_V4SF_MASKZ, 
UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF_UQI)
+BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_extendbfsf2_1, 
"__builtin_ia32_cvtbf2sf", IX86_BUILTIN_CVTBF2SF, UNKNOWN, (int) 
FLOAT_FTYPE_BFLOAT16)
+
 
 /* AVX512FP16.  */
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, 
CODE_FOR_addv8hf3_mask, "__builtin_ia32_addph128_mask", 
IX86_BUILTIN_ADDPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 0373c3614a4..d26e7e41445 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -10423,6 +10423,7 @@ ix86_expand_args_builtin (const struct 
builtin_description *d,
   return ix86_expand_sse_ptest (d, exp, target);
 case FLOAT128_FTYPE_FLOAT128:
 case FLOAT_FTYPE_FLOAT:
+case FLOAT_FTYPE_BFLOAT16:
 case INT_FTYPE_INT:
 case UINT_FTYPE_UINT:
 case UINT16_FTYPE_UINT16:
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 01faa911b77..9451883396c 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -130,6 +130,7 @@ (define_c_enum "unspec" [
   ;; For AVX/AVX512F support
   UNSPEC_SCALEF
   UNSPEC_PCMP
+  UNSPEC_CVTBFSF
 
   ;; Generic math support
   UNSPEC_IEEE_MIN  ; not commutative
@@ -4961,6 +4962,31 @@ (define_insn "*extendhf2"
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+(define_expand "extendbfsf2"
+  [(set (match_operand:SF 0 "register_operand")
+   (unspec:SF
+ [(match_operand:BF 1 "register_operand")]
+

[PATCH v2] [x86] Fix incorrect _mm_cvtsbh_ss.

2022-11-23 Thread liuhongt via Gcc-patches

After supporting real __bf16, the implementation of _mm_cvtsbh_ss went
wrong.

The patch adds a builtin to generate pslld for the intrinsic; also,
extendbfsf2 is supported with pslld when !flag_signaling_nans &&
!HONOR_NANS (BFmode).

truncsfbf2 is supported with vcvtneps2bf16 when !flag_signaling_nans &&
!HONOR_NANS (BFmode) && flag_unsafe_math_optimizations.

Here's updated patch.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ok for trunk?

gcc/ChangeLog:

PR target/107748
* config/i386/avx512bf16intrin.h (_mm_cvtsbh_ss): Refined.
* config/i386/i386-builtin-types.def (FLOAT_FTYPE_BFLOAT16):
New function type.
* config/i386/i386-builtin.def (BDESC): New builtin.
* config/i386/i386-expand.cc (ix86_expand_args_builtin):
Handle the builtin.
* config/i386/i386.md (extendbfsf2): New expander.
(extendbfsf2_1): New define_insn.
(truncsfbf2): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512bf16-cvtsbh2ss-1.c: Scan pslld.
* gcc.target/i386/extendbfsf.c: New test.
---
 gcc/config/i386/avx512bf16intrin.h|  4 +-
 gcc/config/i386/i386-builtin-types.def|  1 +
 gcc/config/i386/i386-builtin.def  |  2 +
 gcc/config/i386/i386-expand.cc|  1 +
 gcc/config/i386/i386.md   | 41 ++-
 .../gcc.target/i386/avx512bf16-cvtsbh2ss-1.c  |  3 +-
 gcc/testsuite/gcc.target/i386/extendbfsf.c| 16 
 7 files changed, 62 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/extendbfsf.c

diff --git a/gcc/config/i386/avx512bf16intrin.h 
b/gcc/config/i386/avx512bf16intrin.h
index ea1d0125b3f..75378af5584 100644
--- a/gcc/config/i386/avx512bf16intrin.h
+++ b/gcc/config/i386/avx512bf16intrin.h
@@ -46,9 +46,7 @@ extern __inline float
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtsbh_ss (__bf16 __A)
 {
-  union{ float a; unsigned int b;} __tmp;
-  __tmp.b = ((unsigned int)(__A)) << 16;
-  return __tmp.a;
+  return __builtin_ia32_cvtbf2sf (__A);
 }
 
 /* vcvtne2ps2bf16 */
diff --git a/gcc/config/i386/i386-builtin-types.def 
b/gcc/config/i386/i386-builtin-types.def
index d10de32643f..65fe070e37f 100644
--- a/gcc/config/i386/i386-builtin-types.def
+++ b/gcc/config/i386/i386-builtin-types.def
@@ -1281,6 +1281,7 @@ DEF_FUNCTION_TYPE (V4SI, V4SI, V4SI, UHI)
 DEF_FUNCTION_TYPE (V8SI, V8SI, V8SI, UHI)
 
 # BF16 builtins
+DEF_FUNCTION_TYPE (FLOAT, BFLOAT16)
 DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF)
 DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF, V32BF, USI)
 DEF_FUNCTION_TYPE (V32BF, V16SF, V16SF, USI)
diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 5e0461acc00..d85b1753039 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -2838,6 +2838,8 @@ BDESC (0, OPTION_MASK_ISA2_AVX512BF16, 
CODE_FOR_avx512f_dpbf16ps_v8sf_maskz, "__
 BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf, 
"__builtin_ia32_dpbf16ps_v4sf", IX86_BUILTIN_DPBF16PS_V4SF, UNKNOWN, (int) 
V4SF_FTYPE_V4SF_V8BF_V8BF)
 BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_mask, 
"__builtin_ia32_dpbf16ps_v4sf_mask", IX86_BUILTIN_DPBF16PS_V4SF_MASK, UNKNOWN, 
(int) V4SF_FTYPE_V4SF_V8BF_V8BF_UQI)
 BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_dpbf16ps_v4sf_maskz, 
"__builtin_ia32_dpbf16ps_v4sf_maskz", IX86_BUILTIN_DPBF16PS_V4SF_MASKZ, 
UNKNOWN, (int) V4SF_FTYPE_V4SF_V8BF_V8BF_UQI)
+BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_extendbfsf2_1, 
"__builtin_ia32_cvtbf2sf", IX86_BUILTIN_CVTBF2SF, UNKNOWN, (int) 
FLOAT_FTYPE_BFLOAT16)
+
 
 /* AVX512FP16.  */
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512FP16, 
CODE_FOR_addv8hf3_mask, "__builtin_ia32_addph128_mask", 
IX86_BUILTIN_ADDPH128_MASK, UNKNOWN, (int) V8HF_FTYPE_V8HF_V8HF_V8HF_UQI)
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 0373c3614a4..d26e7e41445 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -10423,6 +10423,7 @@ ix86_expand_args_builtin (const struct 
builtin_description *d,
   return ix86_expand_sse_ptest (d, exp, target);
 case FLOAT128_FTYPE_FLOAT128:
 case FLOAT_FTYPE_FLOAT:
+case FLOAT_FTYPE_BFLOAT16:
 case INT_FTYPE_INT:
 case UINT_FTYPE_UINT:
 case UINT16_FTYPE_UINT16:
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 01faa911b77..62d70330c5c 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -130,6 +130,7 @@ (define_c_enum "unspec" [
   ;; For AVX/AVX512F support
   UNSPEC_SCALEF
   UNSPEC_PCMP
+  UNSPEC_CVTBFSF
 
   ;; Generic math support
   UNSPEC_IEEE_MIN  ; not commutative
@@ -4961,6 +4962,31 @@ (define_insn "*extendhf2"
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+(define_expand "extendbfsf2"
+  [(set (match_operand:SF 0 "register_operand")
+   (unspec:SF
+ [(match_operand:BF 1

1 2 3 4 5 >

1 - 100 of 408 matches

Mail list logo