[PATCH] Guard truncate from vector float to vector __bf16 with !flag_rounding_math && HONOR_NANS (BFmode).

2024-11-07 Thread liuhongt
The hw instruction doesn't raise exceptions, turns an sNaN into a qNaN
quietly, and always rounds to nearest (even). Output denormals are
always flushed to zero and input denormals are always treated as zero.
MXCSR is neither consulted nor updated.
Without native instructions, flag_unsafe_math_optimizations is needed
for the permutation instructions.
Similarly, guard the extension from vector __bf16 to vector float with
!HONOR_NANS (BFmode).

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Any comments?
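
For reference, a minimal scalar sketch of the inline expansion mentioned
in the new comment (round-to-nearest-even via integer arithmetic; NaNs
and denormals are deliberately not handled, which is what the
!HONOR_NANS (BFmode) guard is for):

  #include <stdint.h>
  #include <string.h>

  static uint16_t
  sf_to_bf_rne (float f)
  {
    uint32_t fromi;
    memcpy (&fromi, &f, sizeof (fromi));
    /* Add 0x7fff plus the lowest kept bit, then drop the low 16 bits.  */
    fromi += 0x7fff + ((fromi >> 16) & 1);
    return fromi >> 16;
  }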

gcc/ChangeLog:

* config/i386/i386.md (truncsfbf2): Add !flag_rounding_math
to the condition, require flag_unsafe_math_optimizations when
native instruction is not available.
* config/i386/mmx.md (truncv2sfv2bf2): Ditto.
(extendv2bfv2sf2): Add !HONOR_NANS (BFmode) to the condition.
* config/i386/sse.md (truncv4sfv4bf2): Add
!flag_rounding_math to the condition, require
flag_unsafe_math_optimizations when native instruction is not
available.
(truncv8sfv8bf2): Ditto.
(truncv16sfv16bf2): Ditto.
(extendv4bfv4sf2): Add !HONOR_NANS (BFmode) to the condition.
(extendv8bfv8sf2): Ditto.
(extendv16bfv16sf2): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512bf16-truncsfbf.c: Add -ffast-math.
* gcc.target/i386/avx512bw-extendbf2sf.c: Ditto.
* gcc.target/i386/avx512bw-truncsfbf.c: Ditto.
* gcc.target/i386/sse2-extendbf2sf.c: Ditto.
* gcc.target/i386/ssse3-truncsfbf.c: Ditto.
---
 gcc/config/i386/i386.md  | 11 ++-
 gcc/config/i386/mmx.md   |  8 ++--
 gcc/config/i386/sse.md   | 16 
 .../gcc.target/i386/avx512bf16-truncsfbf.c   |  2 +-
 .../gcc.target/i386/avx512bw-extendbf2sf.c   |  2 +-
 .../gcc.target/i386/avx512bw-truncsfbf.c |  2 +-
 gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c |  2 +-
 gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c  |  2 +-
 8 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index c492fe55881..96d5420d9de 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -5694,11 +5694,20 @@ (define_insn "*trunchf2"
(set_attr "prefix" "evex")
(set_attr "mode" "HF")])
 
+/* vcvtneps2bf16 doesn't honor sNaNs; it turns an sNaN into a qNaN
+   quietly and always rounds to nearest (even).
+   flag_unsafe_math_optimizations is needed for psrld.
+   If we don't expect qNaNs nor sNaNs and can assume rounding
+   to nearest, we can expand the conversion inline as
+   (fromi + 0x7fff + ((fromi >> 16) & 1)) >> 16.  */
 (define_insn "truncsfbf2"
   [(set (match_operand:BF 0 "register_operand" "=x,x,v,Yv")
(float_truncate:BF
  (match_operand:SF 1 "register_operand" "0,x,v,Yv")))]
-  "TARGET_SSE2 && flag_unsafe_math_optimizations && !HONOR_NANS (BFmode)"
+  "TARGET_SSE2 && !HONOR_NANS (BFmode) && !flag_rounding_math
+   && (flag_unsafe_math_optimizations
+   || TARGET_AVXNECONVERT
+   || (TARGET_AVX512BF16 && TARGET_AVX512VL))"
   "@
   psrld\t{$16, %0|%0, 16}
   %{vex%} vcvtneps2bf16\t{%1, %0|%0, %1}
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 021ac90ae2a..61a4f4d21ea 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -2998,7 +2998,11 @@ (define_expand "truncv2sfv2bf2"
   [(set (match_operand:V2BF 0 "register_operand")
(float_truncate:V2BF
  (match_operand:V2SF 1 "nonimmediate_operand")))]
-  "TARGET_SSSE3 && TARGET_MMX_WITH_SSE"
+  "TARGET_SSSE3 && TARGET_MMX_WITH_SSE
+  && !HONOR_NANS (BFmode) && !flag_rounding_math
+  && (flag_unsafe_math_optimizations
+  || TARGET_AVXNECONVERT
+  || (TARGET_AVX512BF16 && TARGET_AVX512VL))"
 {
   rtx op1 = gen_reg_rtx (V4SFmode);
   rtx op0 = gen_reg_rtx (V4BFmode);
@@ -3016,7 +3020,7 @@ (define_expand "extendv2bfv2sf2"
   [(set (match_operand:V2SF 0 "register_operand")
(float_extend:V2SF
  (match_operand:V2BF 1 "nonimmediate_operand")))]
-  "TARGET_SSE2 && TARGET_MMX_WITH_SSE"
+  "TARGET_SSE2 && TARGET_MMX_WITH_SSE && !HONOR_NANS (BFmode)"
 {
   rtx op0 = gen_reg_rtx (V4SFmode);
   rtx op1 = gen_reg_rtx (V4BFmode);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 5eeb3ab221a..efe32e5149f 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -30995,7 +30995,10 @@ (define_expand "truncv4sfv4bf2"
   [(set (match_operand:V4BF 0 "register_operand")
  (float_truncate:V4BF
(match_operand:V4SF 1 "nonimmediate_operand")))]
-  "TARGET_SSSE3"
+  "TARGET_SSSE3 && !HONOR_NANS (BFmode) && !flag_rounding_math
+   && (flag_unsafe_math_optimizations
+   || TARGET_AVXNECONVERT
+   || (TARGET_AVX512BF16 && TARGET_AVX512VL))"
 {
   if (!TARGET_AVXNECONVERT
   && !(TARGET_AVX512BF16 && TARGET_AVX512VL))
@@ -31088,7 +31091,10 @@ (define_expand "truncv8sfv8bf2"
   [(set (match_operand:V8BF 0 "register_operand"

[PATCH] Make ix86_align_loops uarch-specific tune.

2024-11-06 Thread liuhongt
Disable the tune for Zhaoxin/CLX/SKX since it could hurt performance
for the inner loop.

According to the last round of testing, aligning loops helps SPEC2017
performance on EMR and Znver4, so I'll keep the tune enabled for the
generic part.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Any comments?
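
For clarity, the rule the pass applies can be sketched as the following
hypothetical helper (not part of the patch):

  /* A tight loop smaller than the prefetch block is aligned to
     1 << ceil_log2 (loop_size); larger loops are left to -falign-loops.  */
  static unsigned
  tight_loop_align (unsigned loop_size, unsigned prefetch_block)
  {
    if (loop_size >= prefetch_block)
      return 0;
    unsigned align = 1;
    while (align < loop_size)
      align <<= 1;
    return align;
  }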

gcc/ChangeLog:

PR target/117438
* config/i386/i386-features.cc (pass_align_tight_loops::gate):
Add TARGET_ALIGN_TIGHT_LOOPS to the predicate.
* config/i386/i386.h (TARGET_ALIGN_TIGHT_LOOPS): New macro.
* config/i386/x86-tune.def (X86_TUNE_ALIGN_TIGHT_LOOPS): New
tune.
---
 gcc/config/i386/i386-features.cc | 3 ++-
 gcc/config/i386/i386.h   | 2 ++
 gcc/config/i386/x86-tune.def | 8 +++-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index e2e85212a4f..70bda4bc021 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3620,7 +3620,8 @@ public:
   /* opt_pass methods: */
   bool gate (function *) final override
 {
-  return optimize && optimize_function_for_speed_p (cfun);
+  return TARGET_ALIGN_TIGHT_LOOPS
+   && optimize && optimize_function_for_speed_p (cfun);
 }
 
   unsigned int execute (function *) final override
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 51934400951..2c6dbf6dfdc 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -466,6 +466,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 #define TARGET_USE_RCR ix86_tune_features[X86_TUNE_USE_RCR]
 #define TARGET_SSE_MOVCC_USE_BLENDV \
ix86_tune_features[X86_TUNE_SSE_MOVCC_USE_BLENDV]
+#define TARGET_ALIGN_TIGHT_LOOPS \
+   ix86_tune_features[X86_TUNE_ALIGN_TIGHT_LOOPS]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 6ebb2fd3414..8afa0cd9823 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -214,7 +214,7 @@ DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 
| m_PENT
 DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT | m_LAKEMONT)
 
 /*****************************************************************************/
-/* Branch predictor tuning						     */
+/* Branch predictor and front-end tuning				     */
 /*****************************************************************************/
 
 /* X86_TUNE_PAD_SHORT_FUNCTION: Make every function to be at least 4
@@ -235,6 +235,12 @@ DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit",
  m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_GOLDMONT
  | m_GOLDMONT_PLUS | m_INTEL | m_ATHLON_K8 | m_AMDFAM10)
 
+/* X86_TUNE_ALIGN_TIGHT_LOOPS: For a tight loop whose size is
+   smaller than prefetch_block, align it to 1 << ceil_log2 (loop_size).
+   The tune overrides -falign-loops=N.  */
+DEF_TUNE (X86_TUNE_ALIGN_TIGHT_LOOPS, "align_tight_loops",
+~(m_ZHAOXIN | m_CASCADELAKE | m_SKYLAKE_AVX512))
+
 /*****************************************************************************/
 /* Integer instruction selection tuning				     */
 /*****************************************************************************/
-- 
2.34.1



[PATCH] Fix ICE due to subreg:us_truncate.

2024-10-29 Thread liuhongt
force_operand issues an ICE when the input
is (subreg:DI (us_truncate:V8QI)), probably because it's invalid
RTL, so refine the backend patterns to avoid generating it.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

PR target/117318
* config/i386/sse.md (*avx512vl_v2div2qi2_mask_store_1):
Rename to ..
(avx512vl_v2div2qi2_mask_store_1): .. this.
(avx512vl_v2div2qi2_mask_store_2): Change to
define_expand.
(*avx512vl_v4qi2_mask_store_1): Rename to ..
(avx512vl_v4qi2_mask_store_1): .. this.
(avx512vl_v4qi2_mask_store_2): Change to
define_expand.
(*avx512vl_v8qi2_mask_store_1): Rename to ..
(avx512vl_v8qi2_mask_store_1): .. this.
(avx512vl_v8qi2_mask_store_2): Change to
define_expand.
(*avx512vl_v4hi2_mask_store_1): Rename to ..
(avx512vl_v4hi2_mask_store_1): .. this.
(avx512vl_v4hi2_mask_store_2): Change to
define_expand.
(*avx512vl_v2div2hi2_mask_store_1): Rename to ..
(avx512vl_v2div2hi2_mask_store_1): .. this.
(avx512vl_v2div2hi2_mask_store_2): Change to
define_expand.
(*avx512vl_v2div2si2_mask_store_1): Rename to ..
(avx512vl_v2div2si2_mask_store_1): .. this.
(avx512vl_v2div2si2_mask_store_2): Change to
define_expand.
(*avx512f_v8div16qi2_mask_store_1): Rename to ..
(avx512f_v8div16qi2_mask_store_1): .. this.
(avx512f_v8div16qi2_mask_store_2): Change to
define_expand.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr117318.c: New test.
---
 gcc/config/i386/sse.md   | 268 +--
 gcc/testsuite/gcc.target/i386/pr117318.c |  12 +
 2 files changed, 110 insertions(+), 170 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr117318.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 2345015db1b..36f8567b66f 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -15439,7 +15439,7 @@ (define_insn "*avx512vl_v2div2qi2_mask_1"
(set_attr "prefix" "evex")
(set_attr "mode" "TI")])
 
-(define_insn "*avx512vl_v2div2qi2_mask_store_1"
+(define_insn "avx512vl_v2div2qi2_mask_store_1"
   [(set (match_operand:V2QI 0 "memory_operand" "=m")
  (vec_merge:V2QI
(any_truncate:V2QI
@@ -15453,28 +15453,19 @@ (define_insn "*avx512vl_v2div2qi2_mask_store_1"
(set_attr "prefix" "evex")
(set_attr "mode" "TI")])
 
-(define_insn_and_split "avx512vl_v2div2qi2_mask_store_2"
-  [(set (match_operand:HI 0 "memory_operand")
-   (subreg:HI
- (vec_merge:V2QI
-   (any_truncate:V2QI
- (match_operand:V2DI 1 "register_operand"))
-   (vec_select:V2QI
- (subreg:V4QI
-   (vec_concat:V2HI
- (match_dup 0)
- (const_int 0)) 0)
- (parallel [(const_int 0) (const_int 1)]))
-   (match_operand:QI 2 "register_operand")) 0))]
-  "TARGET_AVX512VL && ix86_pre_reload_split ()"
-  "#"
-  "&& 1"
-  [(set (match_dup 0)
-   (vec_merge:V2QI
- (any_truncate:V2QI (match_dup 1))
- (match_dup 0)
- (match_dup 2)))]
-  "operands[0] = adjust_address_nv (operands[0], V2QImode, 0);")
+(define_expand "avx512vl_v2div2qi2_mask_store_2"
+  [(match_operand:HI 0 "memory_operand")
+   (any_truncate:V2QI
+ (match_operand:V2DI 1 "register_operand"))
+   (match_operand:QI 2 "register_operand")]
+  "TARGET_AVX512VL"
+{
+  operands[0] = adjust_address_nv (operands[0], V2QImode, 0);
+  emit_insn (gen_avx512vl_v2div2qi2_mask_store_1 (operands[0],
+   operands[1],
+   operands[2]));
+  DONE;
+})
 
 (define_insn "*avx512vl_v4qi2_store_1"
   [(set (match_operand:V4QI 0 "memory_operand" "=m")
@@ -15543,7 +15534,7 @@ (define_insn "*avx512vl_v4qi2_mask_1"
(set_attr "prefix" "evex")
(set_attr "mode" "TI")])
 
-(define_insn "*avx512vl_v4qi2_mask_store_1"
+(define_insn "avx512vl_v4qi2_mask_store_1"
   [(set (match_operand:V4QI 0 "memory_operand" "=m")
(vec_merge:V4QI
  (any_truncate:V4QI
@@ -15557,29 +15548,19 @@ (define_insn 
"*avx512vl_v4qi2_mask_store_1"
(set_attr "prefix" "evex")
(set_attr "mode" "TI")])
 
-(define_insn_and_split "avx512vl_v4qi2_mask_store_2"
-  [(set (match_operand:SI 0 "memory_operand")
-   (subreg:SI
- (vec_merge:V4QI
-   (any_truncate:V4QI
- (match_operand:VI4_128_8_256 1 "register_operand"))
-   (vec_select:V4QI
- (subreg:V8QI
-   (vec_concat:V2SI
- (match_dup 0)
- (const_int 0)) 0)
- (parallel [(const_int 0) (const_int 1)
-(const_int 2) (const_int 3)]))
-   (match_operand:QI 2 "register_operand")) 0))]
-  "TARGET_AVX512VL && ix86_pre_reload_split ()"
-  "#"
-  "&& 1"
-  [(set (m

[PATCH 2/2] Support vector float_extend from __bf16 to float.

2024-10-29 Thread liuhongt
It's implemented with a vector permutation against a zero vector.
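
The scalar equivalent of the permutation is placing the __bf16 bits in
the upper half of a zeroed 32-bit word (a sketch, not part of the
patch):

  #include <stdint.h>
  #include <string.h>

  static float
  bf_to_sf (uint16_t bf)
  {
    /* The low half comes from the zero vector in the permutation.  */
    uint32_t bits = (uint32_t) bf << 16;
    float f;
    memcpy (&f, &bits, sizeof (f));
    return f;
  }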

gcc/ChangeLog:

* config/i386/i386-expand.cc
(ix86_expand_vector_bf2sf_with_vec_perm): New function.
* config/i386/i386-protos.h
(ix86_expand_vector_bf2sf_with_vec_perm): Declare.
* config/i386/mmx.md (extendv2bfv2sf2): New expander.
* config/i386/sse.md (extend2):
Ditto.
(VF1_AVX512BW): New mode iterator.
(sf_cvt_bf16): Add V4SF.
(sf_cvt_bf16_lower): New mode attr.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512bw-extendbf2sf.c: New test.
* gcc.target/i386/sse2-extendbf2sf.c: New test.
---
 gcc/config/i386/i386-expand.cc| 39 
 gcc/config/i386/i386-protos.h |  2 +
 gcc/config/i386/mmx.md| 18 
 gcc/config/i386/sse.md| 20 +++-
 .../gcc.target/i386/avx512bw-extendbf2sf.c| 46 +++
 .../gcc.target/i386/sse2-extendbf2sf.c| 20 
 6 files changed, 144 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-extendbf2sf.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse2-extendbf2sf.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 7138432659e..df9676b80d4 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -26854,5 +26854,44 @@ ix86_expand_vector_sf2bf_with_vec_perm (rtx dest, rtx 
src)
   emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode));
 }
 
+/* Implement extendv8bfv8sf2 with vector permutation.  */
+void
+ix86_expand_vector_bf2sf_with_vec_perm (rtx dest, rtx src)
+{
+  machine_mode vperm_mode, src_mode = GET_MODE (src);
+  switch (src_mode)
+{
+case V16BFmode:
+  vperm_mode = V32BFmode;
+  break;
+case V8BFmode:
+  vperm_mode = V16BFmode;
+  break;
+case V4BFmode:
+  vperm_mode = V8BFmode;
+  break;
+default:
+  gcc_unreachable ();
+}
+
+  int nelt = GET_MODE_NUNITS (vperm_mode);
+  vec_perm_builder sel (nelt, nelt, 1);
+  sel.quick_grow (nelt);
+  for (int i = 0, k = 0, j = nelt; i != nelt; i++)
+sel[i] = i & 1 ? j++ : k++;
+
+  vec_perm_indices indices (sel, 2, nelt);
+
+  rtx target = gen_reg_rtx (vperm_mode);
+  rtx op1 = lowpart_subreg (vperm_mode,
+   force_reg (src_mode, src),
+   src_mode);
+  rtx op0 = CONST0_RTX (vperm_mode);
+  bool ok = targetm.vectorize.vec_perm_const (vperm_mode, vperm_mode,
+ target, op0, op1, indices);
+  gcc_assert (ok);
+  emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode));
+}
+
 
 #include "gt-i386-expand.h"
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 55ffdb9dcf1..c26ae5e4f1d 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -259,6 +259,8 @@ extern bool ix86_ternlog_operand_p (rtx op);
 extern rtx ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2,
int idx, rtx target);
 extern void ix86_expand_vector_sf2bf_with_vec_perm (rtx, rtx);
+extern void ix86_expand_vector_bf2sf_with_vec_perm (rtx, rtx);
+
 
 #ifdef TREE_CODE
 extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 5c776ec0aba..021ac90ae2a 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -3012,6 +3012,24 @@ (define_expand "truncv2sfv2bf2"
   DONE;
 })
 
+(define_expand "extendv2bfv2sf2"
+  [(set (match_operand:V2SF 0 "register_operand")
+   (float_extend:V2SF
+ (match_operand:V2BF 1 "nonimmediate_operand")))]
+  "TARGET_SSE2 && TARGET_MMX_WITH_SSE"
+{
+  rtx op0 = gen_reg_rtx (V4SFmode);
+  rtx op1 = gen_reg_rtx (V4BFmode);
+
+  emit_move_insn (op1, lowpart_subreg (V4BFmode,
+  force_reg (V2BFmode, operands[1]),
+  V2BFmode));
+  emit_insn (gen_extendv4bfv4sf2 (op0, op1));
+
+  emit_move_insn (operands[0], lowpart_subreg (V2SFmode, op0, V4SFmode));
+  DONE;
+})
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel integral arithmetic
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 7f7910383ae..3d57a90fad7 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -530,6 +530,9 @@ (define_mode_iterator VF2_AVX512VL
 (define_mode_iterator VF1_AVX512VL
   [(V16SF "TARGET_EVEX512") (V8SF "TARGET_AVX512VL") (V4SF "TARGET_AVX512VL")])
 
+(define_mode_iterator VF1_AVX512BW
+  [(V16SF "TARGET_EVEX512 && TARGET_EVEX512") (V8SF "TARGET_AVX2") V4SF])
+
 (define_mode_iterator VF1_AVX10_2
   [(V16SF "TARGET_AVX10_2_512") V8SF V4SF])
 
@@ -30925,7 +30928,11 @@ (define_mode_attr bf16_cvt_2sf
   [(V32BF  "V16SF") (V16BF  "V8SF") (V8BF  "V4SF")])
 ;; Converting from SF to BF
 

[PATCH 1/2] [x86] Support vector float_truncate for SF to BF.

2024-10-29 Thread liuhongt
Generate the native instruction whenever possible, otherwise use a
vector permutation with odd indices.
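
On little-endian x86 the upper 16 bits of each float are exactly its
bit-truncated __bf16 value, so selecting the odd-indexed 16-bit lanes
performs the conversion without rounding; that is why the fallback
requires -funsafe-math-optimizations.  A scalar sketch:

  #include <stdint.h>
  #include <string.h>

  static void
  trunc_v4sf_v4bf (const float src[4], uint16_t dst[4])
  {
    uint16_t lanes[8];
    memcpy (lanes, src, sizeof (lanes));
    for (int i = 0; i < 4; i++)
      dst[i] = lanes[2 * i + 1];  /* sel[i] = 2 * i + 1 in the expander.  */
  }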

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

* config/i386/i386-expand.cc
(ix86_expand_vector_sf2bf_with_vec_perm): New function.
* config/i386/i386-protos.h
(ix86_expand_vector_sf2bf_with_vec_perm): Declare.
* config/i386/mmx.md (truncv2sfv2bf2): New expander.
* config/i386/sse.md (truncv4sfv4bf2): Ditto.
(truncv8sfv8bf2): Ditto.
(truncv16sfv16bf2): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512bf16-truncsfbf.c: New test.
* gcc.target/i386/avx512bw-truncsfbf.c: New test.
* gcc.target/i386/ssse3-truncsfbf.c: New test.
---
 gcc/config/i386/i386-expand.cc| 38 +++
 gcc/config/i386/i386-protos.h |  1 +
 gcc/config/i386/mmx.md| 18 
 gcc/config/i386/sse.md| 44 ++
 .../gcc.target/i386/avx512bf16-truncsfbf.c|  5 ++
 .../gcc.target/i386/avx512bw-truncsfbf.c  | 46 +++
 .../gcc.target/i386/ssse3-truncsfbf.c | 20 
 7 files changed, 172 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bf16-truncsfbf.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-truncsfbf.c
 create mode 100644 gcc/testsuite/gcc.target/i386/ssse3-truncsfbf.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 63f5e348d64..7138432659e 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -26817,4 +26817,42 @@ ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx 
input, machine_mode cvt_m
   emit_move_insn (output, gen_lowpart (out_mode, d.target));
 }
 
+/* Implement truncv8sfv8bf2 with vector permutation.  */
+void
+ix86_expand_vector_sf2bf_with_vec_perm (rtx dest, rtx src)
+{
+  machine_mode vperm_mode, src_mode = GET_MODE (src);
+  switch (src_mode)
+{
+case V16SFmode:
+  vperm_mode = V32BFmode;
+  break;
+case V8SFmode:
+  vperm_mode = V16BFmode;
+  break;
+case V4SFmode:
+  vperm_mode = V8BFmode;
+  break;
+default:
+  gcc_unreachable ();
+}
+
+  int nelt = GET_MODE_NUNITS (vperm_mode);
+  vec_perm_builder sel (nelt, nelt, 1);
+  sel.quick_grow (nelt);
+  for (int i = 0; i != nelt; i++)
+sel[i] = (2 * i + 1) % nelt;
+  vec_perm_indices indices (sel, 1, nelt);
+
+  rtx target = gen_reg_rtx (vperm_mode);
+  rtx op0 = lowpart_subreg (vperm_mode,
+   force_reg (src_mode, src),
+   src_mode);
+  bool ok = targetm.vectorize.vec_perm_const (vperm_mode, vperm_mode,
+ target, op0, op0, indices);
+  gcc_assert (ok);
+  emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode));
+}
+
+
 #include "gt-i386-expand.h"
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index c1f9147769c..55ffdb9dcf1 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -258,6 +258,7 @@ extern int ix86_ternlog_idx (rtx op, rtx *args);
 extern bool ix86_ternlog_operand_p (rtx op);
 extern rtx ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2,
int idx, rtx target);
+extern void ix86_expand_vector_sf2bf_with_vec_perm (rtx, rtx);
 
 #ifdef TREE_CODE
 extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int);
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 506f4cab6a8..5c776ec0aba 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -2994,6 +2994,24 @@ (define_expand "truncv2sfv2hf2"
   DONE;
 })
 
+(define_expand "truncv2sfv2bf2"
+  [(set (match_operand:V2BF 0 "register_operand")
+   (float_truncate:V2BF
+ (match_operand:V2SF 1 "nonimmediate_operand")))]
+  "TARGET_SSSE3 && TARGET_MMX_WITH_SSE"
+{
+  rtx op1 = gen_reg_rtx (V4SFmode);
+  rtx op0 = gen_reg_rtx (V4BFmode);
+
+  emit_move_insn (op1, lowpart_subreg (V4SFmode,
+  force_reg (V2SFmode, operands[1]),
+  V2SFmode));
+  emit_insn (gen_truncv4sfv4bf2 (op0, op1));
+
+  emit_move_insn (operands[0], lowpart_subreg (V2BFmode, op0, V4BFmode));
+  DONE;
+})
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel integral arithmetic
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 6c28b74ac3f..7f7910383ae 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -30952,6 +30952,24 @@ (define_insn "avx512f_cvtne2ps2bf16_"
   "TARGET_AVX512BF16"
   "vcvtne2ps2bf16\t{%2, %1, %0|%0, %1, %2}")
 
+(define_expand "truncv4sfv4bf2"
+  [(set (match_operand:V4BF 0 "register_operand")
+ (float_truncate:V4BF
+   (match_operand:V4SF 1 "nonimmediate_operand")))]
+  "TARGET_SSSE3"
+{
+ 

[PATCH] [x86] Fix ICE due to isa mismatch for the builtins.

2024-10-22 Thread liuhongt
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk and backport to the release branches.

gcc/ChangeLog:

PR target/117240
* config/i386/i386-builtin.def: Add avx/avx512f to vaes
ymm/zmm builtins.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr117240_avx.c: New test.
* gcc.target/i386/pr117240_avx512f.c: New test.
---
 gcc/config/i386/i386-builtin.def | 16 
 gcc/testsuite/gcc.target/i386/pr117240_avx.c | 10 ++
 gcc/testsuite/gcc.target/i386/pr117240_avx512f.c | 10 ++
 3 files changed, 28 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr117240_avx.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr117240_avx512f.c

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 151ccf4f252..1eb631db710 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -2819,17 +2819,17 @@ BDESC (0, OPTION_MASK_ISA2_RDPID, CODE_FOR_rdpid, "__builtin_ia32_rdpid", IX86_B

 /* VAES.  */
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v16qi, "__builtin_ia32_vaesdec_v16qi", IX86_BUILTIN_VAESDEC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v32qi, "__builtin_ia32_vaesdec_v32qi", IX86_BUILTIN_VAESDEC32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI)
-BDESC (0, OPTION_MASK_ISA2_VAES | OPTION_MASK_ISA2_EVEX512, CODE_FOR_vaesdec_v64qi, "__builtin_ia32_vaesdec_v64qi", IX86_BUILTIN_VAESDEC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
+BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v32qi, "__builtin_ia32_vaesdec_v32qi", IX86_BUILTIN_VAESDEC32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI)
+BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES | OPTION_MASK_ISA2_EVEX512, CODE_FOR_vaesdec_v64qi, "__builtin_ia32_vaesdec_v64qi", IX86_BUILTIN_VAESDEC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v16qi, "__builtin_ia32_vaesdeclast_v16qi", IX86_BUILTIN_VAESDECLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v32qi, "__builtin_ia32_vaesdeclast_v32qi", IX86_BUILTIN_VAESDECLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI)
-BDESC (0, OPTION_MASK_ISA2_VAES | OPTION_MASK_ISA2_EVEX512, CODE_FOR_vaesdeclast_v64qi, "__builtin_ia32_vaesdeclast_v64qi", IX86_BUILTIN_VAESDECLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
+BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v32qi, "__builtin_ia32_vaesdeclast_v32qi", IX86_BUILTIN_VAESDECLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI)
+BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES | OPTION_MASK_ISA2_EVEX512, CODE_FOR_vaesdeclast_v64qi, "__builtin_ia32_vaesdeclast_v64qi", IX86_BUILTIN_VAESDECLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v16qi, "__builtin_ia32_vaesenc_v16qi", IX86_BUILTIN_VAESENC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v32qi, "__builtin_ia32_vaesenc_v32qi", IX86_BUILTIN_VAESENC32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI)
-BDESC (0, OPTION_MASK_ISA2_VAES | OPTION_MASK_ISA2_EVEX512, CODE_FOR_vaesenc_v64qi, "__builtin_ia32_vaesenc_v64qi", IX86_BUILTIN_VAESENC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
+BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v32qi, "__builtin_ia32_vaesenc_v32qi", IX86_BUILTIN_VAESENC32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI)
+BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES | OPTION_MASK_ISA2_EVEX512, CODE_FOR_vaesenc_v64qi, "__builtin_ia32_vaesenc_v64qi", IX86_BUILTIN_VAESENC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
 BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v16qi, "__builtin_ia32_vaesenclast_v16qi", IX86_BUILTIN_VAESENCLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI)
-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, "__builtin_ia32_vaesenclast_v32qi", IX86_BUILTIN_VAESENCLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI)
-BDESC (0, OPTION_MASK_ISA2_VAES | OPTION_MASK_ISA2_EVEX512, CODE_FOR_vaesenclast_v64qi, "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
+BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, "__builtin_ia32_vaesenclast_v32qi", IX86_BUILTIN_VAESENCLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI)
+BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES | OPTION_MASK_ISA2_EVEX512, CODE_FOR_vaesenclast_v64qi, "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI)
 
 /* BF16 */
 BDESC (0, OPTION_MASK_ISA2_AVX512BF16 | OPTION_MASK_ISA2_EVEX512, 
CODE_FOR_avx512f_cvtne2ps2bf16_v32bf, "__builtin_ia32_cvtne2ps2bf16_v32bf", 
IX86_BUILTIN_CVTNE2PS2BF16_V32BF, UNKN

[PATCH] i386: Optimize EQ/NE comparison between avx512 kmask and -1.

2024-10-21 Thread liuhongt
r15-974-gbf7745f887c765e06f2e75508f263debb60aeb2e optimized this for
jcc/setcc, but missed movcc.
The patch adds support for movcc.
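
For example, code of the following shape (mirroring the new testcase)
now compiles to kortest + cmov instead of kmov + cmp + cmov:

  #include <immintrin.h>

  int
  pick (__m512i a, __m512i b, int c, int d)
  {
    __mmask64 k = _mm512_cmpeq_epi8_mask (a, b);
    /* kortest sets CF when the mask is all ones.  */
    return k == (__mmask64) -1 ? c : d;
  }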

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

PR target/117232
* config/i386/sse.md (*kortest_cmp_movqicc):
New define_insn_and_split.
(*kortest_cmp_movcc):
Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr117232-1.c: New test.
* gcc.target/i386/pr117232-apx-1.c: New test.
---
 gcc/config/i386/sse.md| 85 +++
 gcc/testsuite/gcc.target/i386/pr117232-1.c| 47 ++
 .../gcc.target/i386/pr117232-apx-1.c  | 48 +++
 3 files changed, 180 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr117232-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr117232-apx-1.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 6c28b74ac3f..2345015db1b 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2423,6 +2423,91 @@ (define_insn_and_split "*kortest_cmp_jcc"
   DONE;
 })
 
+;; Optimize cmp + movcc with mask register by kortest + movcc.
+(define_insn_and_split "*kortest_cmp_movqicc"
+   [(set (match_operand:QI 0 "register_operand" "=r,r,r,r,r,r")
+  (if_then_else:QI
+   (match_operator 1 "bt_comparison_operator"
+ [(match_operand:SWI1248_AVX512BWDQ_64 4 "register_operand"
+ "?k,,?k, ,?k,r")
+  (const_int -1)])
+   (match_operand:QI 2 "register_operand"  "r,r,0,0,r,r")
+   (match_operand:QI 3 "register_operand" " 0,0,r,r,r,r")))
+(clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512BW && TARGET_CMOVE && !TARGET_PARTIAL_REG_STALL"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+   (if_then_else:SI
+ (match_dup 5)
+ (match_dup 2)
+ (match_dup 3)))]
+{
+  rtx flag_reg;
+  if (MASK_REGNO_P (REGNO (operands[4])))
+{
+  emit_insn (gen_kortest_ccc (operands[4], 
operands[4]));
+  flag_reg = gen_rtx_REG (CCCmode, FLAGS_REG);
+}
+  else
+{
+  flag_reg = gen_rtx_REG (CCZmode, FLAGS_REG);
+  emit_insn (gen_rtx_SET (flag_reg,
+ gen_rtx_COMPARE (CCZmode,
+  operands[4],
+  constm1_rtx)));
+}
+  operands[5] = gen_rtx_fmt_ee (GET_CODE (operands[1]), VOIDmode,
+   flag_reg,const0_rtx);
+  operands[0] = gen_lowpart (SImode, operands[0]);
+  operands[2] = gen_lowpart (SImode, operands[2]);
+  operands[3] = gen_lowpart (SImode, operands[3]);
+}
+  [(set_attr "isa" "*,*,*,*,apx_ndd,apx_ndd")
+   (set_attr "type" "icmov")
+   (set_attr "mode" "QI")])
+
+(define_insn_and_split 
"*kortest_cmp_movcc"
+   [(set (match_operand:SWI248 0 "register_operand" "=r,r,r,r,r,r,r,r")
+  (if_then_else:SWI248
+   (match_operator 1 "bt_comparison_operator"
+ [(match_operand:SWI1248_AVX512BWDQ_64 4 "register_operand"
+ "?k,,?k, ,?k,r,?k, 
r")
+  (const_int -1)])
+   (match_operand:SWI248 2 "nonimmediate_operand" "rm,rm, 0, 0,rm,rm, r, 
r")
+   (match_operand:SWI248 3 "nonimmediate_operand" " 0, 0,rm,rm, r, 
r,rm,rm")))
+(clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512BW && TARGET_CMOVE
+   && !(MEM_P (operands[2]) && MEM_P (operands[3]))"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+   (if_then_else:SWI248
+ (match_dup 5)
+ (match_dup 2)
+ (match_dup 3)))]
+{
+  rtx flag_reg;
+  if (MASK_REGNO_P (REGNO (operands[4])))
+{
+  emit_insn (gen_kortest_ccc (operands[4], 
operands[4]));
+  flag_reg = gen_rtx_REG (CCCmode, FLAGS_REG);
+}
+  else
+{
+  flag_reg = gen_rtx_REG (CCZmode, FLAGS_REG);
+  emit_insn (gen_rtx_SET (flag_reg,
+ gen_rtx_COMPARE (CCZmode,
+  operands[4],
+  constm1_rtx)));
+}
+  operands[5] = gen_rtx_fmt_ee (GET_CODE (operands[1]), VOIDmode,
+   flag_reg,const0_rtx);
+}
+  [(set_attr "isa" "*,*,*,*,apx_ndd,apx_ndd,apx_ndd,apx_ndd")
+   (set_attr "type" "icmov")
+   (set_attr "mode" "")])
+
 (define_insn "kunpckhi"
   [(set (match_operand:HI 0 "register_operand" "=k")
(ior:HI
diff --git a/gcc/testsuite/gcc.target/i386/pr117232-1.c 
b/gcc/testsuite/gcc.target/i386/pr117232-1.c
new file mode 100644
index 000..cd7f5d112a7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr117232-1.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -mavx512vl -mavx512dq -O2" } */
+/* { dg-final { scan-assembler-times {(?n)kortest[bwqd]} 7 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times {(?n)cmovn?c} 7 { target { ! ia32 } } } } */
+
+#include <immintrin.h>
+int
+foo (__m512i a, __m512i b, int c, int d) {
+  __mmask64 k = _mm512_cmpeq_epi8_mask (a, b);
+  return k == (__mmask64) -1 ? c : 

[PATCH] [GCC13/GCC12] Fix testcase.

2024-10-21 Thread liuhongt
The optimization relies on other patterns which are only available in
GCC 14 and above, so restore the xfail for the GCC 13/12 branches.

Pushed as an obvious fix.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512bw-pr103750-2.c: Add xfail for ia32.
---
 gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c 
b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c
index 3392e193222..7303f5403ba 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c
@@ -1,7 +1,8 @@
 /* PR target/103750 */
 /* { dg-do compile }  */
 /* { dg-options "-O2 -mavx512dq -mavx512bw -mavx512vl" } */
-/* { dg-final { scan-assembler-not "kmov" } } */
+/* { dg-final { scan-assembler-not "kmov" { xfail ia32 } } } */
+/* The xfail needs to be fixed.  */
 
 #include <immintrin.h>
 extern __m128i* pi128;
-- 
2.31.1



[PATCH] [AVX512] Refine splitters related to "combine vpcmpuw + zero_extend to vpcmpuw"

2024-10-16 Thread liuhongt
r12-6103-g1a7ce8570997eb combines vpcmpuw + zero_extend into vpcmpuw
with a pre_reload splitter, but the splitter transforms the
zero_extend into a subreg, which makes reload think the upper part is
garbage; that's not correct.

The patch adjusts the zero_extend define_insn_and_split to a plain
define_insn to keep the zero_extend.
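
A small illustration of why the upper bits must stay well-defined
(assuming -mavx512bw -mavx512vl):

  #include <immintrin.h>

  unsigned long long
  f (__m128i a, __m128i b)
  {
    /* vpcmpw writes the whole mask register, zeroing the upper part,
       so the widening below must remain a real zero_extend.  */
    __mmask8 k = _mm_cmpeq_epi16_mask (a, b);
    return (unsigned long long) k;
  }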

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

PR target/117159
* config/i386/sse.md
(*_cmp3_zero_extend):
Change from define_insn_and_split to define_insn.
(*_cmp3_zero_extend):
Ditto.
(*_ucmp3_zero_extend):
Ditto.
(*_ucmp3_zero_extend):
Ditto.
(*_cmp3_zero_extend_2):
Split to the zero_extend pattern.
(*_cmp3_zero_extend_2):
Ditto.
(*_ucmp3_zero_extend_2):
Ditto.
(*_ucmp3_zero_extend_2):
Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr117159.c: New test.
* gcc.target/i386/avx512bw-pr103750-1.c: Remove xfail.
* gcc.target/i386/avx512bw-pr103750-2.c: Remove xfail.
---
 gcc/config/i386/sse.md| 186 +++---
 .../gcc.target/i386/avx512bw-pr103750-1.c |   3 +-
 .../gcc.target/i386/avx512bw-pr103750-2.c |   3 +-
 gcc/testsuite/gcc.target/i386/pr117159.c  |  42 
 4 files changed, 113 insertions(+), 121 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr117159.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a45b50ad732..06c2c9d7a5e 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -4298,32 +4298,19 @@ (define_insn 
"_cmp3"
 
 ;; Since vpcmpd implicitly clear the upper bits of dest, transform
 ;; vpcmpd + zero_extend to vpcmpd since the instruction
-(define_insn_and_split 
"*_cmp3_zero_extend"
-  [(set (match_operand:SWI248x 0 "register_operand")
+(define_insn "*_cmp3_zero_extend"
+  [(set (match_operand:SWI248x 0 "register_operand" "=k")
(zero_extend:SWI248x
  (unspec:
-   [(match_operand:V48H_AVX512VL 1 "nonimmediate_operand")
-(match_operand:V48H_AVX512VL 2 "nonimmediate_operand")
-(match_operand:SI 3 "const_0_to_7_operand")]
+   [(match_operand:V48H_AVX512VL 1 "nonimmediate_operand" "v")
+(match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm")
+(match_operand:SI 3 "const_0_to_7_operand" "n")]
UNSPEC_PCMP)))]
   "TARGET_AVX512F
&& (!VALID_MASK_AVX512BW_MODE (mode) || TARGET_AVX512BW)
-   && ix86_pre_reload_split ()
&& (GET_MODE_NUNITS (mode)
   < GET_MODE_PRECISION (mode))"
-  "#"
-  "&& 1"
-  [(set (match_dup 0)
-   (unspec:
- [(match_dup 1)
-  (match_dup 2)
-  (match_dup 3)]
- UNSPEC_PCMP))]
-{
-  operands[1] = force_reg (mode, operands[1]);
-  operands[0] = lowpart_subreg (mode,
-operands[0], mode);
-}
+  "vcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}"
   [(set_attr "type" "ssecmp")
(set_attr "length_immediate" "1")
(set_attr "prefix" "evex")
@@ -4351,21 +4338,19 @@ (define_insn_and_split 
"*_cmp3_zero_extend
- [(match_dup 1)
-  (match_dup 2)
-  (match_dup 3)]
- UNSPEC_PCMP))
-   (set (match_dup 4) (match_dup 0))]
+(zero_extend:SWI248x
+ (unspec:
+   [(match_dup 1)
+(match_dup 2)
+(match_dup 3)]
+   UNSPEC_PCMP)))
+   (set (match_dup 4) (match_dup 5))]
 {
-  operands[1] = force_reg (mode, operands[1]);
-  operands[0] = lowpart_subreg (mode,
+  operands[5] = lowpart_subreg (mode,
operands[0], mode);
-}
-  [(set_attr "type" "ssecmp")
-   (set_attr "length_immediate" "1")
-   (set_attr "prefix" "evex")
-   (set_attr "mode" "")])
+  SUBREG_PROMOTED_VAR_P (operands[5]) = 1;
+  SUBREG_PROMOTED_SET (operands[5], 1);
+})
 
 (define_insn_and_split "*_cmp3"
   [(set (match_operand: 0 "register_operand")
@@ -4400,31 +4385,18 @@ (define_insn 
"_cmp3"
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
-(define_insn_and_split 
"*_cmp3_zero_extend"
-  [(set (match_operand:SWI248x 0 "register_operand")
+(define_insn "*_cmp3_zero_extend"
+  [(set (match_operand:SWI248x 0 "register_operand" "=k")
(zero_extend:SWI248x
  (unspec:
-   [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand")
-(match_operand:VI12_AVX512VL 2 "nonimmediate_operand")
-(match_operand:SI 3 "const_0_to_7_operand")]
+   [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand" "v")
+(match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm")
+(match_operand:SI 3 "const_0_to_7_operand" "n")]
UNSPEC_PCMP)))]
   "TARGET_AVX512BW
-  && ix86_pre_reload_split ()
-  && (GET_MODE_NUNITS (mode)
-  < GET_MODE_PRECISION (mode))"
-  "#"
-  "&& 1"
-  [(set (match_dup 0)
-   (unspec:
- [(match_dup 1)
-  (match_dup 2)
-  (ma

[PATCH] Adjust testcase to avoid scan FIX in REG_EQUIV.

2024-10-15 Thread liuhongt
Also add the hard_float effective target to avoid failures on arm-eabi
(cortex-m0).

Verified with cross compilers for powerpc64le-linux-gnu and
sparc-sun-solaris2.11.

Ready to push to trunk.

gcc/testsuite/ChangeLog:

PR testsuite/115365
* gcc.dg/pr100927.c: Adjust testcase to avoid scan FIX in REG_EQUIV.
---
 gcc/testsuite/gcc.dg/pr100927.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/pr100927.c b/gcc/testsuite/gcc.dg/pr100927.c
index 8a7d69c3831..28a168d3518 100644
--- a/gcc/testsuite/gcc.dg/pr100927.c
+++ b/gcc/testsuite/gcc.dg/pr100927.c
@@ -1,7 +1,8 @@
 /* { dg-do compile } */
+/* { dg-require-effective-target hard_float } */
 /* { dg-options "-O2 -ftrapping-math -fdump-tree-optimized -fdump-rtl-final" } 
*/
 /* { dg-final { scan-tree-dump-times {(?n)= \(int\)} 3 "optimized" } }  */
-/* { dg-final { scan-rtl-dump-times {(?n)^[ \t]*\(fix:SI} 3 "final" } }  */
+/* { dg-final { scan-rtl-dump-times {(?n)^(?!.*REG_EQUIV)(?=.*\(fix:SI)} 3 
"final" } }  */
 
 int
 foo_ofr ()
-- 
2.31.1



[PATCH][wwwdoc] Mention O2 vectorization enhancement.

2024-10-14 Thread liuhongt
---
 htdocs/gcc-15/changes.html | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/htdocs/gcc-15/changes.html b/htdocs/gcc-15/changes.html
index 6dc46a52..8a238256 100644
--- a/htdocs/gcc-15/changes.html
+++ b/htdocs/gcc-15/changes.html
@@ -36,6 +36,16 @@ a work-in-progress.
 
 General Improvements
 
+
+  The default vectorizer cost model at -O2 has been enhanced
+to handle unknown trip counts.  It still disables vectorization of loops
+when any runtime check for data dependence or alignment is required,
+and it disables vectorization of epilogue loops, but otherwise it is
+equal to the cheap cost model.
+  
+
+
+
 
 New Languages and Language specific improvements
 
-- 
2.31.1



[PATCH 2/2] [x86] Canonicalize (vec_merge (fma op2 op1 op3) (match_dup 1) mask) to (vec_merge (fma op1 op2 op3) (match_dup 1) mask)

2024-10-14 Thread liuhongt
For masked FMA, there are 2 forms of RTL representation:
1) (vec_merge (fma op2 op1 op3) op1 mask)
2) (vec_merge (fma op1 op2 op3) op1 mask)
This is because op1 and op2 are commutative in RTL (the second op1 is
written as (match_dup 1)).

We once tried to replace (match_dup 1)
with (match_operand:VFH_AVX512VL 5 "nonimmediate_operand" "0,0"), but
that triggered an ICE in reload (reload can handle at most one operand
with a "0" constraint).

So the patch does the canonicalization for the backend part.

gcc/ChangeLog:

PR target/117072
* config/i386/sse.md
(_fmadd__mask): Relax predicates of
fma operands from register_operand to nonimmediate_operand.
(_fmadd__mask3): Ditto.
(_fmsub__mask): Ditto.
(_fmsub__mask3): Ditto.
(_fnmadd__mask): Ditto.
(_fnmadd__mask3): Ditto.
(_fnmsub__mask): Ditto.
(_fnmsub__mask3): Ditto.
(_fmaddsub__mask3): Ditto.
(_fmsubadd__mask): Ditto.
(_fmsubadd__mask3): Ditto.
(avx512f_vmfmadd__mask): Ditto.
(avx512f_vmfmadd__mask3): Ditto.
(avx512f_vmfmadd__maskz_1): Ditto.
(*avx512f_vmfmsub__mask): Ditto.
(avx512f_vmfmsub__mask3): Ditto.
(*avx512f_vmfmsub__maskz_1): Ditto.
(avx512f_vmfnmadd__mask): Ditto.
(avx512f_vmfnmadd__mask3): Ditto.
(avx512f_vmfnmadd__maskz_1): Ditto.
(*avx512f_vmfnmsub__mask): Ditto.
(*avx512f_vmfnmsub__mask3): Ditto.
(*avx512f_vmfnmsub__maskz_1): Ditto.
(avx10_2_fmaddnepbf16__mask3): Ditto.
(avx10_2_fnmaddnepbf16__mask3): Ditto.
(avx10_2_fmsubnepbf16__mask3): Ditto.
(avx10_2_fnmsubnepbf16__mask3): Ditto.
(fmai_vmfmadd_): Swap operands[1] and operands[2].
(fmai_vmfmsub_): Ditto.
(fmai_vmfnmadd_): Ditto.
(fmai_vmfnmsub_): Ditto.
(*fmai_fmadd_): Swap operands[1] and operands[2] adjust
operands[1] predicates from register_operand to
nonimmediate_operand.
(*fmai_fmsub_): Ditto.
(*fmai_fnmadd_): Ditto.
(*fmai_fnmsub_): Ditto.
---
 gcc/config/i386/sse.md | 86 +-
 1 file changed, 43 insertions(+), 43 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a45b50ad732..9201b1a0782 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -5895,7 +5895,7 @@ (define_insn "_fmadd__mask"
   [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VFH_AVX512VL
  (fma:VFH_AVX512VL
-   (match_operand:VFH_AVX512VL 1 "register_operand" "0,0")
+   (match_operand:VFH_AVX512VL 1 "nonimmediate_operand" "0,0")
(match_operand:VFH_AVX512VL 2 "" 
",v")
(match_operand:VFH_AVX512VL 3 "" 
"v,"))
  (match_dup 1)
@@ -5914,7 +5914,7 @@ (define_insn "_fmadd__mask3"
  (fma:VFH_AVX512VL
(match_operand:VFH_AVX512VL 1 "" "%v")
(match_operand:VFH_AVX512VL 2 "" 
"")
-   (match_operand:VFH_AVX512VL 3 "register_operand" "0"))
+   (match_operand:VFH_AVX512VL 3 "nonimmediate_operand" "0"))
  (match_dup 3)
  (match_operand: 4 "register_operand" "Yk")))]
   "TARGET_AVX512F && "
@@ -5999,7 +5999,7 @@ (define_insn "_fmsub__mask"
   [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v,v")
(vec_merge:VFH_AVX512VL
  (fma:VFH_AVX512VL
-   (match_operand:VFH_AVX512VL 1 "register_operand" "0,0")
+   (match_operand:VFH_AVX512VL 1 "nonimmediate_operand" "0,0")
(match_operand:VFH_AVX512VL 2 "" 
",v")
(neg:VFH_AVX512VL
  (match_operand:VFH_AVX512VL 3 "" 
"v,")))
@@ -6020,7 +6020,7 @@ (define_insn "_fmsub__mask3"
(match_operand:VFH_AVX512VL 1 "" "%v")
(match_operand:VFH_AVX512VL 2 "" 
"")
(neg:VFH_AVX512VL
- (match_operand:VFH_AVX512VL 3 "register_operand" "0")))
+ (match_operand:VFH_AVX512VL 3 "nonimmediate_operand" "0")))
  (match_dup 3)
  (match_operand: 4 "register_operand" "Yk")))]
   "TARGET_AVX512F && "
@@ -6106,7 +6106,7 @@ (define_insn "_fnmadd__mask"
(vec_merge:VFH_AVX512VL
  (fma:VFH_AVX512VL
(neg:VFH_AVX512VL
- (match_operand:VFH_AVX512VL 1 "register_operand" "0,0"))
+ (match_operand:VFH_AVX512VL 1 "nonimmediate_operand" "0,0"))
(match_operand:VFH_AVX512VL 2 "" 
",v")
(match_operand:VFH_AVX512VL 3 "" 
"v,"))
  (match_dup 1)
@@ -6126,7 +6126,7 @@ (define_insn "_fnmadd__mask3"
(neg:VFH_AVX512VL
  (match_operand:VFH_AVX512VL 1 "" "%v"))
(match_operand:VFH_AVX512VL 2 "" 
"")
-   (match_operand:VFH_AVX512VL 3 "register_operand" "0"))
+   (match_operand:VFH_AVX512VL 3 "nonimmediate_operand" "0"))
  (match_dup 3)
  (match_operand: 4 "register_operand" "Yk")))]
   "TARGET_AVX512F && "
@@ -6215,7 +6215,7 @@ (define_insn "_fnmsub__mask"
   

[PATCH 1/2] [Middle-end] Canonicalize (vec_merge (fma op2 op1 op3) op1 mask) to (vec_merge (fma op1 op2 op3) op1 mask).

2024-10-14 Thread liuhongt
For x86 masked FMA, there are 2 RTL representations:
1) (vec_merge (fma op2 op1 op3) op1 mask)
2) (vec_merge (fma op1 op2 op3) op1 mask)

(define_insn "_fmadd__mask"
  [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v,v")
	(vec_merge:VFH_AVX512VL
	  (fma:VFH_AVX512VL
	    (match_operand:VFH_AVX512VL 1 "nonimmediate_operand" "0,0")
	    (match_operand:VFH_AVX512VL 2 "" ",v")
	    (match_operand:VFH_AVX512VL 3 "" "v,"))
	  (match_dup 1)
	  (match_operand: 4 "register_operand" "Yk,Yk")))]
  "TARGET_AVX512F && "
  "@
   vfmadd132\t{%2, %3, %0%{%4%}|%0%{%4%}, %3, %2}
   vfmadd213\t{%3, %2, %0%{%4%}|%0%{%4%}, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "prefix" "evex")
   (set_attr "mode" "")])

Here op1 has constraint "0", and the scecond op1 is (match_dup 1),
we once tried to replace it with (match_operand:M 5
"nonimmediate_operand" "0")) to enable more flexibility for pattern
match and recog, but it triggered an ICE in reload(reload can handle
at most one perand with "0" constraint).

So we need either add 2 patterns in the backend or just do the
canonicalization in the middle-end.
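
For example, both forms can arise from the same masked-FMA intrinsic
(a hedged illustration, assuming AVX512F):

  #include <immintrin.h>

  __m512
  f (__m512 a, __m512 b, __m512 c, __mmask16 m)
  {
    /* a * b + c, merging from a under mask m; combine may present the
       commutative multiply operands in either order.  */
    return _mm512_mask_fmadd_ps (a, m, b, c);
  }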

gcc/ChangeLog:

* combine.cc (maybe_swap_commutative_operands):
Canonicalize (vec_merge (fma op2 op1 op3) op1 mask)
to (vec_merge (fma op1 op2 op3) op1 mask).
---
 gcc/combine.cc | 25 +
 1 file changed, 25 insertions(+)

diff --git a/gcc/combine.cc b/gcc/combine.cc
index fef06a6cdc0..aa40fdcc50d 100644
--- a/gcc/combine.cc
+++ b/gcc/combine.cc
@@ -5656,6 +5656,31 @@ maybe_swap_commutative_operands (rtx x)
   SUBST (XEXP (x, 1), temp);
 }
 
+  /* Canonicalize (vec_merge (fma op2 op1 op3) op1 mask) to
+ (vec_merge (fma op1 op2 op3) op1 mask).  */
+  if (GET_CODE (x) == VEC_MERGE
+  && GET_CODE (XEXP (x, 0)) == FMA)
+{
+  rtx fma_op1 = XEXP (XEXP (x, 0), 0);
+  rtx fma_op2 = XEXP (XEXP (x, 0), 1);
+  rtx masked_op = XEXP (x, 1);
+  if (rtx_equal_p (masked_op, fma_op2))
+   {
+ if (GET_CODE (fma_op1) == NEG)
+   {
+ fma_op1 = XEXP (fma_op1, 0);
+ SUBST (XEXP (XEXP (XEXP (x, 0), 0), 0), fma_op2);
+ SUBST (XEXP (XEXP (x, 0), 1), fma_op1);
+   }
+ else
+   {
+ SUBST (XEXP (XEXP (x, 0), 0), fma_op2);
+ SUBST (XEXP (XEXP (x, 0), 1), fma_op1);
+   }
+
+   }
+}
+
   unsigned n_elts = 0;
   if (GET_CODE (x) == VEC_MERGE
   && CONST_INT_P (XEXP (x, 2))
-- 
2.31.1



[PATCH 0/2] Canonicalize (vec_merge (fma op2 op1 op3) op1 mask) to (vec_merge (fma op1 op2 op3) op1 mask)

2024-10-14 Thread liuhongt


For x86 masked FMA, there are 2 RTL representations:
1) (vec_merge (fma op2 op1 op3) op1 mask)
2) (vec_merge (fma op1 op2 op3) op1 mask)

(define_insn "_fmadd__mask"
  [(set (match_operand:VFH_AVX512VL 0 "register_operand" "=v,v")
	(vec_merge:VFH_AVX512VL
	  (fma:VFH_AVX512VL
	    (match_operand:VFH_AVX512VL 1 "nonimmediate_operand" "0,0")
	    (match_operand:VFH_AVX512VL 2 "" ",v")
	    (match_operand:VFH_AVX512VL 3 "" "v,"))
	  (match_dup 1)
	  (match_operand: 4 "register_operand" "Yk,Yk")))]
  "TARGET_AVX512F && "
  "@
   vfmadd132\t{%2, %3, %0%{%4%}|%0%{%4%}, %3, %2}
   vfmadd213\t{%3, %2, %0%{%4%}|%0%{%4%}, %2, %3}"
  [(set_attr "type" "ssemuladd")
   (set_attr "prefix" "evex")
   (set_attr "mode" "")])

Here op1 has constraint "0", and the scecond op1 is (match_dup 1),
we once tried to replace it with (match_operand:M 5
"nonimmediate_operand" "0")) to enable more flexibility for pattern
match and recog, but it triggered an ICE in reload(reload can handle
at most one perand with "0" constraint).

So we need either add 2 patterns in the backend or just do the
canonicalization in the middle-end.

The patch canonicalizes it in combine and adjusts the x86 backend patterns.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Any comments?

liuhongt (2):
  Canonicalize (vec_merge (fma op2 op1 op3) op1 mask) to (vec_merge (fma
op1 op2 op3) op1 mask).
  [x86] Canonicalize (vec_merge (fma op2 op1 op3) (match_dup 1) mask)
    to (vec_merge (fma op1 op2 op3) (match_dup 1) mask)

 gcc/combine.cc | 25 
 gcc/config/i386/sse.md | 86 +-
 2 files changed, 68 insertions(+), 43 deletions(-)

-- 
2.31.1



[PATCH v3 2/2] Adjust testcases after relaxing O2 vectorization.

2024-10-08 Thread liuhongt
Update in V3.
>The testcase looks bogus:
>
>   b[i+k] = b[i+k-5] + 2;
>
>accesses b[-3], can you instead adjust the inner loop to start with k == 4?

Changed; also adjusted b[100] to b[200] to avoid an out-of-bounds array access.

>Please remove this testcase - even with fully masking we'd need alias
>versioning.

Changed.

Ready to push to trunk.

gcc/testsuite/ChangeLog:

* gcc.dg/fstack-protector-strong.c: Adjust
scan-assembler-times.
* gcc.dg/graphite/scop-6.c: Refine the testcase to avoid array
out of bounds.
* gcc.dg/graphite/scop-9.c: Ditto.
* gcc.dg/tree-ssa/ivopts-lt-2.c: Add -fno-tree-vectorize.
* gcc.dg/tree-ssa/ivopts-lt.c: Ditto.
* gcc.dg/tree-ssa/loop-16.c: Ditto.
* gcc.dg/tree-ssa/loop-28.c: Ditto.
* gcc.dg/tree-ssa/loop-bound-2.c: Ditto.
* gcc.dg/tree-ssa/loop-bound-4.c: Ditto.
* gcc.dg/tree-ssa/loop-bound-6.c: Ditto.
* gcc.dg/tree-ssa/predcom-4.c: Ditto.
* gcc.dg/tree-ssa/predcom-5.c: Ditto.
* gcc.dg/tree-ssa/scev-11.c: Ditto.
* gcc.dg/tree-ssa/scev-9.c: Ditto.
* gcc.dg/tree-ssa/split-path-11.c: Ditto.
* gcc.dg/unroll-8.c: Ditto.
* gcc.dg/var-expand1.c: Ditto.
* gcc.dg/vect/vect-cost-model-6.c: Removed.
* gcc.target/i386/pr86270.c: Ditto.
* gcc.target/i386/pr86722.c: Ditto.
* gcc.target/x86_64/abi/callabi/leaf-2.c: Ditto.
---
 gcc/testsuite/gcc.dg/fstack-protector-strong.c   |  2 +-
 gcc/testsuite/gcc.dg/graphite/scop-6.c   |  7 +++
 gcc/testsuite/gcc.dg/graphite/scop-9.c   |  4 ++--
 gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt-2.c  |  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c|  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/loop-16.c  |  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/loop-28.c  |  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/loop-bound-2.c |  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/loop-bound-4.c |  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/loop-bound-6.c |  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/predcom-4.c|  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/predcom-5.c|  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/scev-11.c  |  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/scev-9.c   |  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/split-path-11.c|  2 +-
 gcc/testsuite/gcc.dg/unroll-8.c  |  3 +--
 gcc/testsuite/gcc.dg/var-expand1.c   |  2 +-
 gcc/testsuite/gcc.dg/vect/vect-cost-model-6.c| 12 
 gcc/testsuite/gcc.target/i386/pr86270.c  |  2 +-
 gcc/testsuite/gcc.target/i386/pr86722.c  |  2 +-
 gcc/testsuite/gcc.target/x86_64/abi/callabi/leaf-2.c |  2 +-
 21 files changed, 23 insertions(+), 37 deletions(-)
 delete mode 100644 gcc/testsuite/gcc.dg/vect/vect-cost-model-6.c

diff --git a/gcc/testsuite/gcc.dg/fstack-protector-strong.c 
b/gcc/testsuite/gcc.dg/fstack-protector-strong.c
index 94dc3508f1a..b9f63966b7c 100644
--- a/gcc/testsuite/gcc.dg/fstack-protector-strong.c
+++ b/gcc/testsuite/gcc.dg/fstack-protector-strong.c
@@ -154,4 +154,4 @@ void foo12 ()
   global3 ();
 }
 
-/* { dg-final { scan-assembler-times "stack_chk_fail" 12 } } */
+/* { dg-final { scan-assembler-times "stack_chk_fail" 11 } } */
diff --git a/gcc/testsuite/gcc.dg/graphite/scop-6.c 
b/gcc/testsuite/gcc.dg/graphite/scop-6.c
index 9bc1d9f4ccd..e7e0a080c5f 100644
--- a/gcc/testsuite/gcc.dg/graphite/scop-6.c
+++ b/gcc/testsuite/gcc.dg/graphite/scop-6.c
@@ -4,7 +4,7 @@ int toto()
 {
   int i, j, k;
   int a[100][100];
-  int b[100];
+  int b[200];
 
   for (i = 1; i < 100; i++)
 {
@@ -18,9 +18,8 @@ int toto()
 for (k = 1; k < 100; k++)
   b[i+k] = b[i+k-1] + 2;
 }
-  
-  for (k = 1; k < 100; k++)
-b[i+k] = b[i+k-5] + 2;
+  for (k = 4; k < 100; k++)
+   b[i+k] = b[i+k-5] + 2;
 }
 
   return a[3][5] + b[2];
diff --git a/gcc/testsuite/gcc.dg/graphite/scop-9.c 
b/gcc/testsuite/gcc.dg/graphite/scop-9.c
index b19291be2f8..2676452b1e6 100644
--- a/gcc/testsuite/gcc.dg/graphite/scop-9.c
+++ b/gcc/testsuite/gcc.dg/graphite/scop-9.c
@@ -4,7 +4,7 @@ int toto()
 {
   int i, j, k;
   int a[100][100];
-  int b[100];
+  int b[200];
 
   for (i = 1; i < 100; i++)
 {
@@ -14,7 +14,7 @@ int toto()
   if (i * 2 == i + 8)
a[i][i] = 2;
 
-  for (k = 1; k < 100; k++)
+  for (k = 4; k < 100; k++)
 b[i+k] = b[i+k-5] + 2;
 }
 
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt-2.c 
b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt-2.c
index bdbdbff19ff..be325775fbb 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt-2.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt-2.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -fno-tree-loop-distribute-patterns -fdump-tree-ivopts" } 
*/
+/* { dg-options "-O2 -fno-tree-vectorize -fno-tree-loop-distribute-patterns 
-fdump-tree-ivopts" } */
 /* { dg-skip-if "PR68644" { hppa*-*-*

[PATCH v3 1/2] Enable vectorization for unknown trip count in the very-cheap cost model but disable epilogue vectorization.

2024-10-08 Thread liuhongt
>We'd also need to update the documentation:

>... The @samp{very-cheap} model only
>allows vectorization if the vector code would entirely replace the
>scalar code that is being vectorized.  For example, if each iteration
>of a vectorized loop would only be able to handle exactly four iterations
>of the scalar loop, the @samp{very-cheap} model would only allow
>vectorization if the scalar iteration count is known to be a multiple
>of four.
Changed.

>And since it's a change in documented behaviour, it should probably
>be in the release notes too.

Will submit another patch for that when it lands on trunk.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}, 
aarch64-unknown-linux-gnu{-m32,}.

Ok for trunk?
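
As an example of the new behaviour (a sketch; the exact code generation
depends on the target), a loop with an unknown trip count such as

  void
  saxpy (float *restrict y, const float *restrict x, float a, int n)
  {
    for (int i = 0; i < n; i++)
      y[i] += a * x[i];
  }

can now have its main loop vectorized at -O2 under the very-cheap
model, while the remaining iterations run in an unvectorized scalar
epilogue.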

gcc/ChangeLog:

* tree-vect-loop.cc (vect_analyze_loop_costing): Enable
vectorization for LOOP_VINFO_PEELING_FOR_NITER in very cheap
cost model.
(vect_analyze_loop): Disable epilogue vectorization in very
cheap cost model.
* doc/invoke.texi: Adjust documents for very-cheap cost model.
---
 gcc/doc/invoke.texi   | 11 ---
 gcc/tree-vect-loop.cc |  6 +++---
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index b2f16b45eaf..edcadeb108a 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -14309,13 +14309,10 @@ counts that will likely execute faster than when 
executing the original
 scalar loop.  The @samp{cheap} model disables vectorization of
 loops where doing so would be cost prohibitive for example due to
 required runtime checks for data dependence or alignment but otherwise
-is equal to the @samp{dynamic} model.  The @samp{very-cheap} model only
-allows vectorization if the vector code would entirely replace the
-scalar code that is being vectorized.  For example, if each iteration
-of a vectorized loop would only be able to handle exactly four iterations
-of the scalar loop, the @samp{very-cheap} model would only allow
-vectorization if the scalar iteration count is known to be a multiple
-of four.
+is equal to the @samp{dynamic} model.  The @samp{very-cheap} model disables
+vectorization of loops when any runtime check for data dependence or alignment
+is required, it also disables vectorization of epilogue loops but otherwise is
+equal to the @samp{cheap} model.
 
 The default cost model depends on other optimization flags and is
 either @samp{dynamic} or @samp{cheap}.
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 6933f597b4d..a76d3b8ea5f 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -2375,8 +2375,7 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo,
  a copy of the scalar code (even if we might be able to vectorize it).  */
   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
   && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
- || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
- || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
+ || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
 {
   if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -3681,7 +3680,8 @@ vect_analyze_loop (class loop *loop, gimple 
*loop_vectorized_call,
	 /* No code motion support for multiple epilogues so for now
	    not supported when multiple exits.  */
 && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
-&& !loop->simduid);
+&& !loop->simduid
+		 && loop_cost_model (loop) > VECT_COST_MODEL_VERY_CHEAP);
   if (!vect_epilogues)
 return first_loop_vinfo;
 
-- 
2.31.1



[PATCH] Don't lower vpcmpu to pcmpgt since the latter is for signed comparison.

2024-10-08 Thread liuhongt
r15-1737-gb06a108f0fbffe lowers AVX512 kmask comparisons to AVX2 ones,
but wrongly lowered unsigned comparisons to signed ones; for unsigned
comparisons, only EQ/NE can be lowered.

The commit fixes that.
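
A small worked example of why signed pcmpgt cannot implement unsigned
greater-than (scalar, for illustration; assumes the usual x86
wrap-around conversion):

  #include <stdint.h>
  #include <assert.h>

  int
  main (void)
  {
    uint32_t a = 0x80000000u, b = 1;
    assert (a > b);                      /* unsigned: 2147483648 > 1.  */
    assert ((int32_t) a < (int32_t) b);  /* signed view, as pcmpgt sees it.  */
    /* Only EQ/NE agree between the signed and unsigned interpretations.  */
    return 0;
  }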

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

PR target/116940
* config/i386/sse.md (*avx2_pcmp3_7): Change
UNSPEC_PCMP_ITER to UNSPEC_PCMP.
(*avx2_pcmp3_8): New pre_reload
define_insn_and_split.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr116940.c: New test.
---
 gcc/config/i386/sse.md   | 27 ++-
 gcc/testsuite/gcc.target/i386/pr116940.c | 28 
 2 files changed, 54 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr116940.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index d6e2135423d..944b73a8e83 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -18142,7 +18142,7 @@ (define_insn_and_split "*avx2_pcmp3_7"
[(match_operand:VI_128_256 3 "nonimmediate_operand")
 (match_operand:VI_128_256 4 "nonimmediate_operand")
 (match_operand:SI 5 "const_0_to_7_operand")]
-UNSPEC_PCMP_ITER)))]
+UNSPEC_PCMP)))]
   "TARGET_AVX512VL && ix86_pre_reload_split ()
  /* NE is commutative.  */
&& (INTVAL (operands[5]) == 4
@@ -18165,6 +18165,31 @@ (define_insn_and_split "*avx2_pcmp3_7"
   DONE;
 })
 
+(define_insn_and_split "*avx2_pcmp3_8"
+ [(set (match_operand:VI_128_256  0 "register_operand")
+   (vec_merge:VI_128_256
+ (match_operand:VI_128_256 1 "const0_operand")
+ (match_operand:VI_128_256 2 "vector_all_ones_operand")
+ (unspec:
+   [(match_operand:VI_128_256 3 "nonimmediate_operand")
+(match_operand:VI_128_256 4 "nonimmediate_operand")
+(match_operand:SI 5 "const_0_to_7_operand")]
+UNSPEC_UNSIGNED_PCMP)))]
+  "TARGET_AVX512VL && ix86_pre_reload_split ()
+ /* NE is commutative.  */
+   && INTVAL (operands[5]) == 4"
+
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  if (MEM_P (operands[3]))
+operands[3] = force_reg (mode, operands[3]);
+  emit_move_insn (operands[0], gen_rtx_fmt_ee (EQ, mode,
+  operands[3], operands[4]));
+  DONE;
+})
+
 (define_expand "_eq3"
   [(set (match_operand: 0 "register_operand")
(unspec:
diff --git a/gcc/testsuite/gcc.target/i386/pr116940.c 
b/gcc/testsuite/gcc.target/i386/pr116940.c
new file mode 100644
index 000..721596bb8bf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116940.c
@@ -0,0 +1,28 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512vl" } */
+/* { dg-require-effective-target avx512vl } */
+
+#define AVX512VL
+#include "avx512f-helper.h"
+
+typedef __attribute__((__vector_size__ (16))) unsigned V;
+
+short s;
+
+V
+foo ()
+{
+  return ~(-(V){ 0, 0, 0, 1 } <= s);
+}
+
+void
+test_128 ()
+{
+  V x = foo ();
+  if (x[0] != 0 || x[1] != 0 || x[2] != 0 || x[3] != 0xffffffff)
+__builtin_abort();
+}
+
+void
+test_256 ()
+{}
-- 
2.31.1



[PATCH 1/2] [x86] Add new microarchitecture tune for SRF/GRR/CWF.

2024-10-08 Thread liuhongt
For Crestmont, 4-operand vex blendv instructions come from MSROM and
are slower than the 3-instruction sequence (op1 & mask) | (op2 & ~mask).
The legacy blendv instructions can still be handled by the decoder.

The patch adds a new tune which is enabled for all processors except
for SRF/CWF. SRF/CWF will use vpand + vpandn + vpor instead of
vpblendvb (similar for vblendvps/vblendvpd).
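
A hedged sketch (not from the patch) of the 3-instruction replacement,
assuming the mask elements are all-ones/all-zeros as vector compares
produce:

#include <immintrin.h>

/* Same result as _mm_blendv_epi8 (op1, op2, mask) when every mask byte
   is 0x00 or 0xff: (op1 & ~mask) | (op2 & mask), i.e. pandn+pand+por.  */
__m128i
blendv_via_logic (__m128i op1, __m128i op2, __m128i mask)
{
  return _mm_or_si128 (_mm_andnot_si128 (mask, op1),
		       _mm_and_si128 (mask, op2));
}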

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_expand_sse_movcc): Guard
instruction blendv generation under new tune.
* config/i386/i386.h (TARGET_SSE_MOVCC_USE_BLENDV): New Macro.
* config/i386/x86-tune.def (X86_TUNE_SSE_MOVCC_USE_BLENDV):
New tune.
---
 gcc/config/i386/i386-expand.cc| 24 +--
 gcc/config/i386/i386.h|  2 ++
 gcc/config/i386/x86-tune.def  |  8 +++
 .../gcc.target/i386/sse_movcc_use_blendv.c| 12 ++
 4 files changed, 34 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/sse_movcc_use_blendv.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 124cb976ec8..e4087cccb7c 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -4254,23 +4254,23 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
   switch (mode)
 {
 case E_V2SFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_mmx_blendvps;
   break;
 case E_V4SFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_sse4_1_blendvps;
   break;
 case E_V2DFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_sse4_1_blendvpd;
   break;
 case E_SFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_sse4_1_blendvss;
   break;
 case E_DFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_sse4_1_blendvsd;
   break;
 case E_V8QImode:
@@ -4278,7 +4278,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
 case E_V4HFmode:
 case E_V4BFmode:
 case E_V2SImode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
{
  gen = gen_mmx_pblendvb_v8qi;
  blend_mode = V8QImode;
@@ -4288,14 +4288,14 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
 case E_V2HImode:
 case E_V2HFmode:
 case E_V2BFmode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
{
  gen = gen_mmx_pblendvb_v4qi;
  blend_mode = V4QImode;
}
   break;
 case E_V2QImode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
gen = gen_mmx_pblendvb_v2qi;
   break;
 case E_V16QImode:
@@ -4305,18 +4305,18 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
 case E_V4SImode:
 case E_V2DImode:
 case E_V1TImode:
-  if (TARGET_SSE4_1)
+  if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
{
  gen = gen_sse4_1_pblendvb;
  blend_mode = V16QImode;
}
   break;
 case E_V8SFmode:
-  if (TARGET_AVX)
+  if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
gen = gen_avx_blendvps256;
   break;
 case E_V4DFmode:
-  if (TARGET_AVX)
+  if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
gen = gen_avx_blendvpd256;
   break;
 case E_V32QImode:
@@ -4325,7 +4325,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, 
rtx op_false)
 case E_V16BFmode:
 case E_V8SImode:
 case E_V4DImode:
-  if (TARGET_AVX2)
+  if (TARGET_AVX2 && TARGET_SSE_MOVCC_USE_BLENDV)
{
  gen = gen_avx2_pblendvb;
  blend_mode = V32QImode;
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index c1ec92ffb15..f01f31d208a 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -462,6 +462,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_DEST_FALSE_DEP_FOR_GLC]
 #define TARGET_SLOW_STC ix86_tune_features[X86_TUNE_SLOW_STC]
 #define TARGET_USE_RCR ix86_tune_features[X86_TUNE_USE_RCR]
+#define TARGET_SSE_MOVCC_USE_BLENDV \
+   ix86_tune_features[X86_TUNE_SSE_MOVCC_USE_BLENDV]
 
 /* Feature tests against the various architecture variations.  */
 enum ix86_arch_indices {
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 3d123da95f0..b815b6dc255 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -534,6 +534,14 @@ DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, 
"avoid_fma512_chains", m_ZNVER5)
 DEF_TUNE (X86_TUNE_V2DF_REDUCTION_PREFER_HADDPD,
  "v2df_reduction_prefer_haddpd", m_NONE)
 
+/* X86_TUNE_SSE_MOVCC_US

[PATCH 2/2] [x86] Add a new tune avx256_avoid_vec_perm for SRF.

2024-10-08 Thread liuhongt
According to the Intel SOM [1], for Crestmont, most 256-bit Intel AVX2
instructions can be decomposed into two independent 128-bit
micro-operations, except for a subset of Intel AVX2 instructions,
known as cross-lane operations, which can only compute the result for an
element by utilizing one or more sources belonging to other elements.

The 256-bit instructions listed below use more operand sources than
can be natively supported by a single reservation station within these
microarchitectures. They are decomposed into two μops, where the first
μop resolves a subset of operand dependencies across two cycles. The
dependent second μop executes the 256-bit operation by using a single
128-bit execution port for two consecutive cycles with a five-cycle
latency for a total latency of seven cycles.

VPERM2I128 ymm1, ymm2, ymm3/m256, imm8
VPERM2F128 ymm1, ymm2, ymm3/m256, imm8
VPERMPD ymm1, ymm2/m256, imm8
VPERMPS ymm1, ymm2, ymm3/m256
VPERMD ymm1, ymm2, ymm3/m256
VPERMQ ymm1, ymm2/m256, imm8

Instead of setting tune avx128_optimal for SRF, the patch adds a new
tune avx256_avoid_vec_perm for it. So by default the vectorizer still
uses a 256-bit VF if the cost is profitable, but lowers it to 128 bits
whenever a 256-bit vec_perm is needed for auto-vectorization. Without
vec_perm, performance of 256-bit vectorization should be similar to
128-bit (some benchmark results show it's even better than 128-bit
vectorization since it enables more parallelism for convert cases).

[1] 
https://www.intel.com/content/www/us/en/content-details/814198/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html
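
A hedged example (not from the patch): reversing an array needs a
cross-lane permutation (e.g. vpermd) when vectorized with 256-bit
vectors, so with the new tune the vectorizer would fall back to
128-bit vectors for a loop like this.

void
reverse (int *a, const int *b, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = b[n - 1 - i];
}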

gcc/ChangeLog:

* config/i386/i386.cc (ix86_vector_costs::ix86_vector_costs):
Add new member m_num_avx256_vec_perm.
(ix86_vector_costs::add_stmt_cost): Record 256-bit vec_perm.
(ix86_vector_costs::finish_cost): Prevent vectorization for
TARGET_AVX256_AVOID_VEC_PERM when there's a 256-bit vec_perm
instruction.
* config/i386/i386.h (TARGET_AVX256_AVOID_VEC_PERM): New
Macro.
* config/i386/x86-tune.def (X86_TUNE_AVX256_SPLIT_REGS): Add
m_CORE_ATOM.
(X86_TUNE_AVX256_AVOID_VEC_PERM): New tune.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx256_avoid_vec_perm.c: New test.
---
 gcc/config/i386/i386.cc   | 14 +++-
 gcc/config/i386/i386.h|  2 ++
 gcc/config/i386/x86-tune.def  |  7 +-
 .../gcc.target/i386/avx256_avoid_vec_perm.c   | 22 +++
 4 files changed, 43 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 7dbae1d72e3..77567b233d5 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24816,12 +24816,15 @@ private:
  where we know it's not loaded from memory.  */
   unsigned m_num_gpr_needed[3];
   unsigned m_num_sse_needed[3];
+  /* Number of 256-bit vector permutations.  */
+  unsigned m_num_avx256_vec_perm[3];
 };
 
 ix86_vector_costs::ix86_vector_costs (vec_info* vinfo, bool costing_for_scalar)
   : vector_costs (vinfo, costing_for_scalar),
 m_num_gpr_needed (),
-m_num_sse_needed ()
+m_num_sse_needed (),
+m_num_avx256_vec_perm ()
 {
 }
 
@@ -25055,6 +25058,10 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
   if (stmt_cost == -1)
 stmt_cost = ix86_builtin_vectorization_cost (kind, vectype, misalign);
 
+  if (kind == vec_perm && vectype
+  && GET_MODE_SIZE (TYPE_MODE (vectype)) == 32)
+m_num_avx256_vec_perm[where]++;
+
   /* Penalize DFmode vector operations for Bonnell.  */
   if (TARGET_CPU_P (BONNELL) && kind == vector_stmt
   && vectype && GET_MODE_INNER (TYPE_MODE (vectype)) == DFmode)
@@ -25124,6 +25131,11 @@ ix86_vector_costs::finish_cost (const vector_costs 
*scalar_costs)
 
   ix86_vect_estimate_reg_pressure ();
 
+  for (int i = 0; i != 3; i++)
+if (m_num_avx256_vec_perm[i]
+   && TARGET_AVX256_AVOID_VEC_PERM)
+  m_costs[i] = INT_MAX;
+
   vector_costs::finish_cost (scalar_costs);
 }
 
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index f01f31d208a..d57a1ca3e5c 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -439,6 +439,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL]
 #define TARGET_AVX256_SPLIT_REGS \
ix86_tune_features[X86_TUNE_AVX256_SPLIT_REGS]
+#define TARGET_AVX256_AVOID_VEC_PERM \
+   ix86_tune_features[X86_TUNE_AVX256_AVOID_VEC_PERM]
 #define TARGET_AVX512_SPLIT_REGS \
ix86_tune_features[X86_TUNE_AVX512_SPLIT_REGS]
 #define TARGET_GENERAL_REGS_SSE_SPILL \
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index b815b6dc255..6ebb2fd3414 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -558,7 +558,7 @@ DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OP

[PATCH 0/2] Enable more SRF tuning

2024-10-08 Thread liuhongt
The series adds two tunes for SRF/CWF according to the Intel SOM for
the Crestmont microarchitecture.

1) Generate vpandn + vpand + vpor instead of vblendvps/vblendvpd/vpblendvb
instructions, since 4-operand VEX instructions come from MSROM on Crestmont
and are slower than the 3-instruction sequence.

2) Don't do 256-bit auto-vectorization when there's a cross-lane
permutation; use 128-bit vectorization instead.
Instead of setting tune avx128_optimal for SRF, the patch adds a new
tune avx256_avoid_vec_perm for it. So by default the vectorizer still
uses a 256-bit VF if the cost is profitable, but lowers it to 128 bits
whenever a 256-bit vec_perm is needed for auto-vectorization. Without
vec_perm, performance of 256-bit vectorization should be similar to
128-bit (some benchmark results show it's even better than 128-bit
vectorization since it enables more parallelism for convert cases).


Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
The patch generally improves SPEC2017 allrate geomean by 1% with 
-march=sierraforest -Ofast on SRF.

Ready push to trunk.

liuhongt (2):
  [x86] Add new microarchitecture tune for SRF/GRR/CWF.
  [x86] Add a new tune avx256_avoid_vec_perm for SRF.

 gcc/config/i386/i386-expand.cc| 24 +--
 gcc/config/i386/i386.cc   | 14 ++-
 gcc/config/i386/i386.h|  4 
 gcc/config/i386/x86-tune.def  | 15 +++-
 .../gcc.target/i386/avx256_avoid_vec_perm.c   | 22 +
 .../gcc.target/i386/sse_movcc_use_blendv.c| 12 ++
 6 files changed, 77 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx256_avoid_vec_perm.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse_movcc_use_blendv.c

-- 
2.31.1



[PATCH v2 2/2] Adjust testcases after relaxing O2 vectorization.

2024-10-08 Thread liuhongt
gcc/testsuite/ChangeLog:

* gcc.dg/fstack-protector-strong.c: Adjust
scan-assembler-times.
* gcc.dg/graphite/scop-6.c: Add
-Wno-aggressive-loop-optimizations.
* gcc.dg/graphite/scop-9.c: Ditto.
* gcc.dg/tree-ssa/ivopts-lt-2.c: Add -fno-tree-vectorize.
* gcc.dg/tree-ssa/ivopts-lt.c: Ditto.
* gcc.dg/tree-ssa/loop-16.c: Ditto.
* gcc.dg/tree-ssa/loop-28.c: Ditto.
* gcc.dg/tree-ssa/loop-bound-2.c: Ditto.
* gcc.dg/tree-ssa/loop-bound-4.c: Ditto.
* gcc.dg/tree-ssa/loop-bound-6.c: Ditto.
* gcc.dg/tree-ssa/predcom-4.c: Ditto.
* gcc.dg/tree-ssa/predcom-5.c: Ditto.
* gcc.dg/tree-ssa/scev-11.c: Ditto.
* gcc.dg/tree-ssa/scev-9.c: Ditto.
* gcc.dg/tree-ssa/split-path-11.c: Ditto.
* gcc.dg/unroll-8.c: Ditto.
* gcc.dg/var-expand1.c: Ditto.
* gcc.dg/vect/vect-cost-model-6.c: Ditto.
* gcc.target/i386/pr86270.c: Ditto.
* gcc.target/i386/pr86722.c: Ditto.
* gcc.target/x86_64/abi/callabi/leaf-2.c: Ditto.
---
 gcc/testsuite/gcc.dg/fstack-protector-strong.c   | 2 +-
 gcc/testsuite/gcc.dg/graphite/scop-6.c   | 1 +
 gcc/testsuite/gcc.dg/graphite/scop-9.c   | 1 +
 gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt-2.c  | 2 +-
 gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c| 2 +-
 gcc/testsuite/gcc.dg/tree-ssa/loop-16.c  | 2 +-
 gcc/testsuite/gcc.dg/tree-ssa/loop-28.c  | 2 +-
 gcc/testsuite/gcc.dg/tree-ssa/loop-bound-2.c | 2 +-
 gcc/testsuite/gcc.dg/tree-ssa/loop-bound-4.c | 2 +-
 gcc/testsuite/gcc.dg/tree-ssa/loop-bound-6.c | 2 +-
 gcc/testsuite/gcc.dg/tree-ssa/predcom-4.c| 2 +-
 gcc/testsuite/gcc.dg/tree-ssa/predcom-5.c| 2 +-
 gcc/testsuite/gcc.dg/tree-ssa/scev-11.c  | 2 +-
 gcc/testsuite/gcc.dg/tree-ssa/scev-9.c   | 2 +-
 gcc/testsuite/gcc.dg/tree-ssa/split-path-11.c| 2 +-
 gcc/testsuite/gcc.dg/unroll-8.c  | 3 +--
 gcc/testsuite/gcc.dg/var-expand1.c   | 2 +-
 gcc/testsuite/gcc.dg/vect/vect-cost-model-6.c| 2 +-
 gcc/testsuite/gcc.target/i386/pr86270.c  | 2 +-
 gcc/testsuite/gcc.target/i386/pr86722.c  | 2 +-
 gcc/testsuite/gcc.target/x86_64/abi/callabi/leaf-2.c | 2 +-
 21 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/fstack-protector-strong.c 
b/gcc/testsuite/gcc.dg/fstack-protector-strong.c
index 94dc3508f1a..b9f63966b7c 100644
--- a/gcc/testsuite/gcc.dg/fstack-protector-strong.c
+++ b/gcc/testsuite/gcc.dg/fstack-protector-strong.c
@@ -154,4 +154,4 @@ void foo12 ()
   global3 ();
 }
 
-/* { dg-final { scan-assembler-times "stack_chk_fail" 12 } } */
+/* { dg-final { scan-assembler-times "stack_chk_fail" 11 } } */
diff --git a/gcc/testsuite/gcc.dg/graphite/scop-6.c 
b/gcc/testsuite/gcc.dg/graphite/scop-6.c
index 9bc1d9f4ccd..6ea887d9041 100644
--- a/gcc/testsuite/gcc.dg/graphite/scop-6.c
+++ b/gcc/testsuite/gcc.dg/graphite/scop-6.c
@@ -26,4 +26,5 @@ int toto()
   return a[3][5] + b[2];
 }
 
+/* { dg-additional-options "-Wno-aggressive-loop-optimizations" } */
 /* { dg-final { scan-tree-dump-times "number of SCoPs: 1" 1 "graphite"} } */
diff --git a/gcc/testsuite/gcc.dg/graphite/scop-9.c 
b/gcc/testsuite/gcc.dg/graphite/scop-9.c
index b19291be2f8..2a36bf92fd4 100644
--- a/gcc/testsuite/gcc.dg/graphite/scop-9.c
+++ b/gcc/testsuite/gcc.dg/graphite/scop-9.c
@@ -21,4 +21,5 @@ int toto()
   return a[3][5] + b[2];
 }
 
+/* { dg-additional-options "-Wno-aggressive-loop-optimizations" } */
 /* { dg-final { scan-tree-dump-times "number of SCoPs: 1" 1 "graphite"} } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt-2.c 
b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt-2.c
index bdbdbff19ff..be325775fbb 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt-2.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt-2.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -fno-tree-loop-distribute-patterns -fdump-tree-ivopts" } 
*/
+/* { dg-options "-O2 -fno-tree-vectorize -fno-tree-loop-distribute-patterns 
-fdump-tree-ivopts" } */
 /* { dg-skip-if "PR68644" { hppa*-*-* powerpc*-*-* } } */
 
 void
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c 
b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
index 71d7f672c44..8d2b9d39355 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ivopts-lt.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -fno-tree-loop-distribute-patterns -fdump-tree-ivopts" } 
*/
+/* { dg-options "-O2 -fno-tree-vectorize -fno-tree-loop-distribute-patterns 
-fdump-tree-ivopts" } */
 /* { dg-require-effective-target stdint_types } */
 
 #include "stdint.h"
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loop-16.c 
b/gcc/testsuite/gcc.dg/tree-ssa/loop-16.c
index 6bcb56cf3a9..92587f17df0 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/loop-16.c
+++ b/gcc/testsuite/gcc.dg/tree

[PATCH v2 1/2] Enable vectorization for unknown tripcount in very cheap cost model but disable epilogue vectorization.

2024-10-08 Thread liuhongt
>So should we adjust very-cheap to allow niter peeling as proposed or
>should we switch the default at -O2 to cheap?
I prefer the former.

Update in V2:
Adjust testcases after relaxing O2 vectorization.

Ok for trunk?

gcc/ChangeLog:

* tree-vect-loop.cc (vect_analyze_loop_costing): Enable
vectorization for LOOP_VINFO_PEELING_FOR_NITER in very cheap
cost model.
(vect_analyze_loop): Disable epilogue vectorization in very
cheap cost model.
---
 gcc/tree-vect-loop.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 242d5e2d916..06afd8cae79 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -2356,8 +2356,7 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo,
  a copy of the scalar code (even if we might be able to vectorize it).  */
   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
   && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
- || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
- || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
+ || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
 {
   if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -3638,7 +3637,8 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
   /* No code motion support for multiple epilogues so for now
  not supported when multiple exits.  */
 && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
-&& !loop->simduid);
+&& !loop->simduid
+&& loop_cost_model (loop) > VECT_COST_MODEL_VERY_CHEAP);
   if (!vect_epilogues)
 return first_loop_vinfo;
 
-- 
2.31.1



[PATCH] [x86] Define VECTOR_STORE_FLAG_VALUE

2024-09-24 Thread liuhongt
Return constm1_rtx when GET_MODE_CLASS (MODE) == MODE_VECTOR_INT.
Otherwise NULL_RTX.
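
A hedged illustration (not from the patch) of what the macro encodes:
on x86 a true vector-compare element is all-ones, so with the macro
defined the RTL simplifiers may fold an always-true integer vector
comparison into a constant all-ones vector.

typedef int v4si __attribute__((vector_size (16)));

v4si
always_true (v4si x)
{
  /* For integer vectors each true element is -1 (all bits set),
     matching VECTOR_STORE_FLAG_VALUE == constm1_rtx.  */
  return x == x;
}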

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

* config/i386/i386.h (VECTOR_STORE_FLAG_VALUE): New macro.

gcc/testsuite/ChangeLog:
* gcc.dg/rtl/x86_64/vector_eq.c: New test.
---
 gcc/config/i386/i386.h  |  5 +++-
 gcc/testsuite/gcc.dg/rtl/x86_64/vector_eq.c | 26 +
 2 files changed, 30 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.dg/rtl/x86_64/vector_eq.c

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index c1ec92ffb15..b12be41424f 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -899,7 +899,10 @@ extern const char *host_detect_local_cpu (int argc, const 
char **argv);
and give entire struct the alignment of an int.  */
 /* Required on the 386 since it doesn't have bit-field insns.  */
 #define PCC_BITFIELD_TYPE_MATTERS 1
-
+
+#define VECTOR_STORE_FLAG_VALUE(MODE) \
+  (GET_MODE_CLASS (MODE) == MODE_VECTOR_INT ? constm1_rtx : NULL_RTX)
+
 /* Standard register usage.  */
 
 /* This processor has special stack-like registers.  See reg-stack.cc
diff --git a/gcc/testsuite/gcc.dg/rtl/x86_64/vector_eq.c 
b/gcc/testsuite/gcc.dg/rtl/x86_64/vector_eq.c
new file mode 100644
index 000..b82603d0b64
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/rtl/x86_64/vector_eq.c
@@ -0,0 +1,26 @@
+/* { dg-do compile { target x86_64-*-* } } */
+/* { dg-additional-options "-O2 -march=x86-64-v3" } */
+
+typedef int v4si __attribute__((vector_size(16)));
+
+v4si __RTL (startwith ("vregs")) foo (void)
+{
+(function "foo"
+  (insn-chain
+(block 2
+  (edge-from entry (flags "FALLTHRU"))
+  (cnote 1 [bb 2] NOTE_INSN_BASIC_BLOCK)
+  (cnote 2 NOTE_INSN_FUNCTION_BEG)
+  (cinsn 3 (set (reg:V4SI <0>) (const_vector:V4SI [(const_int 0) 
(const_int 0) (const_int 0) (const_int 0)])))
+  (cinsn 5 (set (reg:V4SI <2>)
+   (eq:V4SI (reg:V4SI <0>) (reg:V4SI <1>))))
+  (cinsn 6 (set (reg:V4SI <3>) (reg:V4SI <2>)))
+  (cinsn 7 (set (reg:V4SI xmm0) (reg:V4SI <3>)))
+  (edge-to exit (flags "FALLTHRU"))
+)
+  )
+ (crtl (return_rtx (reg/i:V4SI xmm0)))
+)
+}
+
+/* { dg-final { scan-assembler-not "vpxor" } } */
-- 
2.31.1



[RFC PATCH] Enable vectorization for unknown tripcount in very cheap cost model but disable epilogue vectorization.

2024-09-10 Thread liuhongt
GCC 12 enables vectorization at O2 with the very-cheap cost model, which is
restricted to constant trip counts. The vectorization capability is very
limited, in consideration of the code-size impact.

The patch extends the very-cheap cost model a little bit to support variable
trip counts, but still disables peeling for gaps/alignment, runtime alias
checking and epilogue vectorization, in consideration of code size.

So there are at most two versions of a loop for O2 vectorization: one
vectorized main loop and one scalar/remainder loop.

For example:

void
foo1 (int* __restrict a, int* b, int* c, int n)
{
 for (int i = 0; i != n; i++)
  a[i] = b[i] + c[i];
}

with -O2 -march=x86-64-v3, will be vectorized to

.L10:
vmovdqu (%r8,%rax), %ymm0
vpaddd  (%rsi,%rax), %ymm0, %ymm0
vmovdqu %ymm0, (%rdi,%rax)
addq$32, %rax
cmpq%rdx, %rax
jne .L10
movl%ecx, %eax
andl$-8, %eax
cmpl%eax, %ecx
je  .L21
vzeroupper
.L12:
movl(%r8,%rax,4), %edx
addl(%rsi,%rax,4), %edx
movl%edx, (%rdi,%rax,4)
addq$1, %rax
cmpl%eax, %ecx
jne .L12

As measured with SPEC2017 on EMR, the patch(N-Iter) improves performance by 
4.11%
with extra 2.8% codeisze, and cheap cost model improve performance by 5.74% with
extra 8.88% codesize. The details are as below

Performance measured with -march=x86-64-v3 -O2 on EMR

N-Iter  cheap cost model
500.perlbench_r -0.12%  -0.12%
502.gcc_r   0.44%   -0.11%  
505.mcf_r   0.17%   4.46%
520.omnetpp_r   0.28%   -0.27%
523.xalancbmk_r 0.00%   5.93%
525.x264_r  -0.09%  23.53%
531.deepsjeng_r 0.19%   0.00%
541.leela_r 0.22%   0.00%
548.exchange2_r -11.54% -22.34%
557.xz_r0.74%   0.49%
GEOMEAN INT -1.04%  0.60%

503.bwaves_r3.13%   4.72%
507.cactuBSSN_r 1.17%   0.29%
508.namd_r  0.39%   6.87%
510.parest_r3.14%   8.52%
511.povray_r0.10%   -0.20%
519.lbm_r   -0.68%  10.14%
521.wrf_r   68.20%  76.73%
526.blender_r   0.12%   0.12%
527.cam4_r  19.67%  23.21%
538.imagick_r   0.12%   0.24%
544.nab_r   0.63%   0.53%
549.fotonik3d_r 14.44%  9.43%
554.roms_r  12.39%  0.00%
GEOMEAN FP  8.26%   9.41%
GEOMEAN ALL 4.11%   5.74%

Code size impact
N-Iter  cheap cost model
500.perlbench_r 0.22%   1.03%
502.gcc_r   0.25%   0.60%   
505.mcf_r   0.00%   32.07%
520.omnetpp_r   0.09%   0.31%
523.xalancbmk_r 0.08%   1.86%
525.x264_r  0.75%   7.96%
531.deepsjeng_r 0.72%   3.28%
541.leela_r 0.18%   0.75%
548.exchange2_r 8.29%   12.19%
557.xz_r0.40%   0.60%
GEOMEAN INT 1.07%   5.71%

503.bwaves_r12.89%  21.59%
507.cactuBSSN_r 0.90%   20.19%
508.namd_r  0.77%   14.75%
510.parest_r0.91%   3.91%
511.povray_r0.45%   4.08%
519.lbm_r   0.00%   0.00%
521.wrf_r   5.97%   12.79%
526.blender_r   0.49%   3.84%
527.cam4_r  1.39%   3.28%
538.imagick_r   1.86%   7.78%
544.nab_r   0.41%   3.00%
549.fotonik3d_r 25.50%  47.47%
554.roms_r  5.17%   13.01%
GEOMEAN FP  4.14%   11.38%
GEOMEAN ALL 2.80%   8.88%


The only regression is from 548.exchange2_r: the vectorization of the inner
loop in each layer of the 9-layer loop nest increases register pressure and
causes more spills.
- block(rnext:9, 1, i1) = block(rnext:9, 1, i1) + 10
  - block(rnext:9, 2, i2) = block(rnext:9, 2, i2) + 10
.
- block(rnext:9, 9, i9) = block(rnext:9, 9, i9) + 10
...
- block(rnext:9, 2, i2) = block(rnext:9, 2, i2) + 10
- block(rnext:9, 1, i1) = block(rnext:9, 1, i1) + 10

Looks like aarch64 doesn't have the issue because aarch64 has 32 GPRs, but
x86 only has 16. I have an extra patch to prevent loop vectorization in
deeply nested loops for the x86 backend, which can bring the performance
back.

For 503.bwaves_r/505.mcf_r/507.cactuBSSN_r/508.namd_r, the cheap cost model
increases code size a lot but doesn't improve performance, and N-Iter is
much better there for code size.


Any comments?


gcc/ChangeLog:

* tree-vect-loop.cc (vect_analyze_loop_costing): Enable
vectorization for LOOP_VINFO_PEELING_FOR_NITER in very cheap
cost model.
(vect_analyze_loop): Disable epilogue vectorization in very
cheap cost model.
---
 gcc/tree-vect-loop.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 242d5e2d916..06afd8cae79 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -2356,8 +2356,7 @@ vect_analyze

[PATCH] Enable tune fuse_move_and_alu for GNR/GNR-D.

2024-09-10 Thread liuhongt
According to the Intel Software Optimization Manual [1], the Redwood Cove
microarchitecture supports LD+OP and MOV+OP macro fusion.

The patch enables the MOV+OP tune for GNR.
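
A hedged example (not from the patch) of the kind of sequence the tune
targets: at -O2 GCC typically emits "movq %rdi, %rax; subq %rsi, %rax"
here, a reg-reg mov whose destination feeds an ALU op, which Redwood
Cove can macro-fuse.

long
sub (long a, long b)
{
  /* Subtraction can't use lea, so a mov+sub pair is emitted.  */
  return a - b;
}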

[1] 
https://www.intel.com/content/www/us/en/content-details/814198/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

* config/i386/x86-tune.def (X86_TUNE_FUSE_MOV_AND_ALU): Enable
for GNR and GNR-D.
---
 gcc/config/i386/x86-tune.def | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index d7e2ad7fd25..3d123da95f0 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -153,7 +153,8 @@ DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, 
"fuse_alu_and_branch",
 /* X86_TUNE_FUSE_MOV_AND_ALU: mov and alu in case mov is reg-reg mov
and the destination is used by alu.  alu must be one of
ADD, ADC, AND, XOR, OR, SUB, SBB, INC, DEC, NOT, SAL, SHL, SHR, SAR.  */
-DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu", m_ZNVER5)
+DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu",
+m_ZNVER5 | m_GRANITERAPIDS | m_GRANITERAPIDS_D)
 
 /*/
 /* Function prologue, epilogue and function calling sequences.   */
-- 
2.31.1



[PATCH] Don't force_reg operands[3] when it's not const0_rtx.

2024-09-08 Thread liuhongt
It fixes the regression introduced by

a51f2fc0d80869ab079a93cc3858f24a1fd28237 is the first bad commit
commit a51f2fc0d80869ab079a93cc3858f24a1fd28237
Author: liuhongt 
Date:   Wed Sep 4 15:39:17 2024 +0800

Handle const0_operand for *avx2_pcmp<mode>3_1.

caused

FAIL: gcc.target/i386/pr59539-1.c scan-assembler-times vmovdqu|vmovups 1

To reproduce:

$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="i386.exp=gcc.target/i386/pr59539-1.c --target_board='unix{-m32\ 
-march=cascadelake}'"
$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="i386.exp=gcc.target/i386/pr59539-1.c --target_board='unix{-m64\ 
-march=cascadelake}'"

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.


gcc/ChangeLog:

* config/i386/sse.md (*avx2_pcmp<mode>3_1): Don't force_reg
operands[3] when it's not const0_rtx.
---
 gcc/config/i386/sse.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 1946d3513be..1ae61182d0c 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17929,7 +17929,8 @@ (define_insn_and_split "*avx2_pcmp<mode>3_1"
   if (INTVAL (operands[5]) == 1)
 std::swap (operands[3], operands[4]);
 
-  operands[3] = force_reg (<MODE>mode, operands[3]);
+  if (operands[3] == CONST0_RTX (<MODE>mode))
+    operands[3] = force_reg (<MODE>mode, operands[3]);
   if (operands[4] == CONST0_RTX (<MODE>mode))
     operands[4] = force_reg (<MODE>mode, operands[4]);
 
-- 
2.31.1



[PATCH] Handle const0_operand for *avx2_pcmp<mode>3_1.

2024-09-04 Thread liuhongt
<avx512>_eq<mode>3_1 supports nonimm_or_0_operand for op1 and op2, but
pass_combine would fail to lower an AVX512 comparison back to an AVX2
one when op1/op2 is const0_rtx. That's because the splitter only
supports nonimmediate_operand.
Failed to match this instruction:
(set (reg/i:V16QI 20 xmm0)
(vec_merge:V16QI (const_vector:V16QI [
(const_int -1 [0xffffffffffffffff]) repeated x16
])
(const_vector:V16QI [
(const_int 0 [0]) repeated x16
])
(unspec:HI [
(reg:V16QI 105 [ a ])
(const_vector:V16QI [
(const_int 0 [0]) repeated x16
])
(const_int 0 [0])
] UNSPEC_PCMP)))

The patch extends the predicates of the splitter to handle that.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

PR target/115517
* config/i386/sse.md (*avx2_pcmp<mode>3_1): Change predicates
of operands[3] and operands[4] from nonimmediate_operand to
nonimm_or_0_operand.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115517.c: New test.
---
 gcc/config/i386/sse.md   |  9 --
 gcc/testsuite/gcc.target/i386/pr115517.c | 38 
 2 files changed, 45 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115517.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3bf95f0b0e5..1946d3513be 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17908,8 +17908,8 @@ (define_insn_and_split "*avx2_pcmp<mode>3_1"
  (match_operand:VI_128_256 1 "vector_all_ones_operand")
  (match_operand:VI_128_256 2 "const0_operand")
 (unspec:<avx512fmaskmode>
-   [(match_operand:VI_128_256 3 "nonimmediate_operand")
-(match_operand:VI_128_256 4 "nonimmediate_operand")
+   [(match_operand:VI_128_256 3 "nonimm_or_0_operand")
+(match_operand:VI_128_256 4 "nonimm_or_0_operand")
 (match_operand:SI 5 "const_0_to_7_operand")]
 UNSPEC_PCMP)))]
   "TARGET_AVX512VL && ix86_pre_reload_split ()
@@ -17928,6 +17928,11 @@ (define_insn_and_split "*avx2_pcmp<mode>3_1"
 {
   if (INTVAL (operands[5]) == 1)
 std::swap (operands[3], operands[4]);
+
+  operands[3] = force_reg (<MODE>mode, operands[3]);
+  if (operands[4] == CONST0_RTX (<MODE>mode))
+    operands[4] = force_reg (<MODE>mode, operands[4]);
+
   enum rtx_code code = INTVAL (operands[5]) ? GT : EQ;
   emit_move_insn (operands[0], gen_rtx_fmt_ee (code, <MODE>mode,
   operands[3], operands[4]));
diff --git a/gcc/testsuite/gcc.target/i386/pr115517.c 
b/gcc/testsuite/gcc.target/i386/pr115517.c
new file mode 100644
index 000..e91d2c23a6b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115517.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpcmpeq" 4 } } */
+/* { dg-final { scan-assembler-not {(?n)%k[0-9]} } } */
+
+typedef char v16qi __attribute__((vector_size(16)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef long long v2di __attribute__((vector_size(16)));
+
+v16qi
+foo (v16qi a)
+{
+  v16qi b = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  return a == b;
+}
+
+v8hi
+foo2 (v8hi a)
+{
+  v8hi b = {0, 0, 0, 0, 0, 0, 0, 0};
+  return a == b;
+}
+
+v4si
+foo3 (v4si a)
+{
+  v4si b = {0, 0, 0, 0};
+  return a == b;
+}
+
+v2di
+foo4 (v2di a)
+{
+  v2di b = {0, 0};
+  return a == b;
+}
+
-- 
2.31.1



[PATCH] [x86] Check avx upper register for parallel.

2024-08-29 Thread liuhongt
> Can the above loop be a part of ix86_check_avx_upper_register, so this
> function would scan the full RTX for avx upper register?
Changed, also adjust ix86_check_avx_upper_stores and ix86_avx_u128_mode_needed
to either inline the old ix86_check_avx_upper_register or replace 
FOR_EACH_SUBRTX
with new ix86_check_avx_upper_register.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk and backport?

For function arguments/returns in BLK mode, the value is put in a
parallel with an expr_list, and the expr_list contains the real mode
and registers.
The current ix86_check_avx_upper_register only checked for SSE_REG_P, and
failed to handle that. The patch extends the handling to scan each subrtx.
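
A hedged sketch (not from the patch) of the kind of RTL involved: for a
BLK-mode aggregate return, the return rtx can look roughly like

  (parallel [(expr_list (reg:V16SI 20 xmm0) (const_int 0))])

SSE_REG_P is false on the parallel itself, so the check must walk the
subrtxes to find the 512-bit register inside.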

gcc/ChangeLog:

PR target/116512
* config/i386/i386.cc (ix86_check_avx_upper_register): Iterate
subrtx to scan for avx upper register.
(ix86_check_avx_upper_stores): Inline old
ix86_check_avx_upper_register.
(ix86_avx_u128_mode_needed): Ditto, and replace
FOR_EACH_SUBRTX with call to new
ix86_check_avx_upper_register.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr116512.c: New test.
---
 gcc/config/i386/i386.cc  | 36 +++-
 gcc/testsuite/gcc.target/i386/pr116512.c | 26 +
 2 files changed, 49 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr116512.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 224a78cc832..c40cee5b885 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -14881,9 +14881,19 @@ ix86_dirflag_mode_needed (rtx_insn *insn)
 static bool
 ix86_check_avx_upper_register (const_rtx exp)
 {
-  return (SSE_REG_P (exp)
- && !EXT_REX_SSE_REG_P (exp)
- && GET_MODE_BITSIZE (GET_MODE (exp)) > 128);
+  /* construct_container may return a parallel with expr_list
+ which contains the real reg and mode  */
+  subrtx_iterator::array_type array;
+  FOR_EACH_SUBRTX (iter, array, exp, NONCONST)
+{
+  const_rtx x = *iter;
+  if (SSE_REG_P (x)
+ && !EXT_REX_SSE_REG_P (x)
+ && GET_MODE_BITSIZE (GET_MODE (x)) > 128)
+   return true;
+}
+
+  return false;
 }
 
 /* Check if a 256bit or 512bit AVX register is referenced in stores.   */
@@ -14891,7 +14901,9 @@ ix86_check_avx_upper_register (const_rtx exp)
 static void
 ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data)
 {
-  if (ix86_check_avx_upper_register (dest))
+  if (SSE_REG_P (dest)
+  && !EXT_REX_SSE_REG_P (dest)
+  && GET_MODE_BITSIZE (GET_MODE (dest)) > 128)
 {
   bool *used = (bool *) data;
   *used = true;
@@ -14950,14 +14962,14 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
   return AVX_U128_CLEAN;
 }
 
-  subrtx_iterator::array_type array;
-
   rtx set = single_set (insn);
   if (set)
 {
   rtx dest = SET_DEST (set);
   rtx src = SET_SRC (set);
-  if (ix86_check_avx_upper_register (dest))
+  if (SSE_REG_P (dest)
+ && !EXT_REX_SSE_REG_P (dest)
+ && GET_MODE_BITSIZE (GET_MODE (dest)) > 128)
{
  /* This is an YMM/ZMM load.  Return AVX_U128_DIRTY if the
 source isn't zero.  */
@@ -14968,9 +14980,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
}
   else
{
- FOR_EACH_SUBRTX (iter, array, src, NONCONST)
-   if (ix86_check_avx_upper_register (*iter))
- return AVX_U128_DIRTY;
+ if (ix86_check_avx_upper_register (src))
+   return AVX_U128_DIRTY;
}
 
   /* This isn't YMM/ZMM load/store.  */
@@ -14981,9 +14992,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn)
  Hardware changes state only when a 256bit register is written to,
  but we need to prevent the compiler from moving optimal insertion
  point above eventual read from 256bit or 512 bit register.  */
-  FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST)
-if (ix86_check_avx_upper_register (*iter))
-  return AVX_U128_DIRTY;
+  if (ix86_check_avx_upper_register (PATTERN (insn)))
+return AVX_U128_DIRTY;
 
   return AVX_U128_ANY;
 }
diff --git a/gcc/testsuite/gcc.target/i386/pr116512.c 
b/gcc/testsuite/gcc.target/i386/pr116512.c
new file mode 100644
index 000..c2bc6c91b64
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116512.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
+
+#include 
+
+struct B {
+  union {
+__m512 f;
+__m512i s;
+  };
+};
+
+struct B foo(int n) {
+  struct B res;
+  res.s = _mm512_set1_epi32(n);
+
+  return res;
+}
+
+__m512i bar(int n) {
+  struct B res;
+  res.s = _mm512_set1_epi32(n);
+
+  return res.s;
+}
-- 
2.31.1



[PATCH] [x86] Check avx upper register for parallel.

2024-08-29 Thread liuhongt
For function arguments/returns in BLK mode, the value is put in a
parallel with an expr_list, and the expr_list contains the real mode
and registers.
The current ix86_check_avx_upper_register only checked for SSE_REG_P, and
failed to handle that. The patch extends the handling to scan each subrtx.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/116512
* config/i386/i386.cc (ix86_avx_u128_mode_entry): Iterate
each subrtx for potential rtx parallel to check avx upper
register.
(ix86_avx_u128_mode_exit): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr116512.c: New test.
---
 gcc/config/i386/i386.cc  | 28 
 gcc/testsuite/gcc.target/i386/pr116512.c | 26 ++
 2 files changed, 50 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr116512.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 224a78cc832..94d1a14056e 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -15148,8 +15148,18 @@ ix86_avx_u128_mode_entry (void)
 {
   rtx incoming = DECL_INCOMING_RTL (arg);
 
-  if (incoming && ix86_check_avx_upper_register (incoming))
-   return AVX_U128_DIRTY;
+  if (incoming)
+   {
+ /* construct_container may return a parallel with expr_list
+which contains the real reg and mode  */
+ subrtx_var_iterator::array_type array;
+ FOR_EACH_SUBRTX_VAR (iter, array, incoming, ALL)
+   {
+ rtx x = *iter;
+ if (ix86_check_avx_upper_register (x))
+   return AVX_U128_DIRTY;
+   }
+   }
 }
 
   return AVX_U128_CLEAN;
@@ -15184,8 +15194,18 @@ ix86_avx_u128_mode_exit (void)
 
   /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
  or 512 bit modes used in the function return register. */
-  if (reg && ix86_check_avx_upper_register (reg))
-return AVX_U128_DIRTY;
+  if (reg)
+{
+  /* construct_container may return a parallel with expr_list
+which contains the real reg and mode  */
+  subrtx_var_iterator::array_type array;
+  FOR_EACH_SUBRTX_VAR (iter, array, reg, ALL)
+   {
+ rtx x = *iter;
+ if (ix86_check_avx_upper_register (x))
+   return AVX_U128_DIRTY;
+   }
+}
 
   /* Exit mode is set to AVX_U128_DIRTY if there are 256bit or 512bit
  modes used in function arguments, otherwise return AVX_U128_CLEAN.
diff --git a/gcc/testsuite/gcc.target/i386/pr116512.c 
b/gcc/testsuite/gcc.target/i386/pr116512.c
new file mode 100644
index 000..c2bc6c91b64
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116512.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */
+
+#include 
+
+struct B {
+  union {
+__m512 f;
+__m512i s;
+  };
+};
+
+struct B foo(int n) {
+  struct B res;
+  res.s = _mm512_set1_epi32(n);
+
+  return res;
+}
+
+__m512i bar(int n) {
+  struct B res;
+  res.s = _mm512_set1_epi32(n);
+
+  return res.s;
+}
-- 
2.31.1



[PATCH v2 1/2] Enhance cse_insn to handle all-zeros and all-ones for vector mode.

2024-08-26 Thread liuhongt
> You are possibly overwriting src_related_elt - I'd suggest to either break
> here or do the loop below for each found elt?
Changed.

> Do we know that will always succeed?
1) validate_subreg allows subreg for 2 vector modes with same component modes.
2) gen_lowpart in cse.cc is defined as gen_lowpart_if_possible,
If it fails, it returns 0, just fallback to src_related = 0.

> So on the GIMPLE side we are trying to handle such cases by maintaining
> only a single element in the hashtables, thus hash and compare them
> the same - them in this case (vec_dup:M (reg:c)) and (vec_dup:N (reg:c)),
> leaving it up to the consumer to reject or pun mismatches.
rtx_cost will be used to decide if it's profitable
((subreg:M (reg:N) 0) vs (vec_dup:M (reg:c))); if M and N are
not tieable, rtx_cost will be expensive and the replacement will fail.
>
> For constants that would hold even more - note CSEing vs. duplicating
> constants might not be universally good.
Assuming you mean (reg:c) in (vec_dup:M (reg:c)) is from a constant: the
later RTL optimizers (i.e. forwprop/combine) will try to do further
simplification of the constants if rtx_cost is profitable.
For const_vector, it is handled by this other code:

5063  /* Try to re-materialize a vec_dup with an existing constant.   */
5064  rtx src_elt;
5065  if ((!src_eqv_here || CONSTANT_P (src_eqv_here))
5066  && const_vec_duplicate_p (src, &src_elt))
5067{


Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

Also try to handle redundant broadcasts when there's already a
broadcast to a bigger mode with exactly the same component value.
For broadcasts, the component mode needs to be the same.
For all-zeros/ones, we only need to check the bigger mode.
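
A hedged example (not from the patch, compile with -mavx512f): both
stores need an all-zeros value, and with the patch the 128-bit zero can
be CSEd as the lowpart of the 512-bit zero register instead of
materializing two separate vpxor/vpxord.

#include <immintrin.h>

void
zero_both (__m512i *p, __m128i *q)
{
  *p = _mm512_setzero_si512 ();  /* 512-bit all-zeros.  */
  *q = _mm_setzero_si128 ();     /* reusable as its lowpart.  */
}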

gcc/ChangeLog:

PR rtl-optimization/92080
* cse.cc (cse_insn): Handle all-ones/all-zeros, and vec_dup
with variables.
---
 gcc/cse.cc | 82 ++
 1 file changed, 82 insertions(+)

diff --git a/gcc/cse.cc b/gcc/cse.cc
index 65794ac5f2c..fab2f515f8c 100644
--- a/gcc/cse.cc
+++ b/gcc/cse.cc
@@ -4870,6 +4870,50 @@ cse_insn (rtx_insn *insn)
}
}
 
+  /* Try to handle special const_vector with elt 0 or -1.
+     They can be represented with different modes, and can be CSEd.  */
+  if (src_const && src_related == 0 && CONST_VECTOR_P (src_const)
+ && (src_const == CONST0_RTX (mode)
+ || src_const == CONSTM1_RTX (mode))
+ && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+   {
+ machine_mode mode_iter;
+
+ for (int l = 0; l != 2; l++)
+   {
+ FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_VECTOR_INT)
+   {
+ if (maybe_lt (GET_MODE_SIZE (mode_iter),
+   GET_MODE_SIZE (mode)))
+   continue;
+
+ rtx src_const_iter = (src_const == CONST0_RTX (mode)
+   ? CONST0_RTX (mode_iter)
+   : CONSTM1_RTX (mode_iter));
+
+ struct table_elt *const_elt
+   = lookup (src_const_iter, HASH (src_const_iter, mode_iter),
+ mode_iter);
+
+ if (const_elt == 0)
+   continue;
+
+ for (const_elt = const_elt->first_same_value;
+  const_elt; const_elt = const_elt->next_same_value)
+   if (REG_P (const_elt->exp))
+ {
+   src_related = gen_lowpart (mode, const_elt->exp);
+   break;
+ }
+
+ if (src_related != 0)
+   break;
+   }
+ if (src_related != 0)
+   break;
+   }
+   }
+
   /* See if we have a CONST_INT that is already in a register in a
 wider mode.  */
 
@@ -5041,6 +5085,44 @@ cse_insn (rtx_insn *insn)
}
}
 
+  /* Try to find something like (vec_dup:v16si (reg:c))
+for (vec_dup:v8si (reg:c)).  */
+  if (src_related == 0
+ && VECTOR_MODE_P (mode)
+ && GET_CODE (src) == VEC_DUPLICATE)
+   {
+ poly_uint64 nunits = GET_MODE_NUNITS (GET_MODE (src)) * 2;
+ rtx inner_elt = XEXP (src, 0);
+ machine_mode result_mode;
+ struct table_elt *src_related_elt = NULL;
+ while (related_vector_mode (mode, GET_MODE_INNER (mode),
+ nunits).exists (&result_mode))
+   {
+ rtx vec_dup = gen_rtx_VEC_DUPLICATE (result_mode, inner_elt);
+ struct table_elt* tmp = lookup (vec_dup, HASH (vec_dup, result_mode),
+ result_mode);
+ if (tmp)
+   {
+ src_related_elt = tmp;
+ break;
+   }
+ nunits *= 2;
+   }
+
+ if (src_related_elt)

[PATCH v2 2/2] [x86] Update ix86_mode_tieable_p and ix86_rtx_costs.

2024-08-26 Thread liuhongt
For mode2 bigger than 16 bytes, when it can be allocated to FIRST_SSE_REGS,
then it can only be allocated to ALL_SSE_REGS, and it can be tieable
to any mode1 with smaller size which is available to FIRST_SSE_REGS.
When mode2 is equal to 16 bytes, exclude non-vector modes (TI/TFmode).
This is needed for CSE of all-ones/all-zeros, since CSE checks costs with
ix86_modes_tieable_p on different-sized modes.

Also update ix86_rtx_costs to prevent CONST0_RTX from being propagated,
which would defeat CSE of CONST0_RTX.

gcc/ChangeLog:

PR target/92080
* config/i386/i386.cc (ix86_modes_tieable_p): Relax
GET_MODE_SIZE (mode1) to <= 64/32/16 bytes when mode1 can be
allocated to FIRST_SSE_REG; it doesn't need to be exactly the
same size when >= 16 bytes.
(ix86_rtx_costs): Increase cost of const_double/const_vector
0/-1 a little to prevent propagation and enable more CSE.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr92080_vec_dup.c: New test.
* gcc.target/i386/pr92080_zero.c: New test.
---
 gcc/config/i386/i386.cc   | 14 +++--
 .../gcc.target/i386/pr92080_vec_dup.c | 48 +
 gcc/testsuite/gcc.target/i386/pr92080_zero.c  | 51 +++
 3 files changed, 108 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080_vec_dup.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080_zero.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 224a78cc832..72b9859e376 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20933,15 +20933,17 @@ ix86_modes_tieable_p (machine_mode mode1, 
machine_mode mode2)
  any other mode acceptable to SSE registers.  */
   if (GET_MODE_SIZE (mode2) == 64
   && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
-return (GET_MODE_SIZE (mode1) == 64
+return (GET_MODE_SIZE (mode1) <= 64
&& ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
   if (GET_MODE_SIZE (mode2) == 32
   && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
-return (GET_MODE_SIZE (mode1) == 32
+return (GET_MODE_SIZE (mode1) <= 32
&& ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
   if (GET_MODE_SIZE (mode2) == 16
   && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
-return (GET_MODE_SIZE (mode1) == 16
+return ((VECTOR_MODE_P (mode2)
+? GET_MODE_SIZE (mode1) <= 16
+: GET_MODE_SIZE (mode1) == 16)
&& ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
 
   /* If MODE2 is appropriate for an MMX register, then tie
@@ -21507,10 +21509,12 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
case 0:
  break;
case 1:  /* 0: xor eliminates false dependency */
- *total = 0;
+ /* Add extra cost 1 to prevent propagation of CONST_VECTOR
+for SET, which will enable more CSE optimization.  */
+ *total = 0 + (outer_code == SET);
  return true;
default: /* -1: cmp contains false dependency */
- *total = 1;
+ *total = 1 + (outer_code == SET);
  return true;
}
   /* FALLTHRU */
diff --git a/gcc/testsuite/gcc.target/i386/pr92080_vec_dup.c 
b/gcc/testsuite/gcc.target/i386/pr92080_vec_dup.c
new file mode 100644
index 000..67fdd15d69c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080_vec_dup.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcast\[bwd\]" 3 } } */
+
+typedef int v16si __attribute__((vector_size(64)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+
+typedef short v32hi __attribute__((vector_size(64)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+
+typedef char v64qi __attribute__((vector_size(64)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef char v16qi __attribute__((vector_size(16)));
+
+v16si sinksz;
+v8si sinksy;
+v4si sinksx;
+v32hi sinkhz;
+v16hi sinkhy;
+v8hi sinkhx;
+v64qi sinkbz;
+v32qi sinkby;
+v16qi sinkbx;
+
+void foo(char c) {
+  sinksz = __extension__(v16si){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+  sinksy = __extension__(v8si){c,c,c,c,c,c,c,c};
+  sinksx = __extension__(v4si){c,c,c,c};
+}
+
+void foo1(char c) {
+  sinkhz = __extension__(v32hi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+  sinkhy = __extension__(v16hi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+  sinkhx = __extension__(v8hi){c,c,c,c,c,c,c,c};
+}
+
+void foo2(char c) {
+  sinkbz = __extension__(v64qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+  sinkby = __extension__(v32qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+  sinkbx = __extension__(v16qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+}
diff --git a/g

[PATCH 1/2] Enhance cse_insn to handle all-zeros and all-ones for vector mode.

2024-08-26 Thread liuhongt
Also try to handle redundant broadcasts when there's already a
broadcast to a bigger mode with exactly the same component value.
For broadcast, component mode needs to be the same.
For all-zeros/ones, only need to check the bigger mode.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,} and 
aarch64-linux-gnu{-m32,}.
OK for trunk?

gcc/ChangeLog:

PR rtl-optimization/92080
* cse.cc (cse_insn): Handle all-ones/all-zeros, and vec_dup
with variables.
---
 gcc/cse.cc | 79 ++
 1 file changed, 79 insertions(+)

diff --git a/gcc/cse.cc b/gcc/cse.cc
index 65794ac5f2c..baf90910b94 100644
--- a/gcc/cse.cc
+++ b/gcc/cse.cc
@@ -4870,6 +4870,50 @@ cse_insn (rtx_insn *insn)
}
}
 
+  /* Try to handle special const_vector with elt 0 or -1.
+     They can be represented with different modes, and can be CSEd.  */
+  if (src_const && src_related == 0 && CONST_VECTOR_P (src_const)
+ && (src_const == CONST0_RTX (mode)
+ || src_const == CONSTM1_RTX (mode))
+ && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+   {
+ machine_mode mode_iter;
+
+ for (int l = 0; l != 2; l++)
+   {
+ FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_VECTOR_INT)
+   {
+ if (maybe_lt (GET_MODE_SIZE (mode_iter),
+   GET_MODE_SIZE (mode)))
+   continue;
+
+ rtx src_const_iter = (src_const == CONST0_RTX (mode)
+   ? CONST0_RTX (mode_iter)
+   : CONSTM1_RTX (mode_iter));
+
+ struct table_elt *const_elt
+   = lookup (src_const_iter, HASH (src_const_iter, mode_iter),
+ mode_iter);
+
+ if (const_elt == 0)
+   continue;
+
+ for (const_elt = const_elt->first_same_value;
+  const_elt; const_elt = const_elt->next_same_value)
+   if (REG_P (const_elt->exp))
+ {
+   src_related = gen_lowpart (mode, const_elt->exp);
+   break;
+ }
+
+ if (src_related != 0)
+   break;
+   }
+ if (src_related != 0)
+   break;
+   }
+   }
+
   /* See if we have a CONST_INT that is already in a register in a
 wider mode.  */
 
@@ -5041,6 +5085,41 @@ cse_insn (rtx_insn *insn)
}
}
 
+  /* Try to find something like (vec_dup:v16si (reg:c))
+for (vec_dup:v8si (reg:c)).  */
+  if (src_related == 0
+ && VECTOR_MODE_P (mode)
+ && GET_CODE (src) == VEC_DUPLICATE)
+   {
+ poly_uint64 nunits = GET_MODE_NUNITS (GET_MODE (src)) * 2;
+ rtx inner_elt = XEXP (src, 0);
+ machine_mode result_mode;
+ struct table_elt *src_related_elt = NULL;
+ while (related_vector_mode (mode, GET_MODE_INNER (mode),
+ nunits).exists (&result_mode))
+   {
+ rtx vec_dup = gen_rtx_VEC_DUPLICATE (result_mode, inner_elt);
+ struct table_elt* tmp = lookup (vec_dup, HASH (vec_dup, result_mode),
+ result_mode);
+ if (tmp)
+   src_related_elt = tmp;
+ nunits *= 2;
+   }
+
+ if (src_related_elt)
+   {
+ for (src_related_elt = src_related_elt->first_same_value;
+  src_related_elt;
+  src_related_elt = src_related_elt->next_same_value)
+   if (REG_P (src_related_elt->exp))
+ {
+   src_related = gen_lowpart (mode, src_related_elt->exp);
+   break;
+ }
+   }
+   }
+
+
   if (src == src_folded)
src_folded = 0;
 
-- 
2.31.1



[PATCH 2/2] [x86] Update ix86_mode_tieable_p and ix86_rtx_costs.

2024-08-26 Thread liuhongt
For mode2 bigger than 16 bytes, when it can be allocated to FIRST_SSE_REGS,
then it can only be allocated to ALL_SSE_REGS, and it can be tieable
to any mode1 with smaller size which is available to FIRST_SSE_REGS.
When mode2 is equal to 16 bytes, exclude non-vector modes (TI/TFmode).
This is needed for CSE of all-ones/all-zeros, since CSE checks costs with
ix86_modes_tieable_p on different-sized modes.

Also update ix86_rtx_costs to prevent CONST0_RTX from being propagated,
which would defeat CSE of CONST0_RTX.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/92080
* config/i386/i386.cc (ix86_modes_tieable_p): Relax
GET_MODE_SIZE (mode1) to <= 64/32/16 bytes when mode1 can be
allocated to FIRST_SSE_REG; it doesn't need to be exactly the
same size when >= 16 bytes.
(ix86_rtx_costs): Increase cost of const_double/const_vector
0/-1 a little to prevent propagation and enable more CSE.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr92080_vec_dup.c: New test.
* gcc.target/i386/pr92080_zero.c: New test.
---
 gcc/config/i386/i386.cc   | 14 +++--
 .../gcc.target/i386/pr92080_vec_dup.c | 48 +
 gcc/testsuite/gcc.target/i386/pr92080_zero.c  | 51 +++
 3 files changed, 108 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080_vec_dup.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr92080_zero.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 224a78cc832..72b9859e376 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20933,15 +20933,17 @@ ix86_modes_tieable_p (machine_mode mode1, 
machine_mode mode2)
  any other mode acceptable to SSE registers.  */
   if (GET_MODE_SIZE (mode2) == 64
   && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
-return (GET_MODE_SIZE (mode1) == 64
+return (GET_MODE_SIZE (mode1) <= 64
&& ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
   if (GET_MODE_SIZE (mode2) == 32
   && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
-return (GET_MODE_SIZE (mode1) == 32
+return (GET_MODE_SIZE (mode1) <= 32
&& ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
   if (GET_MODE_SIZE (mode2) == 16
   && ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode2))
-return (GET_MODE_SIZE (mode1) == 16
+return ((VECTOR_MODE_P (mode2)
+? GET_MODE_SIZE (mode1) <= 16
+: GET_MODE_SIZE (mode1) == 16)
&& ix86_hard_regno_mode_ok (FIRST_SSE_REG, mode1));
 
   /* If MODE2 is appropriate for an MMX register, then tie
@@ -21507,10 +21509,12 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
case 0:
  break;
case 1:  /* 0: xor eliminates false dependency */
- *total = 0;
+ /* Add extra cost 1 to prevent propagation of CONST_VECTOR
+for SET, which will enable more CSE optimization.  */
+ *total = 0 + (outer_code == SET);
  return true;
default: /* -1: cmp contains false dependency */
- *total = 1;
+ *total = 1 + (outer_code == SET);
  return true;
}
   /* FALLTHRU */
diff --git a/gcc/testsuite/gcc.target/i386/pr92080_vec_dup.c 
b/gcc/testsuite/gcc.target/i386/pr92080_vec_dup.c
new file mode 100644
index 000..67fdd15d69c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr92080_vec_dup.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v4 -O2" } */
+/* { dg-final { scan-assembler-times "vpbroadcast\[bwd\]" 3 } } */
+
+typedef int v16si __attribute__((vector_size(64)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+
+typedef short v32hi __attribute__((vector_size(64)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+
+typedef char v64qi __attribute__((vector_size(64)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef char v16qi __attribute__((vector_size(16)));
+
+v16si sinksz;
+v8si sinksy;
+v4si sinksx;
+v32hi sinkhz;
+v16hi sinkhy;
+v8hi sinkhx;
+v64qi sinkbz;
+v32qi sinkby;
+v16qi sinkbx;
+
+void foo(char c) {
+  sinksz = __extension__(v16si){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+  sinksy = __extension__(v8si){c,c,c,c,c,c,c,c};
+  sinksx = __extension__(v4si){c,c,c,c};
+}
+
+void foo1(char c) {
+  sinkhz = __extension__(v32hi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+  sinkhy = __extension__(v16hi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+  sinkhx = __extension__(v8hi){c,c,c,c,c,c,c,c};
+}
+
+void foo2(char c) {
+  sinkbz = __extension__(v64qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+  sinkby = __extension__(v32qi){c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,
+c,c,c,c,c,c,c,c,c,c,c,c,c,c,c,c};
+  sinkbx =

[GCC13/GCC12 PATCH] Fix testcase failure.

2024-08-21 Thread liuhongt
Looks like -mprefer-vector-width=128 doesn't impact move_max/store_max
on the GCC 13/GCC 12 branches, so explicitly use -mmove-max=128 and
-mstore-max=128 for those testcases.

Committed as an obvious fix.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pieces-memcpy-10.c: Use -mmove-max=128 and
-mstore-max=128.
* gcc.target/i386/pieces-memcpy-6.c: Ditto.
* gcc.target/i386/pieces-memset-38.c: Ditto.
* gcc.target/i386/pieces-memset-40.c: Ditto.
* gcc.target/i386/pieces-memset-41.c: Ditto.
* gcc.target/i386/pieces-memset-42.c: Ditto.
* gcc.target/i386/pieces-memset-43.c: Ditto.
* gcc.target/i386/pieces-strcpy-2.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c  | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-38.c | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-40.c | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-41.c | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-42.c | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-43.c | 2 +-
 gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c  | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c 
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
index 53ad0b3be44..78f92ac5197 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 
-mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 
-mtune=sandybridge" } */
 
 extern char *dst, *src;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c 
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c
index cfd2a86cf33..57b74ae4b23 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c
@@ -1,5 +1,5 @@
 /* { dg-do compile { target { ! ia32 } } } */
-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 
-mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 
-mtune=sandybridge" } */
 
 extern char *dst, *src;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-38.c 
b/gcc/testsuite/gcc.target/i386/pieces-memset-38.c
index ddd194debd5..d9443678735 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memset-38.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-38.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx512f -mavx2 -mprefer-vector-width=128 
-mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx512f -mavx2 -mmove-max=128 -mstore-max=128 
-mtune=sandybridge" } */
 
 extern char *dst;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-40.c 
b/gcc/testsuite/gcc.target/i386/pieces-memset-40.c
index 9c206465d46..8ad6ad7e494 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memset-40.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-40.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx512f -mavx2 -mprefer-vector-width=128 
-mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx512f -mavx2 -mmove-max=128 -mstore-max=128 
-mtune=sandybridge" } */
 
 extern char *dst;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-41.c 
b/gcc/testsuite/gcc.target/i386/pieces-memset-41.c
index b0756182e35..08fd6e9a927 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memset-41.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-41.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 
-mtune=sandybridge -mno-stackrealign" } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 
-mtune=sandybridge -mno-stackrealign" } */
 
 extern char *dst;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-42.c 
b/gcc/testsuite/gcc.target/i386/pieces-memset-42.c
index 103da699ae5..6b73bb256af 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memset-42.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-42.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 
-mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 
-mtune=sandybridge" } */
 
 extern char *dst;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-43.c 
b/gcc/testsuite/gcc.target/i386/pieces-memset-43.c
index f1494e17610..c6c7ff234da 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memset-43.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memset-43.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 
-mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 
-mtune=sandybridge" } */
 
 extern char *dst;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c 
b/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c
index 9bb94b7419b..40ada119625 100644

[PATCH] Align ix86_{move_max,store_max} with vectorizer.

2024-08-20 Thread liuhongt
When none of -mprefer-vector-width, avx256_optimal/avx128_optimal,
or avx256_store_by_pieces/avx512_store_by_pieces is specified, GCC
sets ix86_{move_max,store_max} to the maximum available vector
length, except in the AVX case:

  if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)
  && TARGET_EVEX512_P (opts->x_ix86_isa_flags2))
opts->x_ix86_move_max = PVW_AVX512;
  else
opts->x_ix86_move_max = PVW_AVX128;

So for -mavx2, the vectorizer will choose 256-bit vectors for
vectorization, but 128-bit moves are used for struct copies, so there
can be a potential store-to-load-forwarding (STLF) issue due to this
mismatch.

The patch fixes that and improves 538.imagick_r by ~30% at
-march=x86-64-v3 -O2.
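
As a hedged illustration (hypothetical code, not from the patch), the
mismatch can be seen on a 32-byte struct copy:

```
/* Hypothetical sketch: at -march=x86-64-v3 -O2 the vectorizer uses
   256-bit ymm accesses, while this 32-byte copy was expanded by pieces
   as two 128-bit xmm moves before the patch.  A 256-bit load that
   reads bytes written by two separate 128-bit stores cannot be
   store-forwarded and stalls.  */
struct blob { double d[4]; };  /* 32 bytes */

void
copy (struct blob *dst, const struct blob *src)
{
  *dst = *src;  /* by-pieces: 2x xmm before, 1x ymm after the patch */
}
```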
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Any comments?

gcc/ChangeLog:

* config/i386/i386-options.cc (ix86_option_override_internal):
Set ix86_{move_max,store_max} to PVW_AVX256 when TARGET_AVX
instead of PVW_AVX128.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pieces-memcpy-10.c: Add -mprefer-vector-width=128.
* gcc.target/i386/pieces-memcpy-6.c: Ditto.
* gcc.target/i386/pieces-memset-38.c: Ditto.
* gcc.target/i386/pieces-memset-40.c: Ditto.
* gcc.target/i386/pieces-memset-41.c: Ditto.
* gcc.target/i386/pieces-memset-42.c: Ditto.
* gcc.target/i386/pieces-memset-43.c: Ditto.
* gcc.target/i386/pieces-strcpy-2.c: Ditto.
* gcc.target/i386/pieces-memcpy-22.c: New test.
* gcc.target/i386/pieces-memset-51.c: New test.
* gcc.target/i386/pieces-strcpy-3.c: New test.
---
 gcc/config/i386/i386-options.cc  |  6 ++
 gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c | 12 
 gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c  |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-38.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-40.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-41.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-42.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-43.c |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-memset-51.c | 12 
 gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c  |  2 +-
 gcc/testsuite/gcc.target/i386/pieces-strcpy-3.c  | 15 +++
 12 files changed, 53 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-51.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pieces-strcpy-3.c

diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index f423455b363..f79257cc764 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -3023,6 +3023,9 @@ ix86_option_override_internal (bool main_args_p,
  if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)
  && TARGET_EVEX512_P (opts->x_ix86_isa_flags2))
opts->x_ix86_move_max = PVW_AVX512;
+ /* Align with vectorizer to avoid potential STLF issue.  */
+ else if (TARGET_AVX_P (opts->x_ix86_isa_flags))
+   opts->x_ix86_move_max = PVW_AVX256;
  else
opts->x_ix86_move_max = PVW_AVX128;
}
@@ -3047,6 +3050,9 @@ ix86_option_override_internal (bool main_args_p,
  if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)
  && TARGET_EVEX512_P (opts->x_ix86_isa_flags2))
opts->x_ix86_store_max = PVW_AVX512;
+ /* Align with vectorizer to avoid potential STLF issue.  */
+ else if (TARGET_AVX_P (opts->x_ix86_isa_flags))
+   opts->x_ix86_store_max = PVW_AVX256;
  else
opts->x_ix86_store_max = PVW_AVX128;
}
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c 
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
index 5faee21f9b9..53ad0b3be44 100644
--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 
-mtune=sandybridge" } */
 
 extern char *dst, *src;
 
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c 
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c
new file mode 100644
index 000..605b3623ffc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */
+
+extern char *dst, *src;
+
+void
+foo (void)
+{
+  __builtin_memcpy (dst, src, 33);
+}
+
+/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c 
b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c
index 5f99cc98c47..cfd2a86cf33 100644

[PATCH] Align predicates for operands[1] between mov<mode> and *mov<mode>_internal.

2024-08-20 Thread liuhongt
From [1]:
> > It's not obvious to me why movv16qi requires a nonimmediate_operand
> > source, especially since ix86_expand_vector_move does have code to
> > cope with constant operand[1]s.  emit_move_insn_1 doesn't check the
> > predicates anyway, so the predicate will have little effect.
> >
> > A workaround would be to check legitimate_constant_p instead of the
> > predicate, but I'm not sure that that should be necessary.
> >
> > Has this already been discussed?  If not, we should loop in the x86
> > maintainers (but I didn't do that here in case it would be a repeat).
>
> I also noticed it. Not sure why movv16qi requires a
> nonimmediate_operand, while ix86_expand_vector_mode could deal with
> constant op. Looking forward to Hongtao's comments.
The code has been there since 2005, before I was involved.
It looks to me that at the beginning both mov<mode> and
*mov<mode>_internal only supported nonimmediate_operand for
operands[1].
And r0-75606-g5656a184e83983 adjusted the nonimmediate_operand to
nonimmediate_or_sse_const_operand for *mov<mode>_internal, but not for
mov<mode>.  I think we can align the predicate between mov<mode>
and *mov<mode>_internal.
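
For illustration (hypothetical source, not part of the patch), a
constant operands[1] can reach the expander like this:

```
/* Hypothetical sketch: a constant-vector return funnels a CONST_VECTOR
   source into the mov<mode> expander; ix86_expand_vector_move already
   copes with it, even though the old nonimmediate_operand predicate
   nominally rejected constants.  */
typedef char v16qi __attribute__((vector_size (16)));

v16qi
all_ones (void)
{
  return (v16qi){ -1, -1, -1, -1, -1, -1, -1, -1,
                  -1, -1, -1, -1, -1, -1, -1, -1 };
}
```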

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/sse.md (mov<mode>): Align predicates for
operands[1] between mov<mode> and *mov<mode>_internal.
---
 gcc/config/i386/sse.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index d1010bc5682..7ecfbd55809 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1387,7 +1387,7 @@ (define_mode_attr DOUBLEMASKMODE
 
 (define_expand "mov<mode>"
   [(set (match_operand:VMOVE 0 "nonimmediate_operand")
-   (match_operand:VMOVE 1 "nonimmediate_operand"))]
+   (match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand"))]
   "TARGET_SSE"
 {
   ix86_expand_vector_move (<MODE>mode, operands);
-- 
2.31.1



[PATCH v2] [x86] Movement between GENERAL_REGS and SSE_REGS for TImode doesn't need secondary reload.

2024-08-15 Thread liuhongt
It results in 2 failures for x86_64-pc-linux-gnu with
-march=cascadelake:

gcc: gcc.target/i386/extendditi3-1.c scan-assembler cqt?o
gcc: gcc.target/i386/pr113560.c scan-assembler-times \tmulq 1

For pr113560.c, now GCC generates mulx instead of mulq with
-march=cascadelake, which should be optimal, so adjust testcase for
that.
For gcc.target/i386/extendditi3-1.c, RA happens to choose another
register instead of rax, resulting in

	movq	%rdi, %rbp
	movq	%rdi, %rax
	sarq	$63, %rbp
	movq	%rbp, %rdx

The patch adds a new define_peephole2 for that.
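
In testcase terms (hypothetical sketch in the style of the extendditi
tests), the peephole lets the sign extension come out as movq plus
cqto:

```
/* Hypothetical sketch: sign-extend DImode to TImode.  With the
   peephole, the expected assembly is movq %rdi, %rax; cqto rather
   than the four-instruction sequence shown above.  */
__int128
ext (long long a)
{
  return a;
}
```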

gcc/ChangeLog:

PR target/116274
* config/i386/i386-expand.cc (ix86_expand_vector_move):
Restrict special case TImode to 128-bit vector conversions via
V2DI under ix86_pre_reload_split ().
* config/i386/i386.cc (inline_secondary_memory_needed):
Movement between GENERAL_REGS and SSE_REGS for TImode doesn't
need secondary reload.
* config/i386/i386.md (*extendsidi2_rex64): Add a
define_peephole2 after it.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr116274.c: New test.
* gcc.target/i386/pr113560.c: Scan either mulq or mulx.
---
 gcc/config/i386/i386-expand.cc   |  2 +-
 gcc/config/i386/i386.cc  | 18 --
 gcc/config/i386/i386.md  | 19 +++
 gcc/testsuite/gcc.target/i386/pr113560.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr116274.c | 12 
 5 files changed, 45 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr116274.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index bdbc1423267..ed546eeed6b 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -751,7 +751,7 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
   && SUBREG_P (op1)
   && GET_MODE (SUBREG_REG (op1)) == TImode
   && TARGET_64BIT && TARGET_SSE
-  && can_create_pseudo_p ())
+  && ix86_pre_reload_split ())
 {
   rtx tmp = gen_reg_rtx (V2DImode);
   rtx lo = gen_reg_rtx (DImode);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index f044826269c..4821892d1e0 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20292,6 +20292,18 @@ inline_secondary_memory_needed (machine_mode mode, 
reg_class_t class1,
   if (!(INTEGER_CLASS_P (class1) || INTEGER_CLASS_P (class2)))
return true;
 
+  /* If the target says that inter-unit moves are more expensive
+than moving through memory, then don't generate them.  */
+  if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
+ || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
+   return true;
+
+  /* Under SSE4.1, *movti_internal supports movement between
+SSE_REGS and GENERAL_REGS with pinsrq and pextrq.  */
+  if (TARGET_SSE4_1
+ && (TARGET_64BIT ? mode == TImode : mode == DImode))
+   return false;
+
   int msize = GET_MODE_SIZE (mode);
 
   /* Between SSE and general, we have moves no larger than word size.  */
@@ -20304,12 +20316,6 @@ inline_secondary_memory_needed (machine_mode mode, 
reg_class_t class1,
 
   if (msize < minsize)
return true;
-
-  /* If the target says that inter-unit moves are more expensive
-than moving through memory, then don't generate them.  */
-  if ((SSE_CLASS_P (class1) && !TARGET_INTER_UNIT_MOVES_FROM_VEC)
- || (SSE_CLASS_P (class2) && !TARGET_INTER_UNIT_MOVES_TO_VEC))
-   return true;
 }
 
   return false;
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index db7789c17d2..1962a7ba5c9 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -5041,6 +5041,25 @@ (define_split
   DONE;
 })
 
+(define_peephole2
+  [(set (match_operand:DI 0 "general_reg_operand")
+   (match_operand:DI 1 "general_reg_operand"))
+   (parallel [(set (match_dup 0)
+  (ashiftrt:DI (match_dup 0)
+   (const_int 63)))
+  (clobber (reg:CC FLAGS_REG))])
+   (set (match_operand:DI 2 "general_reg_operand") (match_dup 1))
+   (set (match_operand:DI 3 "general_reg_operand") (match_dup 0))]
+  "(optimize_function_for_size_p (cfun) || TARGET_USE_CLTD)
+   && REGNO (operands[2]) == AX_REG
+   && REGNO (operands[3]) == DX_REG
+   && peep2_reg_dead_p (4, operands[0])
+   && !reg_mentioned_p (operands[0], operands[1])
+   && !reg_mentioned_p (operands[2], operands[0])"
+  [(set (match_dup 2) (match_dup 1))
+   (parallel [(set (match_dup 3) (ashiftrt:DI (match_dup 2) (const_int 63)))
+ (clobber (reg:CC FLAGS_REG))])])
+
 (define_insn "extenddi2"
   [(set (match_operand:DI 0 "register_operand" "=r")
(sign_extend:DI
diff --git a/gcc/testsuite/gcc.target/i386/pr113560.c 
b/gcc/testsuite/gcc.target/i386/pr113560.c
index ac2e01a4589..9431a2d1d90 100644
--- a/gcc/testsuite/gcc.

[PATCH] [x86] Movement between GENERAL_REGS and SSE_REGS for TImode doesn't need secondary reload.

2024-08-13 Thread liuhongt
It results in 2 failures for x86_64-pc-linux-gnu with
-march=cascadelake:

gcc: gcc.target/i386/extendditi3-1.c scan-assembler cqt?o
gcc: gcc.target/i386/pr113560.c scan-assembler-times \tmulq 1

For pr113560.c, now GCC generates mulx instead of mulq with
-march=cascadelake, which should be optimal, so adjust testcase for
that.
For gcc.target/i386/extendditi3-1.c, RA happens to choose another
register instead of rax, resulting in

	movq	%rdi, %rbp
	movq	%rdi, %rax
	sarq	$63, %rbp
	movq	%rbp, %rdx

The patch adds a new define_peephole2 for that.


Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/116274
* config/i386/i386-expand.cc (ix86_expand_vector_move):
Restrict special case TImode to 128-bit vector conversions via
V2DI under ix86_pre_reload_split ().
* config/i386/i386.cc (inline_secondary_memory_needed):
Movement between GENERAL_REGS and SSE_REGS for TImode doesn't
need secondary reload.
* config/i386/i386.md (*extendsidi2_rex64): Add a
define_peephole2 after it.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr116274.c: New test.
* gcc.target/i386/pr113560.c: Scan either mulq or mulx.
---
 gcc/config/i386/i386-expand.cc   |  2 +-
 gcc/config/i386/i386.cc  |  5 +
 gcc/config/i386/i386.md  | 19 +++
 gcc/testsuite/gcc.target/i386/pr113560.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr116274.c | 12 
 5 files changed, 38 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr116274.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index bdbc1423267..ed546eeed6b 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -751,7 +751,7 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
   && SUBREG_P (op1)
   && GET_MODE (SUBREG_REG (op1)) == TImode
   && TARGET_64BIT && TARGET_SSE
-  && can_create_pseudo_p ())
+  && ix86_pre_reload_split ())
 {
   rtx tmp = gen_reg_rtx (V2DImode);
   rtx lo = gen_reg_rtx (DImode);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index f044826269c..31fe8a199c9 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20292,6 +20292,11 @@ inline_secondary_memory_needed (machine_mode mode, 
reg_class_t class1,
   if (!(INTEGER_CLASS_P (class1) || INTEGER_CLASS_P (class2)))
return true;
 
+  /* Under SSE4.1, *movti_internal supports movement between
+SSE_REGS and GENERAL_REGS with pinsrq and pextrq.  */
+  if (mode == TImode && TARGET_SSE4_1)
+   return false;
+
   int msize = GET_MODE_SIZE (mode);
 
   /* Between SSE and general, we have moves no larger than word size.  */
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index db7789c17d2..1962a7ba5c9 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -5041,6 +5041,25 @@ (define_split
   DONE;
 })
 
+(define_peephole2
+  [(set (match_operand:DI 0 "general_reg_operand")
+   (match_operand:DI 1 "general_reg_operand"))
+   (parallel [(set (match_dup 0)
+  (ashiftrt:DI (match_dup 0)
+   (const_int 63)))
+  (clobber (reg:CC FLAGS_REG))])
+   (set (match_operand:DI 2 "general_reg_operand") (match_dup 1))
+   (set (match_operand:DI 3 "general_reg_operand") (match_dup 0))]
+  "(optimize_function_for_size_p (cfun) || TARGET_USE_CLTD)
+   && REGNO (operands[2]) == AX_REG
+   && REGNO (operands[3]) == DX_REG
+   && peep2_reg_dead_p (4, operands[0])
+   && !reg_mentioned_p (operands[0], operands[1])
+   && !reg_mentioned_p (operands[2], operands[0])"
+  [(set (match_dup 2) (match_dup 1))
+   (parallel [(set (match_dup 3) (ashiftrt:DI (match_dup 2) (const_int 63)))
+ (clobber (reg:CC FLAGS_REG))])])
+
 (define_insn "extenddi2"
   [(set (match_operand:DI 0 "register_operand" "=r")
(sign_extend:DI
diff --git a/gcc/testsuite/gcc.target/i386/pr113560.c 
b/gcc/testsuite/gcc.target/i386/pr113560.c
index ac2e01a4589..9431a2d1d90 100644
--- a/gcc/testsuite/gcc.target/i386/pr113560.c
+++ b/gcc/testsuite/gcc.target/i386/pr113560.c
@@ -11,7 +11,7 @@ __int128 bar(__int128 x, __int128 y)
   return (x & 1000) * (y & 1000);
 }
 
-/* { dg-final { scan-assembler-times "\tmulq" 1 } } */
+/* { dg-final { scan-assembler-times "\tmul\[qx\]" 1 } } */
 /* { dg-final { scan-assembler-times "\timulq" 1 } } */
 /* { dg-final { scan-assembler-not "addq" } } */
 /* { dg-final { scan-assembler-not "xorl" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr116274.c 
b/gcc/testsuite/gcc.target/i386/pr116274.c
new file mode 100644
index 000..51ac3e1572d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116274.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O3 -fno-vect-cost-model -msse4

[PATCH] Move ix86_align_loops into a separate pass and insert the pass after pass_endbr_and_patchable_area.

2024-08-12 Thread liuhongt
> Are there any assumptions that BB_HEAD must be a note or label?
> Maybe we should move ix86_align_loops into a separate pass and insert
> the pass just before pass_final.
The patch inserts .p2align after the endbr pass, which also fixes the issue.
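
As a rough illustration (hypothetical code, not the PR testcase), this
is the kind of loop the pass cares about:

```
/* Hypothetical sketch: a hot inner loop whose body fits in one cache
   line.  The pass force-aligns the loop head with .p2align; running it
   after ENDBR insertion means the alignment is computed with any
   endbr64 already in place.  */
int
sum (const int *a, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    s += a[i];
  return s;
}
```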

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Any comments?

gcc/ChangeLog:

PR target/116174
* config/i386/i386.cc (ix86_align_loops): Move this to ..
* config/i386/i386-features.cc (ix86_align_loops): .. here.
(class pass_align_tight_loops): New class.
(make_pass_align_tight_loops): New function.
* config/i386/i386-passes.def: Insert pass_align_tight_loops
after pass_insert_endbr_and_patchable_area.
* config/i386/i386-protos.h (make_pass_align_tight_loops): New
declare.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr116174.c: New test.
---
 gcc/config/i386/i386-features.cc | 190 +++
 gcc/config/i386/i386-passes.def  |   3 +
 gcc/config/i386/i386-protos.h|   1 +
 gcc/config/i386/i386.cc  | 146 -
 gcc/testsuite/gcc.target/i386/pr116174.c |  12 ++
 5 files changed, 206 insertions(+), 146 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr116174.c

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index c36d181f2d6..7e80e7b0103 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -3417,6 +3417,196 @@ make_pass_apx_nf_convert (gcc::context *ctxt)
   return new pass_apx_nf_convert (ctxt);
 }
 
+/* When a hot loop can be fit into one cacheline,
+   force align the loop without considering the max skip.  */
+static void
+ix86_align_loops ()
+{
+  basic_block bb;
+
+  /* Don't do this when we don't know cache line size.  */
+  if (ix86_cost->prefetch_block == 0)
+return;
+
+  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+  profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
+  FOR_EACH_BB_FN (bb, cfun)
+{
+  rtx_insn *label = BB_HEAD (bb);
+  bool has_fallthru = 0;
+  edge e;
+  edge_iterator ei;
+
+  if (!LABEL_P (label))
+   continue;
+
+  profile_count fallthru_count = profile_count::zero ();
+  profile_count branch_count = profile_count::zero ();
+
+  FOR_EACH_EDGE (e, ei, bb->preds)
+   {
+ if (e->flags & EDGE_FALLTHRU)
+   has_fallthru = 1, fallthru_count += e->count ();
+ else
+   branch_count += e->count ();
+   }
+
+  if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
+   continue;
+
+  if (bb->loop_father
+ && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
+ && (has_fallthru
+ ? (!(single_succ_p (bb)
+  && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
+&& optimize_bb_for_speed_p (bb)
+&& branch_count + fallthru_count > count_threshold
+&& (branch_count > fallthru_count * 
param_align_loop_iterations))
+ /* In case there's no fallthru for the loop.
+Nops inserted won't be executed.  */
+ : (branch_count > count_threshold
+|| (bb->count > bb->prev_bb->count * 10
+&& (bb->prev_bb->count
+<= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)
+   {
+ rtx_insn* insn, *end_insn;
+ HOST_WIDE_INT size = 0;
+ bool padding_p = true;
+ basic_block tbb = bb;
+ unsigned cond_branch_num = 0;
+ bool detect_tight_loop_p = false;
+
+ for (unsigned int i = 0; i != bb->loop_father->num_nodes;
+  i++, tbb = tbb->next_bb)
+   {
+ /* Only handle continuous cfg layout. */
+ if (bb->loop_father != tbb->loop_father)
+   {
+ padding_p = false;
+ break;
+   }
+
+ FOR_BB_INSNS (tbb, insn)
+   {
+ if (!NONDEBUG_INSN_P (insn))
+   continue;
+ size += ix86_min_insn_size (insn);
+
+ /* We don't know size of inline asm.
+Don't align loop for call.  */
+ if (asm_noperands (PATTERN (insn)) >= 0
+ || CALL_P (insn))
+   {
+ size = -1;
+ break;
+   }
+   }
+
+ if (size == -1 || size > ix86_cost->prefetch_block)
+   {
+ padding_p = false;
+ break;
+   }
+
+ FOR_EACH_EDGE (e, ei, tbb->succs)
+   {
+ /* It could be part of the loop.  */
+ if (e->dest == bb)
+   {
+ detect_tight_loop_p = true;
+ break;
+   }
+   }
+
+ if (det

[PATCH] [x86] Mention _Float16 and __bf16 changes in GCC14.

2024-07-30 Thread liuhongt
Ok for trunk?
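
For context, a hypothetical example of the documented behavior (not
part of the patch):

```
/* Hypothetical example: without SSE2 the types are storage-only, so
   plain object copies compile, while arithmetic, conversions,
   parameter passing and value return are rejected.  */
_Float16 a, b;

void
copy_ok (void)
{
  a = b;  /* OK: pure storage.  */
}

#ifdef __SSE2__
_Float16
add (_Float16 x, _Float16 y)  /* needs SSE2 arithmetic support */
{
  return x + y;
}
#endif
```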

---
 htdocs/gcc-14/changes.html| 7 +++
 htdocs/gcc-14/porting_to.html | 9 +
 2 files changed, 16 insertions(+)

diff --git a/htdocs/gcc-14/changes.html b/htdocs/gcc-14/changes.html
index ca4cae0f..b023a4b9 100644
--- a/htdocs/gcc-14/changes.html
+++ b/htdocs/gcc-14/changes.html
@@ -982,6 +982,13 @@ __asm (".global __flmap_lock"  "\n\t"
 AVX512VP2INTERSECT, AVXVNNI, MOVDIR64B, MOVDIRI, and PREFETCHI ISA
 extensions.
   
+   The _Float16 and __bf16 types are supported
+independently of SSE2. Without SSE2, these types are storage-only; the
+compiler will issue an error when they are used in a conversion, unary
+operation, binary operation, parameter passing or value return. Please use
+__SSE2__ to detect arithmetic support for these types
+instead of __FLT16_MAX__ (or other similar macros).
+  
 
 
 MCore
diff --git a/htdocs/gcc-14/porting_to.html b/htdocs/gcc-14/porting_to.html
index 3de15d02..b4f87149 100644
--- a/htdocs/gcc-14/porting_to.html
+++ b/htdocs/gcc-14/porting_to.html
@@ -490,6 +490,8 @@ in C23.
 GCC will probably continue to support old-style function definitions
 even once C23 is used as the default language dialect.
 
+
+
 C++ language issues
 
 Header dependency changes
@@ -554,6 +556,13 @@ incorrect instruction set by GCC 14.
 The fix in this case is to remember whether pop_options 
 needs to be performed in a new user-defined macro.
 
+Types _Float16 and __bf16 are supported independently of 
SSE2 for IA-32/x86-64
+Without SSE2, these types are storage-only; the compiler will issue an 
error when
+  they are used in a conversion, unary operation, binary operation, parameter
+  passing or value return. Please use __SSE2__ to detect
+  arithmetic support for these types instead of
+  __FLT16_MAX__ (or other similar macros).
+
+
 
 
 
-- 
2.31.1



[PATCH] Fix mismatch between constraint and predicate for ashl<mode>3_doubleword.

2024-07-29 Thread liuhongt
(insn 98 94 387 2 (parallel [
(set (reg:TI 337 [ _32 ])
(ashift:TI (reg:TI 329)
(reg:QI 521)))
(clobber (reg:CC 17 flags))
]) "test.c":11:13 953 {ashlti3_doubleword}

is reloaded into

(insn 98 452 387 2 (parallel [
(set (reg:TI 0 ax [orig:337 _32 ] [337])
(ashift:TI (const_int 1671291085 [0x639de0cd])
(reg:QI 2 cx [521])))
(clobber (reg:CC 17 flags))

since constraint n in the pattern accepts that.
(Not sure why reload doesn't check predicate)

(define_insn "ashl<mode>3_doubleword"
  [(set (match_operand:DWI 0 "register_operand" "=&r,&r")
(ashift:DWI (match_operand:DWI 1 "reg_or_pm1_operand" "0n,r")
(match_operand:QI 2 "nonmemory_operand" "c,c")))

The patch fixes the mismatch between constraint and predicate.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/116096
* config/i386/constraints.md (Wc): New constraint for integer
1 or -1.
* config/i386/i386.md (ashl<mode>3_doubleword): Refine
constraint with Wc.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr116096.c: New test.
---
 gcc/config/i386/constraints.md   |  6 ++
 gcc/config/i386/i386.md  |  2 +-
 gcc/testsuite/gcc.target/i386/pr116096.c | 26 
 3 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr116096.c

diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
index 7508d7a58bd..154cbccd09e 100644
--- a/gcc/config/i386/constraints.md
+++ b/gcc/config/i386/constraints.md
@@ -254,6 +254,12 @@ (define_constraint "Wb"
   (and (match_code "const_int")
(match_test "IN_RANGE (ival, 0, 7)")))
 
+(define_constraint "Wc"
+  "Integer constant -1 or 1."
+  (and (match_code "const_int")
+   (ior (match_test "op == constm1_rtx")
+   (match_test "op == const1_rtx"))))
+
 (define_constraint "Ww"
   "Integer constant in the range 0 @dots{} 15, for 16-bit shifts."
   (and (match_code "const_int")
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 6207036a2a0..79d5de5b46a 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -14774,7 +14774,7 @@ (define_insn_and_split "*ashl<mode>3_doubleword_mask_1"
 
 (define_insn "ashl<mode>3_doubleword"
   [(set (match_operand:DWI 0 "register_operand" "=&r,&r")
-   (ashift:DWI (match_operand:DWI 1 "reg_or_pm1_operand" "0n,r")
+   (ashift:DWI (match_operand:DWI 1 "reg_or_pm1_operand" "0Wc,r")
(match_operand:QI 2 "nonmemory_operand" "c,c")))
(clobber (reg:CC FLAGS_REG))]
   ""
diff --git a/gcc/testsuite/gcc.target/i386/pr116096.c 
b/gcc/testsuite/gcc.target/i386/pr116096.c
new file mode 100644
index 000..5ef39805f58
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116096.c
@@ -0,0 +1,26 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -flive-range-shrinkage -fno-peephole2 -mstackrealign 
-Wno-psabi" } */
+
+typedef char U __attribute__((vector_size (32)));
+typedef unsigned V __attribute__((vector_size (32)));
+typedef __int128 W __attribute__((vector_size (32)));
+U g;
+
+W baz ();
+
+static inline U
+bar (V x, W y)
+{
+  y = y | y << (W) x;
+  return (U)y;
+}
+
+void
+foo (W w)
+{
+  g = g <<
+bar ((V){baz ()[1], 3, 3, 5, 7},
+(W){w[0], ~(int) 2623676210}) >>
+bar ((V){baz ()[1]},
+(W){-w[0], ~(int) 2623676210});
+}
-- 
2.31.1



[PATCH] Fix mismatch between constraint and predicate for ashl<mode>3_doubleword.

2024-07-25 Thread liuhongt
(insn 98 94 387 2 (parallel [
(set (reg:TI 337 [ _32 ])
(ashift:TI (reg:TI 329)
(reg:QI 521)))
(clobber (reg:CC 17 flags))
]) "test.c":11:13 953 {ashlti3_doubleword}

is reloaded into

(insn 98 452 387 2 (parallel [
(set (reg:TI 0 ax [orig:337 _32 ] [337])
(ashift:TI (const_int 1671291085 [0x639de0cd])
(reg:QI 2 cx [521])))
(clobber (reg:CC 17 flags))

since constraint n in the pattern accepts that.
(Not sure why reload doesn't check predicate)

(define_insn "ashl<mode>3_doubleword"
  [(set (match_operand:DWI 0 "register_operand" "=&r,&r")
(ashift:DWI (match_operand:DWI 1 "reg_or_pm1_operand" "0n,r")
(match_operand:QI 2 "nonmemory_operand" "c,c")))

The patch fixes the mismatch between constraint and predicate.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/116096
* config/i386/constraints.md (BC): Move TARGET_SSE to
vector_all_ones_operand.
* config/i386/i386.md (ashl<mode>3_doubleword): Refine
constraint with BC.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr116096.c: New test.
---
 gcc/config/i386/constraints.md   |  4 ++--
 gcc/config/i386/i386.md  |  2 +-
 gcc/testsuite/gcc.target/i386/pr116096.c | 26 
 3 files changed, 29 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr116096.c

diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
index 7508d7a58bd..fd032c2b9f0 100644
--- a/gcc/config/i386/constraints.md
+++ b/gcc/config/i386/constraints.md
@@ -225,8 +225,8 @@ (define_constraint "Bz"
 
 (define_constraint "BC"
   "@internal integer SSE constant with all bits set operand."
-  (and (match_test "TARGET_SSE")
-   (ior (match_test "op == constm1_rtx")
+  (ior (match_test "op == constm1_rtx")
+   (and (match_test "TARGET_SSE")
+	    (match_operand 0 "vector_all_ones_operand"))))
 
 (define_constraint "BF"
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 6207036a2a0..9c4e847fba1 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -14774,7 +14774,7 @@ (define_insn_and_split "*ashl<mode>3_doubleword_mask_1"
 
 (define_insn "ashl<mode>3_doubleword"
   [(set (match_operand:DWI 0 "register_operand" "=&r,&r")
-   (ashift:DWI (match_operand:DWI 1 "reg_or_pm1_operand" "0n,r")
+   (ashift:DWI (match_operand:DWI 1 "reg_or_pm1_operand" "0BC,r")
(match_operand:QI 2 "nonmemory_operand" "c,c")))
(clobber (reg:CC FLAGS_REG))]
   ""
diff --git a/gcc/testsuite/gcc.target/i386/pr116096.c 
b/gcc/testsuite/gcc.target/i386/pr116096.c
new file mode 100644
index 000..5ef39805f58
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116096.c
@@ -0,0 +1,26 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -flive-range-shrinkage -fno-peephole2 -mstackrealign 
-Wno-psabi" } */
+
+typedef char U __attribute__((vector_size (32)));
+typedef unsigned V __attribute__((vector_size (32)));
+typedef __int128 W __attribute__((vector_size (32)));
+U g;
+
+W baz ();
+
+static inline U
+bar (V x, W y)
+{
+  y = y | y << (W) x;
+  return (U)y;
+}
+
+void
+foo (W w)
+{
+  g = g <<
+bar ((V){baz ()[1], 3, 3, 5, 7},
+(W){w[0], ~(int) 2623676210}) >>
+bar ((V){baz ()[1]},
+(W){-w[0], ~(int) 2623676210});
+}
-- 
2.31.1



[PATCH] [x86]Refine constraint "Bk" to define_special_memory_constraint.

2024-07-24 Thread liuhongt
For the pattern below, RA may still allocate r162 as a v/k register and
try to reload the address with leaq __libc_tsd_CTYPE_B@gottpoff(%rip),
%rsi, which results in a linker error.

(set (reg:DI 162)
 (mem/u/c:DI
   (const:DI (unspec:DI
 [(symbol_ref:DI ("a") [flags 0x60]  )]
 UNSPEC_GOTNTPOFF))

Quote from H.J. on why the linker issues an error:
>What do these do:
>
>leaq    __libc_tsd_CTYPE_B@gottpoff(%rip), %rax
>vmovq   (%rax), %xmm0
>
>From x86-64 TLS psABI:
>
>The assembler generates for the x@gottpoff(%rip) expressions a
>R_X86_64_GOTTPOFF relocation for the symbol x which requests the linker to
>generate a GOT entry with a R_X86_64_TPOFF64 relocation. The offset of
>the GOT entry relative to the end of the instruction is then used in
>the instruction. The R_X86_64_TPOFF64 relocation is processed at
>program startup time by the dynamic linker by looking up the symbol x
>in the modules loaded at that point. The offset is written in the GOT
>entry and later loaded by the addq instruction.
>
>The above code sequence looks wrong to me.
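
For illustration, a minimal sketch of the initial-exec TLS access
involved (hypothetical source; the PR testcase below arranges for RA
to pick a vector register):

```
/* Hypothetical sketch: an initial-exec TLS read.  The psABI expects
   addq x@gottpoff(%rip),%reg with a general register; materializing
   the @gottpoff slot address with lea so the value can be loaded into
   %xmm/%k is what the linker rejects.  */
extern __thread long long x;

long long
read_x (void)
{
  return x;
}
```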

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk and backport?

gcc/ChangeLog:

PR target/116043
* config/i386/constraints.md (Bk): Refine to
define_special_memory_constraint.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr116043.c: New test.
---
 gcc/config/i386/constraints.md   |  2 +-
 gcc/testsuite/gcc.target/i386/pr116043.c | 33 
 2 files changed, 34 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr116043.c

diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md
index 7508d7a58bd..b760e7c221a 100644
--- a/gcc/config/i386/constraints.md
+++ b/gcc/config/i386/constraints.md
@@ -187,7 +187,7 @@ (define_special_memory_constraint "Bm"
   "@internal Vector memory operand."
   (match_operand 0 "vector_memory_operand"))
 
-(define_memory_constraint "Bk"
+(define_special_memory_constraint "Bk"
   "@internal TLS address that allows insn using non-integer registers."
   (and (match_operand 0 "memory_operand")
 (not (match_test "ix86_gpr_tls_address_pattern_p (op)"))))
diff --git a/gcc/testsuite/gcc.target/i386/pr116043.c 
b/gcc/testsuite/gcc.target/i386/pr116043.c
new file mode 100644
index 000..76553496c10
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr116043.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bf16 -O3" } */
+/* { dg-final { scan-assembler-not {(?n)lea.*@gottpoff} } } */
+
+extern __thread int a, c, i, j, k, l;
+int *b;
+struct d {
+  int e;
+} f, g;
+char *h;
+
+void m(struct d *n) {
+  b = &k;
+  for (; n->e; b++, n--) {
+i = b && a;
+if (i)
+  j = c;
+  }
+}
+
+char *o(struct d *n) {
+  for (; n->e;)
+return h;
+}
+
+int q() {
+  if (l)
+return 1;
+  int p = *o(&g);
+  m(&f);
+  m(&g);
+  l = p;
+}
-- 
2.31.1



[PATCH] Relax ix86_hardreg_mov_ok after split1.

2024-07-22 Thread liuhongt
ix86_hardreg_mov_ok was added by r11-5066-gbe39636d9f68c4:

>The solution proposed here is to have the x86 backend/recog prevent
>early RTL passes composing instructions (that set likely_spilled hard
>registers) that they (combine) can't simplify, until after reload.
>We allow sets from pseudo registers, immediate constants and memory
>accesses, but anything more complicated is performed via a temporary
>pseudo.  Not only does this simplify things for the register allocator,
>but any remaining register-to-register moves are easily cleaned up
>by the late optimization passes after reload, such as peephole2 and
>cprop_hardreg.

The restriction is mainly for rtl optimization passes before pass_combine.

But split1 splits

```
(insn 17 13 18 2 (set (reg/i:V4SI 20 xmm0)
(vec_merge:V4SI (const_vector:V4SI [
(const_int -1 [0xffffffffffffffff]) repeated x4
])
(const_vector:V4SI [
(const_int 0 [0]) repeated x4
])
(unspec:QI [
(reg:V4SF 106)
(reg:V4SF 102)
(const_int 0 [0])
] UNSPEC_PCMP))) "/app/example.cpp":20:1 2929 {*avx_cmpv4sf3_1}
 (expr_list:REG_DEAD (reg:V4SF 102)
(expr_list:REG_DEAD (reg:V4SF 106)
(nil))))
```

into:
```
(insn 23 13 24 2 (set (reg:V4SF 107)
(unspec:V4SF [
(reg:V4SF 106)
(reg:V4SF 102)
(const_int 0 [0])
] UNSPEC_PCMP)) "/app/example.cpp":20:1 -1
 (nil))
(insn 24 23 18 2 (set (reg/i:V4SI 20 xmm0)
(subreg:V4SI (reg:V4SF 107) 0)) "/app/example.cpp":20:1 -1
 (nil))
```

There are many splitters generating a MOV insn with a SUBREG that would
have the same problem.
Instead of changing those splitters one by one, the patch relaxes
ix86_hardreg_mov_ok to allow moving a subreg to a hard register after
split1.  ix86_pre_reload_split () is used to replace
!reload_completed && !lra_in_progress.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.cc (ix86_hardreg_mov_ok): Relax mov subreg
to hard register after split1.

gcc/testsuite/ChangeLog:

* g++.target/i386/pr115982.C: New test.
---
 gcc/config/i386/i386.cc  |  5 ++---
 gcc/testsuite/g++.target/i386/pr115982.C | 11 +++
 2 files changed, 13 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/pr115982.C

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 9c2ebe74fc9..77c441893b4 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20212,7 +20212,7 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
 }
 
 /* Return true if a set of DST by the expression SRC should be allowed.
-   This prevents complex sets of likely_spilled hard regs before reload.  */
+   This prevents complex sets of likely_spilled hard regs before split1.  */
 
 bool
 ix86_hardreg_mov_ok (rtx dst, rtx src)
@@ -20224,8 +20224,7 @@ ix86_hardreg_mov_ok (rtx dst, rtx src)
   ? standard_sse_constant_p (src, GET_MODE (dst))
   : x86_64_immediate_operand (src, GET_MODE (dst)))
   && ix86_class_likely_spilled_p (REGNO_REG_CLASS (REGNO (dst)))
-  && !reload_completed
-  && !lra_in_progress)
+  && ix86_pre_reload_split ())
 return false;
   return true;
 }
diff --git a/gcc/testsuite/g++.target/i386/pr115982.C 
b/gcc/testsuite/g++.target/i386/pr115982.C
new file mode 100644
index 000..4b91618405d
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr115982.C
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -O2" } */
+
+typedef float VF __attribute__((__vector_size__(16)));
+typedef int VI __attribute__((__vector_size__(16)));
+
+VI
+foo (VF x)
+{
+  return !x;
+}
-- 
2.31.1



[PATCH v2] [x86][avx512] Optimize maskstore when mask is 0 or -1 in UNSPEC_MASKMOV

2024-07-17 Thread liuhongt
> Also, in case the insn is deleted, do:
>
> emit_note (NOTE_INSN_DELETED);
>
> DONE;
>
> instead of leaving (const_int 0) in the stream.
>
> So, the above insn preparation statements should read:
>
> --cut here--
> if (constm1_operand (operands[2], mode))
>   emit_move_insn (operands[0], operands[1]);
> else
>   emit_note (NOTE_INSN_DELETED);
>
> DONE;
> --cut here--
Changed.
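
At the source level, the cases being optimized look roughly like this
(hypothetical sketch; whether the constant mask survives to the
UNSPEC_MASKMOV pattern depends on earlier passes):

```
#include <immintrin.h>

/* Hypothetical sketch: with all eight lanes enabled the QImode mask
   constant is -1, so the masked store can become a plain vmovups;
   with mask 0 the store can be deleted outright.  Compile with
   -mavx512vl.  */
void
store_all (float *p, __m256 v)
{
  _mm256_mask_storeu_ps (p, 0xff, v);  /* mask all ones */
}

void
store_none (float *p, __m256 v)
{
  _mm256_mask_storeu_ps (p, 0x00, v);  /* mask all zeros */
}
```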

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/115843
* config/i386/predicates.md (const0_or_m1_operand): New
predicate.
* config/i386/sse.md (*<avx512>_store<mode>_mask_1): New
pre_reload define_insn_and_split.
(V): Add V32BF, V16BF, V8BF.
(V4SF_V8HF): Rename to ..
(V24F_128): .. this.
(*vec_concat<mode>): Adjust with V24F_128.
(*vec_concat<mode>_0): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115843.c: New test.
---
 gcc/config/i386/predicates.md|  5 
 gcc/config/i386/sse.md   | 33 
 gcc/testsuite/gcc.target/i386/pr115843.c | 38 
 3 files changed, 70 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115843.c

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 5d0bb1e0f54..680594871de 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -825,6 +825,11 @@ (define_predicate "constm1_operand"
   (and (match_code "const_int")
(match_test "op == constm1_rtx")))
 
+;; Match 0 or -1.
+(define_predicate "const0_or_m1_operand"
+  (ior (match_operand 0 "const0_operand")
+   (match_operand 0 "constm1_operand")))
+
 ;; Match exactly eight.
 (define_predicate "const8_operand"
   (and (match_code "const_int")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index e44822f705b..f54e966bdbb 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -294,6 +294,7 @@ (define_mode_iterator V
(V16SI "TARGET_AVX512F && TARGET_EVEX512") (V8SI "TARGET_AVX") V4SI
(V8DI "TARGET_AVX512F && TARGET_EVEX512")  (V4DI "TARGET_AVX") V2DI
(V32HF "TARGET_AVX512F && TARGET_EVEX512") (V16HF "TARGET_AVX") V8HF
+   (V32BF "TARGET_AVX512F && TARGET_EVEX512") (V16BF "TARGET_AVX") V8BF
(V16SF "TARGET_AVX512F && TARGET_EVEX512") (V8SF "TARGET_AVX") V4SF
(V8DF "TARGET_AVX512F && TARGET_EVEX512")  (V4DF "TARGET_AVX") (V2DF 
"TARGET_SSE2")])
 
@@ -430,8 +431,8 @@ (define_mode_iterator VFB_512
(V16SF "TARGET_EVEX512")
(V8DF "TARGET_EVEX512")])
 
-(define_mode_iterator V4SF_V8HF
-  [V4SF V8HF])
+(define_mode_iterator V24F_128
+  [V4SF V8HF V8BF])
 
 (define_mode_iterator VI48_AVX512VL
   [(V16SI "TARGET_EVEX512") (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
@@ -11543,8 +11544,8 @@ (define_insn "*vec_concatv2sf_sse"
(set_attr "mode" "V4SF,SF,DI,DI")])
 
 (define_insn "*vec_concat<mode>"
-  [(set (match_operand:V4SF_V8HF 0 "register_operand"   "=x,v,x,v")
-   (vec_concat:V4SF_V8HF
+  [(set (match_operand:V24F_128 0 "register_operand"   "=x,v,x,v")
+   (vec_concat:V24F_128
  (match_operand:<ssehalfvecmode> 1 "register_operand" " 0,v,0,v")
  (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" " x,v,m,m")))]
   "TARGET_SSE"
@@ -11559,8 +11560,8 @@ (define_insn "*vec_concat<mode>"
(set_attr "mode" "V4SF,V4SF,V2SF,V2SF")])
 
 (define_insn "*vec_concat<mode>_0"
-  [(set (match_operand:V4SF_V8HF 0 "register_operand"   "=v")
-   (vec_concat:V4SF_V8HF
+  [(set (match_operand:V24F_128 0 "register_operand"   "=v")
+   (vec_concat:V24F_128
  (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm")
  (match_operand:<ssehalfvecmode> 2 "const0_operand")))]
   "TARGET_SSE2"
@@ -28574,6 +28575,26 @@ (define_insn "<avx512>_store<mode>_mask"
(set_attr "memory" "store")
(set_attr "mode" "<sseinsnmode>")])
 
+(define_insn_and_split "*<avx512>_store<mode>_mask_1"
+  [(set (match_operand:V 0 "memory_operand")
+   (unspec:V
+ [(match_operand:V 1 "register_operand")
+  (match_dup 0)
+  (match_operand:<avx512fmaskmode> 2 "const0_or_m1_operand")]
+ UNSPEC_MASKMOV))]
+  "TARGET_AVX512F && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  if (constm1_operand (operands[2], <avx512fmaskmode>mode))
+emit_move_insn (operands[0], operands[1]);
+  else
+emit_note (NOTE_INSN_DELETED);
+
+  DONE;
+})
+
 (define_expand "cbranch<mode>4"
   [(set (reg:CC FLAGS_REG)
(compare:CC (match_operand:VI_AVX_AVX512F 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/i386/pr115843.c 
b/gcc/testsuite/gcc.target/i386/pr115843.c
new file mode 100644
index 000..00d8605757a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115843.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512vl --param vect-partial-vector-usage=2 
-mtune=znver5 -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler-not "kxor\[bw]" } } */
+
+typedef unsigned long long BITBOARD;
+BITBOARD KingPressureMask1[64], KingSafetyMask1[64];
+
+void __attribute__((noinline))

[PATCH] [x86][avx512] Optimize maskstore when mask is 0 or -1 in UNSPEC_MASKMOV

2024-07-16 Thread liuhongt
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

PR target/115843
* config/i386/predicates.md (const0_or_m1_operand): New
predicate.
* config/i386/sse.md (*<avx512>_store<mode>_mask_1): New
pre_reload define_insn_and_split.
(V): Add V32BF, V16BF, V8BF.
(V4SF_V8HF): Rename to ..
(V24F_128): .. this.
(*vec_concat<mode>): Adjust with V24F_128.
(*vec_concat<mode>_0): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115843.c: New test.
---
 gcc/config/i386/predicates.md|  5 
 gcc/config/i386/sse.md   | 32 
 gcc/testsuite/gcc.target/i386/pr115843.c | 38 
 3 files changed, 69 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115843.c

diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 5d0bb1e0f54..680594871de 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -825,6 +825,11 @@ (define_predicate "constm1_operand"
   (and (match_code "const_int")
(match_test "op == constm1_rtx")))
 
+;; Match 0 or -1.
+(define_predicate "const0_or_m1_operand"
+  (ior (match_operand 0 "const0_operand")
+   (match_operand 0 "constm1_operand")))
+
 ;; Match exactly eight.
 (define_predicate "const8_operand"
   (and (match_code "const_int")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index e44822f705b..e11610f4b88 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -294,6 +294,7 @@ (define_mode_iterator V
(V16SI "TARGET_AVX512F && TARGET_EVEX512") (V8SI "TARGET_AVX") V4SI
(V8DI "TARGET_AVX512F && TARGET_EVEX512")  (V4DI "TARGET_AVX") V2DI
(V32HF "TARGET_AVX512F && TARGET_EVEX512") (V16HF "TARGET_AVX") V8HF
+   (V32BF "TARGET_AVX512F && TARGET_EVEX512") (V16BF "TARGET_AVX") V8BF
(V16SF "TARGET_AVX512F && TARGET_EVEX512") (V8SF "TARGET_AVX") V4SF
(V8DF "TARGET_AVX512F && TARGET_EVEX512")  (V4DF "TARGET_AVX") (V2DF 
"TARGET_SSE2")])
 
@@ -430,8 +431,8 @@ (define_mode_iterator VFB_512
(V16SF "TARGET_EVEX512")
(V8DF "TARGET_EVEX512")])
 
-(define_mode_iterator V4SF_V8HF
-  [V4SF V8HF])
+(define_mode_iterator V24F_128
+  [V4SF V8HF V8BF])
 
 (define_mode_iterator VI48_AVX512VL
   [(V16SI "TARGET_EVEX512") (V8SI "TARGET_AVX512VL") (V4SI "TARGET_AVX512VL")
@@ -11543,8 +11544,8 @@ (define_insn "*vec_concatv2sf_sse"
(set_attr "mode" "V4SF,SF,DI,DI")])
 
 (define_insn "*vec_concat<mode>"
-  [(set (match_operand:V4SF_V8HF 0 "register_operand"   "=x,v,x,v")
-   (vec_concat:V4SF_V8HF
+  [(set (match_operand:V24F_128 0 "register_operand"   "=x,v,x,v")
+   (vec_concat:V24F_128
  (match_operand:<ssehalfvecmode> 1 "register_operand" " 0,v,0,v")
  (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" " x,v,m,m")))]
   "TARGET_SSE"
@@ -11559,8 +11560,8 @@ (define_insn "*vec_concat<mode>"
(set_attr "mode" "V4SF,V4SF,V2SF,V2SF")])
 
 (define_insn "*vec_concat<mode>_0"
-  [(set (match_operand:V4SF_V8HF 0 "register_operand"   "=v")
-   (vec_concat:V4SF_V8HF
+  [(set (match_operand:V24F_128 0 "register_operand"   "=v")
+   (vec_concat:V24F_128
  (match_operand:<ssehalfvecmode> 1 "nonimmediate_operand" "vm")
  (match_operand:<ssehalfvecmode> 2 "const0_operand")))]
   "TARGET_SSE2"
@@ -28574,6 +28575,25 @@ (define_insn "<avx512>_store<mode>_mask"
(set_attr "memory" "store")
(set_attr "mode" "<sseinsnmode>")])
 
+(define_insn_and_split "*<avx512>_store<mode>_mask_1"
+  [(set (match_operand:V 0 "memory_operand")
+   (unspec:V
+ [(match_operand:V 1 "register_operand")
+  (match_dup 0)
+  (match_operand:<avx512fmaskmode> 2 "const0_or_m1_operand")]
+ UNSPEC_MASKMOV))]
+  "TARGET_AVX512F"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  if (constm1_operand (operands[2], <avx512fmaskmode>mode))
+  {
+emit_move_insn (operands[0], operands[1]);
+DONE;
+  }
+})
+
 (define_expand "cbranch<mode>4"
   [(set (reg:CC FLAGS_REG)
(compare:CC (match_operand:VI_AVX_AVX512F 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/i386/pr115843.c 
b/gcc/testsuite/gcc.target/i386/pr115843.c
new file mode 100644
index 000..00d8605757a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115843.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512vl --param vect-partial-vector-usage=2 
-mtune=znver5 -mprefer-vector-width=512" } */
+/* { dg-final { scan-assembler-not "kxor\[bw]" } } */
+
+typedef unsigned long long BITBOARD;
+BITBOARD KingPressureMask1[64], KingSafetyMask1[64];
+
+void __attribute__((noinline))
+foo()
+{
+  int i;
+
+  for (i = 0; i < 64; i++) {
+if ((i & 7) == 0) {
+  KingPressureMask1[i] = KingSafetyMask1[i + 1];
+} else if ((i & 7) == 7) {
+  KingPressureMask1[i] = KingSafetyMask1[i - 1];
+} else {
+  KingPressureMask1[i] = KingSafetyMask1[i];
+}
+  }
+}
+
+BITBOARD verify[64] = {1, 1, 2, 3, 4, 5, 6, 6, 9, 9, 10, 11, 12, 13, 14, 14, 
17, 17, 18, 19,
+  20, 21, 22, 22, 25, 25, 26, 27, 28, 29,

[PATCH] Fix SSA_NAME leak due to def_stmt is removed before use_stmt.

2024-07-11 Thread liuhongt
>-  _5 = __atomic_fetch_or_8 (&set_work_pending_p, 1, 0);
>-  # DEBUG old => (long int) _5
>+  _6 = .ATOMIC_BIT_TEST_AND_SET (&set_work_pending_p, 0, 1, 0, 
>__atomic_fetch_or_8);
>+  # DEBUG old => NULL
>   # DEBUG BEGIN_STMT
>-  # DEBUG D#2 => _5 & 1
>+  # DEBUG D#2 => NULL
>...
>-  _10 = ~_5;
>-  _8 = (_Bool) _10;
>-  # DEBUG ret => _8
>+  _8 = _6 == 0;
>+  # DEBUG ret => (_Bool) _10
>
>confirmed.  convert_atomic_bit_not does this, it checks for single_use
>and removes the def, failing to release the name (which would fix this up
>IIRC).
>
>Note the function removes stmts in "wrong" order (before uses of LHS
>are removed), so it requires larger surgery.  And it leaks SSA names.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/115872
* tree-ssa-ccp.cc (convert_atomic_bit_not): Remove use_stmt after 
use_nop_stmt is removed.
(optimize_atomic_bit_test_and): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115872.c: New test.
---
 gcc/testsuite/gcc.target/i386/pr115872.c | 16 
 gcc/tree-ssa-ccp.cc  | 12 
 2 files changed, 24 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115872.c

diff --git a/gcc/testsuite/gcc.target/i386/pr115872.c 
b/gcc/testsuite/gcc.target/i386/pr115872.c
new file mode 100644
index 000..937004456d3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115872.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -g" } */
+
+long set_work_pending_p;
+_Bool set_work_pending() {
+  _Bool __trans_tmp_1;
+  long mask = 1, old = __atomic_fetch_or(&set_work_pending_p, mask, 0);
+  __trans_tmp_1 = old & mask;
+  return !__trans_tmp_1;
+}
+void __queue_work() {
+  _Bool ret = set_work_pending();
+  if (ret)
+__queue_work();
+}
+
diff --git a/gcc/tree-ssa-ccp.cc b/gcc/tree-ssa-ccp.cc
index 3749126b5f7..de83d26d311 100644
--- a/gcc/tree-ssa-ccp.cc
+++ b/gcc/tree-ssa-ccp.cc
@@ -3332,9 +3332,10 @@ convert_atomic_bit_not (enum internal_fn fn, gimple 
*use_stmt,
 return nullptr;
 
   gimple_stmt_iterator gsi;
-  gsi = gsi_for_stmt (use_stmt);
-  gsi_remove (&gsi, true);
   tree var = make_ssa_name (TREE_TYPE (lhs));
+  /* use_stmt need to be removed after use_nop_stmt,
+ so use_lhs can be released.  */
+  gimple *use_stmt_removal = use_stmt;
   use_stmt = gimple_build_assign (var, BIT_AND_EXPR, lhs, and_mask);
   gsi = gsi_for_stmt (use_not_stmt);
   gsi_insert_before (&gsi, use_stmt, GSI_NEW_STMT);
@@ -3344,6 +3345,8 @@ convert_atomic_bit_not (enum internal_fn fn, gimple 
*use_stmt,
   gsi_insert_after (&gsi, g, GSI_NEW_STMT);
   gsi = gsi_for_stmt (use_not_stmt);
   gsi_remove (&gsi, true);
+  gsi = gsi_for_stmt (use_stmt_removal);
+  gsi_remove (&gsi, true);
   return use_stmt;
 }
 
@@ -3646,8 +3649,7 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
   */
}
  var = make_ssa_name (TREE_TYPE (use_rhs));
- gsi = gsi_for_stmt (use_stmt);
- gsi_remove (&gsi, true);
+ gimple* use_stmt_removal = use_stmt;
  g = gimple_build_assign (var, BIT_AND_EXPR, use_rhs,
   and_mask);
  gsi = gsi_for_stmt (use_nop_stmt);
@@ -3664,6 +3666,8 @@ optimize_atomic_bit_test_and (gimple_stmt_iterator *gsip,
  gsi_insert_after (&gsi, g, GSI_NEW_STMT);
  gsi = gsi_for_stmt (use_nop_stmt);
  gsi_remove (&gsi, true);
+ gsi = gsi_for_stmt (use_stmt_removal);
+ gsi_remove (&gsi, true);
}
}
  else
-- 
2.31.1



[PATCH] Rename __{float, double}_u to __x86_{float, double}_u to avoid polluting the namespace.

2024-07-07 Thread liuhongt
I have a build failure on NetBSD as the namespace pollution avoidance causes
a direct hit with the system /usr/include/math.h
===

In file included from /usr/src/local/gcc/obj/gcc/include/emmintrin.h:31,
 from 
/usr/src/local/gcc/obj/x86_64-unknown-netbsd10.99/libstdc++-v3/include/ext/random:45,
 from 
/usr/src/local/gcc/libstdc++-v3/include/precompiled/extc++.h:65:
/usr/src/local/gcc/obj/gcc/include/xmmintrin.h:75:15: error: conflicting 
declaration 'typedef float __float_u'
   75 | typedef float __float_u __attribute__ ((__may_alias__, __aligned__ 
(1)));
  |   ^
In file included from 
/usr/src/local/gcc/obj/x86_64-unknown-netbsd10.99/libstdc++-v3/include/cmath:47,
 from 
/usr/src/local/gcc/obj/x86_64-unknown-netbsd10.99/libstdc++-v3/include/x86_64-unknown-netbsd10.99/bits/stdc++.h:114,
 from 
/usr/src/local/gcc/libstdc++-v3/include/precompiled/extc++.h:32:
/usr/src/local/gcc/obj/gcc/include-fixed/math.h:49:7: note: previous 
declaration as 'union __float_u'
   49 | union __float_u {

As pinski suggested in #c2, use __x86_float_u, which seems less likely to
pollute the namespace.

Bootstrapped and regtested on x86_64-pc-linux{-m32,}.
Ready to push to trunk if there are no other concerns.

gcc/ChangeLog:

PR target/115796
* config/i386/emmintrin.h (__double_u): Rename to ..
(__x86_double_u): .. this.
(_mm_load_sd): Ditto.
(_mm_store_sd): Ditto.
(_mm_loadh_pd): Ditto.
(_mm_loadl_pd): Ditto.
* config/i386/xmmintrin.h (__float_u): Rename to ..
(__x86_float_u): .. this.
(_mm_load_ss): Ditto.
(_mm_store_ss): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115796.c: New test.
---
 gcc/config/i386/emmintrin.h  | 10 +-
 gcc/config/i386/xmmintrin.h  |  6 +++---
 gcc/testsuite/gcc.target/i386/pr115796.c | 24 
 3 files changed, 32 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115796.c

diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index d58030e5c4f..a3fcd7a869c 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -56,7 +56,7 @@ typedef double __m128d __attribute__ ((__vector_size__ (16), 
__may_alias__));
 /* Unaligned version of the same types.  */
 typedef long long __m128i_u __attribute__ ((__vector_size__ (16), 
__may_alias__, __aligned__ (1)));
 typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, 
__aligned__ (1)));
-typedef double __double_u __attribute__ ((__may_alias__, __aligned__ (1)));
+typedef double __x86_double_u __attribute__ ((__may_alias__, __aligned__ (1)));
 
 /* Create a selector for use with the SHUFPD instruction.  */
 #define _MM_SHUFFLE2(fp1,fp0) \
@@ -146,7 +146,7 @@ _mm_load1_pd (double const *__P)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_load_sd (double const *__P)
 {
-  return __extension__ (__m128d) { *(__double_u *)__P, 0.0 };
+  return __extension__ (__m128d) { *(__x86_double_u *)__P, 0.0 };
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
@@ -181,7 +181,7 @@ _mm_storeu_pd (double *__P, __m128d __A)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_store_sd (double *__P, __m128d __A)
 {
-  *(__double_u *)__P = ((__v2df)__A)[0] ;
+  *(__x86_double_u *)__P = ((__v2df)__A)[0] ;
 }
 
 extern __inline double __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
@@ -974,13 +974,13 @@ _mm_unpacklo_pd (__m128d __A, __m128d __B)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_loadh_pd (__m128d __A, double const *__B)
 {
-  return __extension__ (__m128d) { ((__v2df)__A)[0], *(__double_u*)__B };
+  return __extension__ (__m128d) { ((__v2df)__A)[0], *(__x86_double_u*)__B };
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_loadl_pd (__m128d __A, double const *__B)
 {
-  return __extension__ (__m128d) { *(__double_u*)__B, ((__v2df)__A)[1] };
+  return __extension__ (__m128d) { *(__x86_double_u*)__B, ((__v2df)__A)[1] };
 }
 
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index 37e5a94cf10..7f10f96d72c 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -72,7 +72,7 @@ typedef float __m128 __attribute__ ((__vector_size__ (16), 
__may_alias__));
 
 /* Unaligned version of the same type.  */
 typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, 
__aligned__ (1)));
-typedef float __float_u __attribute__ ((__may_alias__, __aligned__ (1)));
+typedef float __x86_float_u __attribute__ ((__may_alias__, __aligned__ (1)));

[PATCH] [committed] Use __builtin_cpu_supports instead of __get_cpuid_count.

2024-07-03 Thread liuhongt
>> Hmm, now all avx512 tests SIGILL when testing with -m32:
>>
>> Dump of assembler code for function __get_cpuid_count:
>> => 0x08049500 <+0>:     kmovd  %eax,%k2
>>    0x08049504 <+4>:     kmovd  %edx,%k1
>>    0x08049508 <+8>:     pushf
>>    0x08049509 <+9>:     pushf
>>    0x0804950a <+10>:    pop    %eax
>>    0x0804950b <+11>:    mov    %eax,%edx
>>
>> looks like __get_cpuid_count is no longer inlined but AVX512 is in
>> effect for it.
>>
>> Maybe use #pragma GCC target around the includes instead?
>
>
> Can the built-in cpu supports be used?

Changed, and verified on both AVX512 and non-AVX512 machines.
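
In sketch form, the new check reduces to calls like this (hypothetical
helper, not from the patch):

```
/* Minimal sketch: __builtin_cpu_supports does its own cpuid handling,
   so the check no longer depends on inlining __get_cpuid_count under
   the right ISA options.  */
int
avx512_host_p (void)
{
  return __builtin_cpu_supports ("avx512f")
         && __builtin_cpu_supports ("avx512vl");
}
```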

Push to trunk.
gcc/testsuite/ChangeLog:

PR target/115748
* gcc.target/i386/avx512-check.h: Use __builtin_cpu_supports
instead of __get_cpuid_count.
---
 gcc/testsuite/gcc.target/i386/avx512-check.h | 46 +---
 1 file changed, 20 insertions(+), 26 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h 
b/gcc/testsuite/gcc.target/i386/avx512-check.h
index 71858a33dac..8ec1a7ccbae 100644
--- a/gcc/testsuite/gcc.target/i386/avx512-check.h
+++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
@@ -38,69 +38,63 @@ __attribute__((noipa,target("no-avx")))
 int
 avx512_runtime_support_p ()
 {
-  unsigned int eax, ebx, ecx, edx;
-
-  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
-return 0;
-
   /* Run AVX512 test only if host has ISA support.  */
-  if (check_osxsave ()
-  && (ebx & bit_AVX512F)
+  if (__builtin_cpu_supports ("avx512f")
 #ifdef AVX512VL
-  && (ebx & bit_AVX512VL)
+  && __builtin_cpu_supports ("avx512vl")
 #endif
 #ifdef AVX512ER
-  && (ebx & bit_AVX512ER)
+  && __builtin_cpu_supports ("avx512er")
 #endif
 #ifdef AVX512CD
-  && (ebx & bit_AVX512CD)
+  && __builtin_cpu_supports ("avx512cd")
 #endif
 #ifdef AVX512DQ
-  && (ebx & bit_AVX512DQ)
+  && __builtin_cpu_supports ("avx512dq")
 #endif
 #ifdef AVX512BW
-  && (ebx & bit_AVX512BW)
+  && __builtin_cpu_supports ("avx512bw")
 #endif
 #ifdef AVX512IFMA
-  && (ebx & bit_AVX512IFMA)
+  && __builtin_cpu_supports ("avx512ifma")
 #endif
 #ifdef AVX512VBMI
-  && (ecx & bit_AVX512VBMI)
+  && __builtin_cpu_supports ("avx512vbmi")
 #endif
 #ifdef AVX5124FMAPS
-  && (edx & bit_AVX5124FMAPS)
+  && __builtin_cpu_supports ("avx5124fmaps")
 #endif
 #ifdef AVX5124VNNIW
-  && (edx & bit_AVX5124VNNIW)
+  && __builtin_cpu_supports ("avx5124vnniw")
 #endif
 #ifdef AVX512VPOPCNTDQ
-  && (ecx & bit_AVX512VPOPCNTDQ)
+  && __builtin_cpu_supports ("avx512vpopcntdq")
 #endif
 #ifdef AVX512BITALG
-  && (ecx & bit_AVX512BITALG)
+  && __builtin_cpu_supports ("avx512bitalg")
 #endif
 #ifdef GFNI
-  && (ecx & bit_GFNI)
+  && __builtin_cpu_supports ("gfni")
 #endif
 #ifdef AVX512VBMI2
-  && (ecx & bit_AVX512VBMI2)
+  && __builtin_cpu_supports ("avx512vbmi2")
 #endif
 #ifdef AVX512VNNI
-  && (ecx & bit_AVX512VNNI)
+  && __builtin_cpu_supports ("avx512vnni")
 #endif
 #ifdef AVX512FP16
-  && (edx & bit_AVX512FP16)
+  && __builtin_cpu_supports ("avx512fp16")
 #endif
 #ifdef VAES
-  && (ecx & bit_VAES)
+  && __builtin_cpu_supports ("vaes")
 #endif
 #ifdef VPCLMULQDQ
-  && (ecx & bit_VPCLMULQDQ)
+  && __builtin_cpu_supports ("vpclmulqdq")
 #endif
 #ifdef AVX512VP2INTERSECT
-  && (edx & bit_AVX512VP2INTERSECT)
+  && __builtin_cpu_supports ("avx512vp2intersect")
 #endif
-  && avx512f_os_support ())
+  )
 {
   return 1;
 }
-- 
2.31.1



[PATCH V2] x86: Update branch hint for Redwood Cove.

2024-07-03 Thread liuhongt
From: "H.J. Lu" 

>The above reads like it would be worth splitting branch_prediction_hints
>into branch_prediction_hints_taken and branch_prediction_hints_not_taken
>given not-taken is the default and thus will just increase code size?
>According to Intel® 64 and IA-32 Architectures Optimization Reference
>Manual[1], Branch Hint is updated for Redwood Cove.
Changed.

cut from [1]-
Starting with the Redwood Cove microarchitecture, if the predictor has
no stored information about a branch, the branch has the Intel® SSE2
branch taken hint (i.e., instruction prefix 3EH). When the codec
decodes the branch, it flips the branch’s prediction from not-taken to
taken. It then flushes the pipeline in front of it and steers this
pipeline to fetch the taken path of the branch.
cut end -

Split the branch_prediction_hints tune into branch_prediction_hints_taken
and branch_prediction_hints_not_taken, and always generate a branch hint
for conditional branches when either tune is enabled. Both tunes are
disabled by default.
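
A minimal sketch of the effect (hypothetical function; the exact branch
direction depends on how the compiler lays out the code):

extern int g (int);

int
f (int x)
{
  /* With branch_prediction_hints_taken, a likely-taken conditional
     jump may be emitted as "ds ; jne" (DS, prefix 3EH); with
     branch_prediction_hints_not_taken an unlikely one gets the
     "cs ; " prefix (CS, 2EH).  */
  if (__builtin_expect (x != 0, 1))
    return g (x);
  return 0;
}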

[1] 
https://www.intel.com/content/www/us/en/content-details/821612/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/

* config/i386/i386.cc (ix86_print_operand): Always generate
branch hint for conditional branches.
* config/i386/i386.h (TARGET_BRANCH_PREDICTION_HINTS): Split
into ..
(TARGET_BRANCH_PREDICTION_HINTS_TAKEN): .. this, and ..
(TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN): .. this.
* config/i386/x86-tune.def (X86_TUNE_BRANCH_PREDICTION_HINTS):
Split into ..
(X86_TUNE_BRANCH_PREDICTION_HINTS_TAKEN): .. this, and ..
(X86_TUNE_BRANCH_PREDICTION_HINTS_NOT_TAKEN): .. this.
---
 gcc/config/i386/i386.cc  | 29 +
 gcc/config/i386/i386.h   |  6 --
 gcc/config/i386/x86-tune.def | 13 +++--
 3 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 1f71ed04be6..ea9cb620f8d 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -14041,7 +14041,8 @@ ix86_print_operand (FILE *file, rtx x, int code)
 
if (!optimize
|| optimize_function_for_size_p (cfun)
-   || !TARGET_BRANCH_PREDICTION_HINTS)
+   || (!TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN
+   && !TARGET_BRANCH_PREDICTION_HINTS_TAKEN))
  return;
 
x = find_reg_note (current_output_insn, REG_BR_PROB, 0);
@@ -14050,25 +14051,13 @@ ix86_print_operand (FILE *file, rtx x, int code)
int pred_val = profile_probability::from_reg_br_prob_note
 (XINT (x, 0)).to_reg_br_prob_base ();
 
-   if (pred_val < REG_BR_PROB_BASE * 45 / 100
-   || pred_val > REG_BR_PROB_BASE * 55 / 100)
- {
-   bool taken = pred_val > REG_BR_PROB_BASE / 2;
-   bool cputaken
- = final_forward_branch_p (current_output_insn) == 0;
-
-   /* Emit hints only in the case default branch prediction
-  heuristics would fail.  */
-   if (taken != cputaken)
- {
-   /* We use 3e (DS) prefix for taken branches and
-  2e (CS) prefix for not taken branches.  */
-   if (taken)
- fputs ("ds ; ", file);
-   else
- fputs ("cs ; ", file);
- }
- }
+   bool taken = pred_val > REG_BR_PROB_BASE / 2;
+   /* We use 3e (DS) prefix for taken branches and
+  2e (CS) prefix for not taken branches.  */
+   if (taken && TARGET_BRANCH_PREDICTION_HINTS_TAKEN)
+ fputs ("ds ; ", file);
+   else if (!taken && TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN)
+ fputs ("cs ; ", file);
  }
return;
  }
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 9ed225ec587..50ebed221dc 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -309,8 +309,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
 #define TARGET_ZERO_EXTEND_WITH_AND \
ix86_tune_features[X86_TUNE_ZERO_EXTEND_WITH_AND]
 #define TARGET_UNROLL_STRLEN   ix86_tune_features[X86_TUNE_UNROLL_STRLEN]
-#define TARGET_BRANCH_PREDICTION_HINTS \
-   ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS]
+#define TARGET_BRANCH_PREDICTION_HINTS_NOT_TAKEN \
+   ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS_NOT_TAKEN]
+#define TARGET_BRANCH_PREDICTION_HINTS_TAKEN \
+   ix86_tune_features[X86_TUNE_BRANCH_PREDICTION_HINTS_TAKEN]
 #define TARGET_DOUBLE_WITH_ADD ix86_tune_features[X86_TUNE_DOUBLE_WITH_ADD]
 #defi

[PATCH][committed] Move runtime check into a separate function and guard it with target ("no-avx")

2024-07-03 Thread liuhongt
The patch avoids a SIGILL on non-AVX512 machines: kmovd was generated
in the dynamic check.

Committed as an obvious fix.
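
A sketch of the idea (hypothetical function), assuming the probe itself
must contain no AVX instructions: target("no-avx") keeps the compiler
from using kmovd inside the check, and noipa keeps the check from being
inlined into AVX512-enabled callers.

__attribute__((noipa, target("no-avx")))
static int
runtime_support_p (void)
{
  /* cpuid/xgetbv-based feature checks go here, compiled without AVX.  */
  return 0;
}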

gcc/testsuite/ChangeLog:

PR target/115748
* gcc.target/i386/avx512-check.h: Move runtime check into a
separate function and guard it with target ("no-avx").
---
 gcc/testsuite/gcc.target/i386/avx512-check.h | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx512-check.h 
b/gcc/testsuite/gcc.target/i386/avx512-check.h
index 0ad9064f637..71858a33dac 100644
--- a/gcc/testsuite/gcc.target/i386/avx512-check.h
+++ b/gcc/testsuite/gcc.target/i386/avx512-check.h
@@ -34,8 +34,9 @@ check_osxsave (void)
   return (ecx & bit_OSXSAVE) != 0;
 }
 
+__attribute__((noipa,target("no-avx")))
 int
-main ()
+avx512_runtime_support_p ()
 {
   unsigned int eax, ebx, ecx, edx;
 
@@ -100,6 +101,17 @@ main ()
   && (edx & bit_AVX512VP2INTERSECT)
 #endif
   && avx512f_os_support ())
+{
+  return 1;
+}
+
+  return 0;
+}
+
+int
+main ()
+{
+  if (avx512_runtime_support_p ())
 {
   DO_TEST ();
 #ifdef DEBUG
-- 
2.31.1



[PATCH] x86: Update branch hint for Redwood Cove.

2024-07-01 Thread liuhongt
From: "H.J. Lu" 

According to Intel® 64 and IA-32 Architectures Optimization Reference
Manual[1], Branch Hint is updated for Redwood Cove.

cut from [1]-
Starting with the Redwood Cove microarchitecture, if the predictor has
no stored information about a branch, the branch has the Intel® SSE2
branch taken hint (i.e., instruction prefix 3EH). When the codec
decodes the branch, it flips the branch’s prediction from not-taken to
taken. It then flushes the pipeline in front of it and steers this
pipeline to fetch the taken path of the branch.
cut end -

For -mtune-ctrl=branch_prediction_hints, always generate a branch hint for
conditional branches; this tune is disabled by default.

[1] 
https://www.intel.com/content/www/us/en/content-details/821612/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/

* config/i386/i386.cc (ix86_print_operand): Always generate
branch hint for conditional branches.
---
 gcc/config/i386/i386.cc | 24 +---
 1 file changed, 5 insertions(+), 19 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 1f71ed04be6..9992b9d6186 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -14050,25 +14050,11 @@ ix86_print_operand (FILE *file, rtx x, int code)
int pred_val = profile_probability::from_reg_br_prob_note
 (XINT (x, 0)).to_reg_br_prob_base ();
 
-   if (pred_val < REG_BR_PROB_BASE * 45 / 100
-   || pred_val > REG_BR_PROB_BASE * 55 / 100)
- {
-   bool taken = pred_val > REG_BR_PROB_BASE / 2;
-   bool cputaken
- = final_forward_branch_p (current_output_insn) == 0;
-
-   /* Emit hints only in the case default branch prediction
-  heuristics would fail.  */
-   if (taken != cputaken)
- {
-   /* We use 3e (DS) prefix for taken branches and
-  2e (CS) prefix for not taken branches.  */
-   if (taken)
- fputs ("ds ; ", file);
-   else
- fputs ("cs ; ", file);
- }
- }
+   bool taken = pred_val > REG_BR_PROB_BASE / 2;
+   /* We use 3e (DS) prefix for taken branches and
+  2e (CS) prefix for not taken branches.  */
+   if (taken)
+ fputs ("ds ; ", file);
  }
return;
  }
-- 
2.31.1



[PATCH 2/3] Extend lshifrtsi3_1_zext to ?k alternative.

2024-06-27 Thread liuhongt
late_combine will combine lshiftrt + zero_extend into *lshifrtsi3_1_zext,
which causes an extra mov between GPR and kmask; add ?k to the pattern.
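
A hypothetical reproducer sketch (compiled with -mavx512bw): the mask
from an AVX512BW compare is shifted and zero-extended, and with the new
?k alternative the shift can stay in a mask register (kshiftrd) instead
of bouncing through a GPR.

#include <immintrin.h>

unsigned long long
f (__m512i a, __m512i b)
{
  __mmask32 m = _mm512_cmpeq_epi16_mask (a, b);
  return m >> 3;   /* zero_extend:DI (lshiftrt:SI ...)  */
}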

gcc/ChangeLog:

PR target/115610
* config/i386/i386.md (*<insn>si3_1_zext): Add alternative ?k,
enable it only for lshiftrt and under avx512bw.
* config/i386/sse.md (*klshrsi3_1_zext): New define_insn, and
add corresponding define_split after it.
---
 gcc/config/i386/i386.md | 19 +--
 gcc/config/i386/sse.md  | 28 
 2 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index fd48e764469..57a10c1af48 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -16836,10 +16836,10 @@ (define_insn "*bmi2_si3_1_zext"
(set_attr "mode" "SI")])
 
 (define_insn "*si3_1_zext"
-  [(set (match_operand:DI 0 "register_operand" "=r,r,r")
+  [(set (match_operand:DI 0 "register_operand" "=r,r,r,?k")
(zero_extend:DI
- (any_shiftrt:SI (match_operand:SI 1 "nonimmediate_operand" "0,rm,rm")
- (match_operand:QI 2 "nonmemory_operand" "cI,r,cI"
+ (any_shiftrt:SI (match_operand:SI 1 "nonimmediate_operand" 
"0,rm,rm,k")
+ (match_operand:QI 2 "nonmemory_operand" 
"cI,r,cI,I"
(clobber (reg:CC FLAGS_REG))]
   "TARGET_64BIT
&& ix86_binary_operator_ok (, SImode, operands, TARGET_APX_NDD)"
@@ -16850,6 +16850,8 @@ (define_insn "*si3_1_zext"
 case TYPE_ISHIFTX:
   return "#";
 
+case TYPE_MSKLOG:
+  return "#";
 default:
   if (operands[2] == const1_rtx
  && (TARGET_SHIFT1 || optimize_function_for_size_p (cfun))
@@ -16860,8 +16862,8 @@ (define_insn "*si3_1_zext"
   : "{l}\t{%2, %k0|%k0, %2}";
 }
 }
-  [(set_attr "isa" "*,bmi2,apx_ndd")
-   (set_attr "type" "ishift,ishiftx,ishift")
+  [(set_attr "isa" "*,bmi2,apx_ndd,avx512bw")
+   (set_attr "type" "ishift,ishiftx,ishift,msklog")
(set (attr "length_immediate")
  (if_then_else
(and (match_operand 2 "const1_operand")
@@ -16869,7 +16871,12 @@ (define_insn "*si3_1_zext"
 (match_test "optimize_function_for_size_p (cfun)")))
(const_string "0")
(const_string "*")))
-   (set_attr "mode" "SI")])
+   (set_attr "mode" "SI")
+   (set (attr "enabled")
+   (if_then_else
+ (eq_attr "alternative" "3")
+ (symbol_ref " == LSHIFTRT && TARGET_AVX512BW")
+ (const_string "*")))])
 
 ;; Convert shift to the shiftx pattern to avoid flags dependency.
 (define_split
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 0be2dcd8891..20665a6f097 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2179,6 +2179,34 @@ (define_split
 (match_dup 2)))
   (unspec [(const_int 0)] UNSPEC_MASKOP)])])
 
+(define_insn "*klshrsi3_1_zext"
+  [(set (match_operand:DI 0 "register_operand" "=k")
+   (zero_extend:DI
+ (lshiftrt:SI (match_operand:SI 1 "register_operand" "k")
+  (match_operand 2 "const_0_to_31_operand" "I"
+  (unspec [(const_int 0)] UNSPEC_MASKOP)]
+  "TARGET_AVX512BW"
+  "kshiftrd\t{%2, %1, %0|%0, %1, %2}"
+[(set_attr "type" "msklog")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "SI")])
+
+(define_split
+  [(set (match_operand:DI 0 "mask_reg_operand")
+   (zero_extend:DI
+ (lshiftrt:SI
+   (match_operand:SI 1 "mask_reg_operand")
+   (match_operand 2 "const_0_to_31_operand"
+(clobber (reg:CC FLAGS_REG))]
+  "TARGET_AVX512BW && reload_completed"
+  [(parallel
+ [(set (match_dup 0)
+  (zero_extend:DI
+(lshiftrt:SI
+  (match_dup 1)
+  (match_dup 2
+  (unspec [(const_int 0)] UNSPEC_MASKOP)])])
+
 (define_insn "ktest"
   [(set (reg:CC FLAGS_REG)
(unspec:CC
-- 
2.31.1



[PATCH 3/3] [x86] Enable late-combine.

2024-06-27 Thread liuhongt
Move pass_stv2 and pass_rpad after the pre-reload pass_late_combine, and
define TARGET_INSN_COST to prevent the post-reload pass_late_combine from
reverting the optimization done in pass_rpad.

Adjust testcases since pass_late_combine generates better code but
breaks scan-assembler checks.

E.g., under a 32-bit target, GCC used to generate a broadcast from the
stack and then do the real operation.
After late_combine, they're combined into embedded broadcast
operations.
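
A sketch of the case (hypothetical function, compiled with -m32
-mavx512f): the scalar addend used to be broadcast from the stack into
a register first; after late_combine the load folds into the arithmetic
as an embedded broadcast (a {1to16} memory operand on vaddps).

typedef float v16sf __attribute__ ((vector_size (64)));

v16sf
add_scalar (v16sf x, float f)
{
  return x + f;   /* scalar f is broadcast to all 16 lanes */
}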

gcc/ChangeLog:

* config/i386/i386-features.cc (ix86_rpad_gate): New function.
* config/i386/i386-options.cc (ix86_override_options_after_change):
Don't disable late_combine.
* config/i386/i386-passes.def: Move pass_stv2 and pass_rpad
after pre_reload pass_late_combine.
* config/i386/i386-protos.h (ix86_rpad_gate): New declare.
* config/i386/i386.cc (ix86_insn_cost): New function.
(TARGET_INSN_COST): Define.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512f-broadcast-pr87767-1.c: Adjust
testcase.
* gcc.target/i386/avx512f-broadcast-pr87767-5.c: Ditto.
* gcc.target/i386/avx512f-fmadd-sf-zmm-7.c: Ditto.
* gcc.target/i386/avx512f-fmsub-sf-zmm-7.c: Ditto.
* gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c: Ditto.
* gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c: Ditto.
* gcc.target/i386/avx512vl-broadcast-pr87767-1.c: Ditto.
* gcc.target/i386/avx512vl-broadcast-pr87767-5.c: Ditto.
* gcc.target/i386/pr91333.c: Ditto.
* gcc.target/i386/vect-strided-4.c: Ditto.
---
 gcc/config/i386/i386-features.cc   | 16 +++-
 gcc/config/i386/i386-options.cc|  4 
 gcc/config/i386/i386-passes.def|  4 ++--
 gcc/config/i386/i386-protos.h  |  1 +
 gcc/config/i386/i386.cc| 18 ++
 .../i386/avx512f-broadcast-pr87767-1.c |  4 ++--
 .../i386/avx512f-broadcast-pr87767-5.c |  1 -
 .../gcc.target/i386/avx512f-fmadd-sf-zmm-7.c   |  2 +-
 .../gcc.target/i386/avx512f-fmsub-sf-zmm-7.c   |  2 +-
 .../gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c  |  2 +-
 .../gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c  |  2 +-
 .../i386/avx512vl-broadcast-pr87767-1.c|  4 ++--
 .../i386/avx512vl-broadcast-pr87767-5.c|  2 --
 gcc/testsuite/gcc.target/i386/pr91333.c|  2 +-
 gcc/testsuite/gcc.target/i386/vect-strided-4.c |  2 +-
 15 files changed, 42 insertions(+), 24 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index 607d1991460..fc224ed06b0 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -2995,6 +2995,16 @@ make_pass_insert_endbr_and_patchable_area (gcc::context 
*ctxt)
   return new pass_insert_endbr_and_patchable_area (ctxt);
 }
 
+bool
+ix86_rpad_gate ()
+{
+  return (TARGET_AVX
+ && TARGET_SSE_PARTIAL_REG_DEPENDENCY
+ && TARGET_SSE_MATH
+ && optimize
+ && optimize_function_for_speed_p (cfun));
+}
+
 /* At entry of the nearest common dominator for basic blocks with
conversions/rcp/sqrt/rsqrt/round, generate a single
vxorps %xmmN, %xmmN, %xmmN
@@ -3232,11 +3242,7 @@ public:
   /* opt_pass methods: */
   bool gate (function *) final override
 {
-  return (TARGET_AVX
- && TARGET_SSE_PARTIAL_REG_DEPENDENCY
- && TARGET_SSE_MATH
- && optimize
- && optimize_function_for_speed_p (cfun));
+  return ix86_rpad_gate ();
 }
 
   unsigned int execute (function *) final override
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 9c12d498928..1ef2c71a7a2 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -1944,10 +1944,6 @@ ix86_override_options_after_change (void)
flag_cunroll_grow_size = flag_peel_loops || optimize >= 3;
 }
 
-  /* Late combine tends to undo some of the effects of STV and RPAD,
- by combining instructions back to their original form.  */
-  if (!OPTION_SET_P (flag_late_combine_instructions))
-flag_late_combine_instructions = 0;
 }
 
 /* Clear stack slot assignments remembered from previous functions.
diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
index 7d96766f7b9..2d29f65da88 100644
--- a/gcc/config/i386/i386-passes.def
+++ b/gcc/config/i386/i386-passes.def
@@ -25,11 +25,11 @@ along with GCC; see the file COPYING3.  If not see
  */
 
   INSERT_PASS_AFTER (pass_postreload_cse, 1, pass_insert_vzeroupper);
-  INSERT_PASS_AFTER (pass_combine, 1, pass_stv, false /* timode_p */);
+  INSERT_PASS_AFTER (pass_late_combine, 1, pass_stv, false /* timode_p */);
   /* Run the 64-bit STV pass before the CSE pass so that CONST0_RTX and
  CONSTM1_RTX generated by the STV pass can be CSEed.  */
   INSERT_PASS_BEFORE (pass_cse2, 1, pass_stv, true /* timode_p */);
 
   INSERT_PASS_BEFORE (pass_shorten_branches, 1, 
pass_insert_endbr_and

[PATCH 0/3][x86] Enable pass_late_combine for x86.

2024-06-27 Thread liuhongt
Because of the issue described in PR115610, late_combine is disabled by
default. The series tries to solve the regressions and enable late_combine.
There are 4 regressions observed.

1. The first one is related to pass_stv2, because late_combine will undo
the transformation done in the pass. Moving the pass after
pass_late_combine solves the issue.

2. The second one is related to pass_rpad: both the pre-reload and
post-reload late_combine would undo the transformation. So besides moving
pass_rpad after the pre-reload late_combine, TARGET_INSN_COST is defined
to prevent the post-reload pass_late_combine from reverting the
optimization done in pass_rpad.

3. The third one is related to the AVX512 kmask: lshiftrt + zero_extend
are combined into *<insn>si3_1_zext, which doesn't support the k
alternative, and the extra move between GPR and KMASK regressed the
gcc.target/i386/zero_extendkmask.c scan-assembler-not (?n)shr[bwl] test.
The solution is extending the pattern with a ?k alternative, just like
what we did before for other patterns.

4. The fourth one is fake: pass_late_combine generates better code but
breaks scan-assembler checks.
E.g., under a 32-bit target, GCC used to generate a broadcast from the
stack and then do the real operation.
After enabling late_combine, they're combined into embedded broadcast
operations.

Tested with SPEC2017, late_combine reduces code size by ~0.6%, which
means there are lots of small improvements.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?


liuhongt (3):
  [avx512 testsuite] Define mask as extern instead of uninitialized
local variables.
  Extend lshifrtsi3_1_zext to ?k alternative.
  [x86] Enable late-combine.

 gcc/config/i386/i386-features.cc  | 16 +++
 gcc/config/i386/i386-options.cc   |  4 ---
 gcc/config/i386/i386-passes.def   |  4 +--
 gcc/config/i386/i386-protos.h |  1 +
 gcc/config/i386/i386.cc   | 18 
 gcc/config/i386/i386.md   | 19 +
 gcc/config/i386/sse.md| 28 +++
 .../gcc.target/i386/avx512bitalg-vpopcntb.c   |  3 +-
 .../gcc.target/i386/avx512bitalg-vpopcntbvl.c |  4 +--
 .../gcc.target/i386/avx512bitalg-vpopcntw.c   |  2 +-
 .../gcc.target/i386/avx512bitalg-vpopcntwvl.c |  4 +--
 .../i386/avx512f-broadcast-pr87767-1.c|  4 +--
 .../i386/avx512f-broadcast-pr87767-5.c|  1 -
 .../gcc.target/i386/avx512f-fmadd-sf-zmm-7.c  |  2 +-
 .../gcc.target/i386/avx512f-fmsub-sf-zmm-7.c  |  2 +-
 .../gcc.target/i386/avx512f-fnmadd-sf-zmm-7.c |  2 +-
 .../gcc.target/i386/avx512f-fnmsub-sf-zmm-7.c |  2 +-
 .../i386/avx512vl-broadcast-pr87767-1.c   |  4 +--
 .../i386/avx512vl-broadcast-pr87767-5.c   |  2 --
 .../i386/avx512vpopcntdq-vpopcntd.c   |  5 ++--
 .../i386/avx512vpopcntdq-vpopcntq.c   |  2 +-
 gcc/testsuite/gcc.target/i386/pr91333.c   |  2 +-
 .../gcc.target/i386/vect-strided-4.c  |  2 +-
 23 files changed, 93 insertions(+), 40 deletions(-)

-- 
2.31.1



[PATCH 1/3] [avx512 testsuite] Define mask as extern instead of uninitialized local variables.

2024-06-27 Thread liuhongt
The testcases are supposed to scan for vpopcnt{b,w,d,q} operations
with a k mask, but the mask is defined as an uninitialized local
variable, which is expanded to 0 at the RTL expand phase.
It is then further simplified away by late_combine, which caused the
scan-assembler failures.
Move the definition of the mask outside the function to make the
testcases more stable.

gcc/testsuite/ChangeLog:

PR target/115610
* gcc.target/i386/avx512bitalg-vpopcntb.c: Define mask as
extern instead of uninitialized local variables.
* gcc.target/i386/avx512bitalg-vpopcntbvl.c: Ditto.
* gcc.target/i386/avx512bitalg-vpopcntw.c: Ditto.
* gcc.target/i386/avx512bitalg-vpopcntwvl.c: Ditto.
* gcc.target/i386/avx512vpopcntdq-vpopcntd.c: Ditto.
* gcc.target/i386/avx512vpopcntdq-vpopcntq.c: Ditto.
---
 gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntb.c| 3 +--
 gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntbvl.c  | 4 ++--
 gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntw.c| 2 +-
 gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntwvl.c  | 4 ++--
 gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntd.c | 5 +++--
 gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntq.c | 2 +-
 6 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntb.c 
b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntb.c
index 44b82c0519d..66d24107c26 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntb.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntb.c
@@ -7,10 +7,9 @@
 #include <immintrin.h>
 
 extern __m512i z, z1;
-
+extern __mmask16 msk;
 int foo ()
 {
-  __mmask16 msk;
   __m512i c = _mm512_popcnt_epi8 (z);
   asm volatile ("" : "+v" (c));
   c = _mm512_mask_popcnt_epi8 (z1, msk, z);
diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntbvl.c 
b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntbvl.c
index 8c2dfaba9c6..8ab05653f7c 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntbvl.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntbvl.c
@@ -11,11 +11,11 @@
 
 extern __m256i y, y_1;
 extern __m128i x, x_1;
+extern __mmask32 msk32;
+extern __mmask16 msk16;
 
 int foo ()
 {
-  __mmask32 msk32;
-  __mmask16 msk16;
   __m256i c256 = _mm256_popcnt_epi8 (y);
   asm volatile ("" : "+v" (c256));
   c256 = _mm256_mask_popcnt_epi8 (y_1, msk32, y);
diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntw.c 
b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntw.c
index 2ef8589f6c1..c741bf48a51 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntw.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntw.c
@@ -7,10 +7,10 @@
 #include <immintrin.h>
 
 extern __m512i z, z1;
+extern __mmask16 msk;
 
 int foo ()
 {
-  __mmask16 msk;
   __m512i c = _mm512_popcnt_epi16 (z);
   asm volatile ("" : "+v" (c));
   c = _mm512_mask_popcnt_epi16 (z1, msk, z);
diff --git a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntwvl.c 
b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntwvl.c
index c976461b12e..79bb3c31e85 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntwvl.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bitalg-vpopcntwvl.c
@@ -11,11 +11,11 @@
 
 extern __m256i y, y_1;
 extern __m128i x, x_1;
+extern __mmask16 msk16;
+extern __mmask8 msk8;
 
 int foo ()
 {
-  __mmask16 msk16;
-  __mmask8 msk8;
   __m256i c256 = _mm256_popcnt_epi16 (y);
   asm volatile ("" : "+v" (c256));
   c256 = _mm256_mask_popcnt_epi16 (y_1, msk16, y);
diff --git a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntd.c 
b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntd.c
index b4d82f97032..776a4753d8e 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntd.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntd.c
@@ -15,11 +15,12 @@
 extern __m128i x, x_1;
 extern __m256i y, y_1;
 extern __m512i z, z_1;
+extern  __mmask16 msk;
+extern  __mmask8 msk8;
+
 
 int foo ()
 {
-  __mmask16 msk;
-  __mmask8 msk8;
   __m128i a = _mm_popcnt_epi32 (x);
   asm volatile ("" : "+v" (a));
   a = _mm_mask_popcnt_epi32 (x_1, msk8, x);
diff --git a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntq.c 
b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntq.c
index e87d6c999b6..c6314ac5deb 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntq.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vpopcntdq-vpopcntq.c
@@ -15,10 +15,10 @@
 extern __m128i x, x_1;
 extern __m256i y, y_1;
 extern __m512i z, z_1;
+extern __mmask8 msk; 
 
 int foo ()
 {
-  __mmask8 msk; 
   __m128i a = _mm_popcnt_epi64 (x);
   asm volatile ("" : "+v" (a));
   a = _mm_mask_popcnt_epi64 (x_1, msk, x);
-- 
2.31.1



[PATCH] Fix native_encode_vector_part for itype when TYPE_PRECISION (itype) == BITS_PER_UNIT

2024-06-27 Thread liuhongt
For the testcase in PR115406, here is part of the dump:

  char D.4882;
  vector(1) <signed-boolean:8> _1;
  vector(1) signed char _2;
  char _5;

  <bb 2> :
  _1 = { -1 };

When assigning { -1 } to a vector(1) <signed-boolean:8>,
since TYPE_PRECISION (itype) <= BITS_PER_UNIT, it sets each bit of the
dest from each vector element. But I think the bit packing should only
apply for TYPE_PRECISION (itype) < BITS_PER_UNIT; i.e. for
vector(1) <signed-boolean:8> the element should be assigned as -1,
instead of 1.
Is there any specific reason vector(1) <signed-boolean:8> is handled
differently from vectors whose boolean elements are narrower than a byte?

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR middle-end/115406
* fold-const.cc (native_encode_vector_part): Don't set each
bit to the dest when TYPE_PRECISION (itype) == BITS_PER_UNIT.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115406.c: New test.
---
 gcc/fold-const.cc|  2 +-
 gcc/testsuite/gcc.target/i386/pr115406.c | 23 +++
 2 files changed, 24 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115406.c

diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index 710d697c021..0f045f851d1 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -8077,7 +8077,7 @@ native_encode_vector_part (const_tree expr, unsigned char 
*ptr, int len,
 {
   tree itype = TREE_TYPE (TREE_TYPE (expr));
   if (VECTOR_BOOLEAN_TYPE_P (TREE_TYPE (expr))
-  && TYPE_PRECISION (itype) <= BITS_PER_UNIT)
+  && TYPE_PRECISION (itype) < BITS_PER_UNIT)
 {
   /* This is the only case in which elements can be smaller than a byte.
 Element 0 is always in the lsb of the containing byte.  */
diff --git a/gcc/testsuite/gcc.target/i386/pr115406.c 
b/gcc/testsuite/gcc.target/i386/pr115406.c
new file mode 100644
index 000..623dff06fc3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115406.c
@@ -0,0 +1,23 @@
+/* { dg-do run } */
+/* { dg-options "-O0 -mavx512f" } */
+/* { dg-require-effective-target avx512f } */
+
+typedef __attribute__((__vector_size__ (1))) char V;
+
+char
+foo (V v)
+{
+  return ((V) v == v)[0];
+}
+
+int
+main ()
+{
+  if (!__builtin_cpu_supports ("avx512f"))
+return 0;
+
+  char x = foo ((V) { });
+  if (x != -1)
+__builtin_abort ();
+  return 0;
+}
-- 
2.31.1



[PATCH 7/7] Remove vcond{, u, eq} expanders since they will be obsolete.

2024-06-27 Thread liuhongt
gcc/ChangeLog:

PR target/115517
* config/i386/mmx.md (vcondv2sf): Removed.
(vcond): Ditto.
(vcond): Ditto.
(vcondu): Ditto.
(vcondu): Ditto.
* config/i386/sse.md (vcond): Ditto.
(vcond): Ditto.
(vcond): Ditto.
(vcond): Ditto.
(vcond): Ditto.
(vcond): Ditto.
(vcond): Ditto.
(vcondv2di): Ditto.
(vcondu): Ditto.
(vcondu): Ditto.
(vcondu): Ditto.
(vconduv2di): Ditto.
(vcondeqv2di): Ditto.
---
 gcc/config/i386/mmx.md |  97 ---
 gcc/config/i386/sse.md | 213 -
 2 files changed, 310 deletions(-)

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 7262bf146c2..17c5205cae2 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -1168,39 +1168,6 @@ (define_expand "vec_cmpv2sfv2si"
   DONE;
 })
 
-(define_expand "vcondv2sf"
-  [(set (match_operand:V2FI 0 "register_operand")
-   (if_then_else:V2FI
- (match_operator 3 ""
-   [(match_operand:V2SF 4 "nonimmediate_operand")
-(match_operand:V2SF 5 "nonimmediate_operand")])
- (match_operand:V2FI 1 "general_operand")
- (match_operand:V2FI 2 "general_operand")))]
-  "TARGET_MMX_WITH_SSE && ix86_partial_vec_fp_math"
-{
-  rtx ops[6];
-  ops[5] = gen_reg_rtx (V4SFmode);
-  ops[4] = gen_reg_rtx (V4SFmode);
-  ops[3] = gen_rtx_fmt_ee (GET_CODE (operands[3]), VOIDmode, ops[4], ops[5]);
-  ops[2] = lowpart_subreg (mode,
-  force_reg (mode, operands[2]),
-  mode);
-  ops[1] = lowpart_subreg (mode,
-  force_reg (mode, operands[1]),
-  mode);
-  ops[0] = gen_reg_rtx (mode);
-
-  emit_insn (gen_movq_v2sf_to_sse (ops[5], operands[5]));
-  emit_insn (gen_movq_v2sf_to_sse (ops[4], operands[4]));
-
-  bool ok = ix86_expand_fp_vcond (ops);
-  gcc_assert (ok);
-
-  emit_move_insn (operands[0], lowpart_subreg (mode, ops[0],
-  mode));
-  DONE;
-})
-
 (define_insn "@sse4_1_insertps_"
   [(set (match_operand:V2FI 0 "register_operand" "=Yr,*x,v")
(unspec:V2FI
@@ -4029,70 +3996,6 @@ (define_expand "vec_cmpu"
   DONE;
 })
 
-(define_expand "vcond"
-  [(set (match_operand:MMXMODE124 0 "register_operand")
-   (if_then_else:MMXMODE124
- (match_operator 3 ""
-   [(match_operand:MMXMODEI 4 "register_operand")
-(match_operand:MMXMODEI 5 "register_operand")])
- (match_operand:MMXMODE124 1)
- (match_operand:MMXMODE124 2)))]
-  "TARGET_MMX_WITH_SSE
-   && (GET_MODE_NUNITS (mode)
-   == GET_MODE_NUNITS (mode))"
-{
-  bool ok = ix86_expand_int_vcond (operands);
-  gcc_assert (ok);
-  DONE;
-})
-
-(define_expand "vcond"
-  [(set (match_operand:VI_16_32 0 "register_operand")
-   (if_then_else:VI_16_32
- (match_operator 3 ""
-   [(match_operand:VI_16_32 4 "register_operand")
-(match_operand:VI_16_32 5 "register_operand")])
- (match_operand:VI_16_32 1)
- (match_operand:VI_16_32 2)))]
-  "TARGET_SSE2"
-{
-  bool ok = ix86_expand_int_vcond (operands);
-  gcc_assert (ok);
-  DONE;
-})
-
-(define_expand "vcondu"
-  [(set (match_operand:MMXMODE124 0 "register_operand")
-   (if_then_else:MMXMODE124
- (match_operator 3 ""
-   [(match_operand:MMXMODEI 4 "register_operand")
-(match_operand:MMXMODEI 5 "register_operand")])
- (match_operand:MMXMODE124 1)
- (match_operand:MMXMODE124 2)))]
-  "TARGET_MMX_WITH_SSE
-   && (GET_MODE_NUNITS (mode)
-   == GET_MODE_NUNITS (mode))"
-{
-  bool ok = ix86_expand_int_vcond (operands);
-  gcc_assert (ok);
-  DONE;
-})
-
-(define_expand "vcondu"
-  [(set (match_operand:VI_16_32 0 "register_operand")
-   (if_then_else:VI_16_32
- (match_operator 3 ""
-   [(match_operand:VI_16_32 4 "register_operand")
-(match_operand:VI_16_32 5 "register_operand")])
- (match_operand:VI_16_32 1)
- (match_operand:VI_16_32 2)))]
-  "TARGET_SSE2"
-{
-  bool ok = ix86_expand_int_vcond (operands);
-  gcc_assert (ok);
-  DONE;
-})
-
 (define_expand "vcond_mask_"
   [(set (match_operand:MMXMODE124 0 "register_operand")
(vec_merge:MMXMODE124
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index d86b6fa81c0..2d6b39c920f 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -4816,72 +4816,6 @@ (define_expand "vec_cmpeqv1tiv1ti"
   DONE;
 })
 
-(define_expand "vcond"
-  [(set (match_operand:V_512 0 "register_operand")
-   (if_then_else:V_512
- (match_operator 3 ""
-   [(match_operand:VF_512 4 "nonimmediate_operand")
-(match_operand:VF_512 5 "nonimmediate_operand")])
- (match_operand:V_512 1 "general_operand")
- (match_operand:V_512 2 "general_operand")))]
-  "TARGET_AVX512F
-   && (GET_MODE_NUNITS (mode)
-   == GE

[PATCH 5/7] Adjust testcase for the regressed testcases after obsolete of vcond{, u, eq}.

2024-06-27 Thread liuhongt
> Richard suggests that we implement the "obvious" transforms like
> inversion in the middle-end but if for example unsigned compares
> are not supported the us_minus + eq + negative trick isn't on
> that list.
>
> The main reason to restrict vec_cmp would be to avoid
> a <= b ? c : d going with an unsupported vec_cmp but instead
> do a > b ? d : c - the alternative is trying to fix this
> on the RTL side via combine.  I understand the non-native

Yes, I have a patch which can fix most regressions via pattern matching
in combine.
Still there is a situation that is difficult to deal with, mainly the
optimizations w/o sse4.1. Because pblendvb/blendvps/blendvpd only
exist under sse4.1, w/o sse4.1 it takes 3
instructions (pand, pandn, por) to simulate the vcond_mask, and
combine matches at most 4 instructions, which makes it currently
impossible to use combine to recover those optimizations from the
vcond{,u,eq} expanders, i.e. min/max.

With sse4.1 and above, there is basically no regression anymore.
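
For reference, a minimal intrinsics sketch of the SSE2 fallback (the
helper name is hypothetical): without pblendvb, a byte-wise select is
emulated as (a & mask) | (b & ~mask), i.e. pand + pandn + por.

#include <emmintrin.h>

static inline __m128i
blend_epi8_sse2 (__m128i mask, __m128i a, __m128i b)
{
  /* mask lanes must be all-ones or all-zeros, as produced by pcmpgtb.  */
  return _mm_or_si128 (_mm_and_si128 (mask, a),
                       _mm_andnot_si128 (mask, b));
}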

the regression testcases w/o sse4.1

FAIL: g++.target/i386/pr100637-1b.C  -std=gnu++14  scan-assembler-times pcmpeqb 
2
FAIL: g++.target/i386/pr100637-1b.C  -std=gnu++17  scan-assembler-times pcmpeqb 
2
FAIL: g++.target/i386/pr100637-1b.C  -std=gnu++20  scan-assembler-times pcmpeqb 
2
FAIL: g++.target/i386/pr100637-1b.C  -std=gnu++98  scan-assembler-times pcmpeqb 
2
FAIL: g++.target/i386/pr100637-1w.C  -std=gnu++14  scan-assembler-times pcmpeqw 
2
FAIL: g++.target/i386/pr100637-1w.C  -std=gnu++17  scan-assembler-times pcmpeqw 
2
FAIL: g++.target/i386/pr100637-1w.C  -std=gnu++20  scan-assembler-times pcmpeqw 
2
FAIL: g++.target/i386/pr100637-1w.C  -std=gnu++98  scan-assembler-times pcmpeqw 
2
FAIL: g++.target/i386/pr103861-1.C  -std=gnu++14  scan-assembler-times pcmpeqb 2
FAIL: g++.target/i386/pr103861-1.C  -std=gnu++17  scan-assembler-times pcmpeqb 2
FAIL: g++.target/i386/pr103861-1.C  -std=gnu++20  scan-assembler-times pcmpeqb 2
FAIL: g++.target/i386/pr103861-1.C  -std=gnu++98  scan-assembler-times pcmpeqb 2
FAIL: gcc.target/i386/pr88540.c scan-assembler minpd

gcc/testsuite/ChangeLog:

PR target/115517
* g++.target/i386/pr100637-1b.C: Add xfail and -mno-sse4.1.
* g++.target/i386/pr100637-1w.C: Ditto.
* g++.target/i386/pr103861-1.C: Ditto.
* gcc.target/i386/pr88540.c: Ditto.
* gcc.target/i386/pr103941-2.c: Add -mno-avx512f.
* g++.target/i386/sse4_1-pr100637-1b.C: New test.
* g++.target/i386/sse4_1-pr100637-1w.C: New test.
* g++.target/i386/sse4_1-pr103861-1.C: New test.
* gcc.target/i386/sse4_1-pr88540.c: New test.
---
 gcc/testsuite/g++.target/i386/pr100637-1b.C |  4 ++--
 gcc/testsuite/g++.target/i386/pr100637-1w.C |  4 ++--
 gcc/testsuite/g++.target/i386/pr103861-1.C  |  4 ++--
 .../g++.target/i386/sse4_1-pr100637-1b.C| 17 +
 .../g++.target/i386/sse4_1-pr100637-1w.C| 17 +
 .../g++.target/i386/sse4_1-pr103861-1.C | 17 +
 gcc/testsuite/gcc.target/i386/pr103941-2.c  |  2 +-
 gcc/testsuite/gcc.target/i386/pr88540.c |  4 ++--
 gcc/testsuite/gcc.target/i386/sse4_1-pr88540.c  | 10 ++
 9 files changed, 70 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/sse4_1-pr100637-1b.C
 create mode 100644 gcc/testsuite/g++.target/i386/sse4_1-pr100637-1w.C
 create mode 100644 gcc/testsuite/g++.target/i386/sse4_1-pr103861-1.C
 create mode 100644 gcc/testsuite/gcc.target/i386/sse4_1-pr88540.c

diff --git a/gcc/testsuite/g++.target/i386/pr100637-1b.C 
b/gcc/testsuite/g++.target/i386/pr100637-1b.C
index 35b5df7c9dd..dccb8f5e712 100644
--- a/gcc/testsuite/g++.target/i386/pr100637-1b.C
+++ b/gcc/testsuite/g++.target/i386/pr100637-1b.C
@@ -1,6 +1,6 @@
 /* PR target/100637 */
 /* { dg-do compile } */
-/* { dg-options "-O2 -msse2" } */
+/* { dg-options "-O2 -msse2 -mno-sse4.1" } */
 
 typedef unsigned char __attribute__((__vector_size__ (4))) __v4qu;
 typedef char __attribute__((__vector_size__ (4))) __v4qi;
@@ -13,5 +13,5 @@ __v4qu us (__v4qi a, __v4qi b) { return (a > b) ? au : bu; }
 __v4qi su (__v4qu a, __v4qu b) { return (a > b) ? as : bs; }
 __v4qi ss (__v4qi a, __v4qi b) { return (a > b) ? as : bs; }
 
-/* { dg-final { scan-assembler-times "pcmpeqb" 2 } } */
+/* { dg-final { scan-assembler-times "pcmpeqb" 2 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times "pcmpgtb" 2 } } */
diff --git a/gcc/testsuite/g++.target/i386/pr100637-1w.C 
b/gcc/testsuite/g++.target/i386/pr100637-1w.C
index a3ed06fddee..a0aab62db33 100644
--- a/gcc/testsuite/g++.target/i386/pr100637-1w.C
+++ b/gcc/testsuite/g++.target/i386/pr100637-1w.C
@@ -1,6 +1,6 @@
 /* PR target/100637 */
 /* { dg-do compile } */
-/* { dg-options "-O2 -msse2" } */
+/* { dg-options "-O2 -msse2 -mno-sse4.1" } */
 
 typedef unsigned short __attribute__((__vector_size__ (4))) __v2hu;
 typedef short __attribute__((__vector_size__ (4))) __v2hi;
@@ -13,5 +13,5 @@ __v2hu

[PATCH 2/7] Lower AVX512 kmask comparison back to AVX2 comparison when op_{true, false} is vector -1/0.

2024-06-27 Thread liuhongt
gcc/ChangeLog
PR target/115517
* config/i386/sse.md
(*_cvtmask2_not): New pre_reload
splitter.
(*_cvtmask2_not): Ditto.
(*avx2_pcmp3_6): Ditto.
(*avx2_pcmp3_7): Ditto.
---
 gcc/config/i386/sse.md | 97 ++
 1 file changed, 97 insertions(+)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 1148ac84f3d..822159a869b 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -9986,6 +9986,24 @@ (define_insn "*_cvtmask2"
   [(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+(define_insn_and_split "*_cvtmask2_not"
+  [(set (match_operand:VI12_AVX512VL 0 "register_operand")
+   (vec_merge:VI12_AVX512VL
+ (match_operand:VI12_AVX512VL 2 "const0_operand")
+ (match_operand:VI12_AVX512VL 3 "vector_all_ones_operand")
+ (match_operand: 1 "register_operand")))]
+  "TARGET_AVX512BW && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 4)
+   (not: (match_dup 1)))
+   (set (match_dup 0)
+   (vec_merge:VI12_AVX512VL
+ (match_dup 3)
+ (match_dup 2)
+ (match_dup 4)))]
+  "operands[4] = gen_reg_rtx (mode);")
+
 (define_expand "_cvtmask2"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand")
(vec_merge:VI48_AVX512VL
@@ -10024,6 +10042,24 @@ (define_insn_and_split 
"*_cvtmask2"
(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
+(define_insn_and_split "*_cvtmask2_not"
+  [(set (match_operand:VI48_AVX512VL 0 "register_operand")
+   (vec_merge:VI48_AVX512VL
+ (match_operand:VI48_AVX512VL 2 "const0_operand")
+ (match_operand:VI48_AVX512VL 3 "vector_all_ones_operand")
+ (match_operand: 1 "register_operand")))]
+  "TARGET_AVX512F && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 4)
+   (not: (match_dup 1)))
+   (set (match_dup 0)
+   (vec_merge:VI48_AVX512VL
+ (match_dup 3)
+ (match_dup 2)
+ (match_dup 4)))]
+  "operands[4] = gen_reg_rtx (mode);")
+
 (define_insn "*_cvtmask2_pternlog_false_dep"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand" "=v")
(vec_merge:VI48_AVX512VL
@@ -17675,6 +17711,67 @@ (define_insn_and_split "*avx2_pcmp3_5"
 std::swap (operands[1], operands[2]);
 })
 
+(define_int_attr pcmp_usmin
+  [(UNSPEC_PCMP "smin") (UNSPEC_UNSIGNED_PCMP "umin")])
+
+(define_insn_and_split "*avx2_pcmp3_6"
+ [(set (match_operand:VI_128_256  0 "register_operand")
+   (vec_merge:VI_128_256
+ (match_operand:VI_128_256 1 "vector_all_ones_operand")
+ (match_operand:VI_128_256 2 "const0_operand")
+ (unspec:
+   [(match_operand:VI_128_256 3 "nonimmediate_operand")
+(match_operand:VI_128_256 4 "nonimmediate_operand")
+(match_operand:SI 5 "const_0_to_7_operand")]
+UNSPEC_PCMP_ITER)))]
+  "TARGET_AVX512VL && ix86_pre_reload_split ()
+   && (INTVAL (operands[5]) == 2 || INTVAL (operands[5]) == 5)"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  rtx dst_min = gen_reg_rtx (mode);
+
+  if (MEM_P (operands[3]) && MEM_P (operands[4]))
+operands[3] = force_reg (mode, operands[3]);
+  emit_insn (gen_3 (dst_min, operands[3], operands[4]));
+  rtx eq_op = INTVAL (operands[5]) == 2 ? operands[3] : operands[4];
+  emit_move_insn (operands[0], gen_rtx_EQ (mode, eq_op, dst_min));
+  DONE;
+})
+
+(define_insn_and_split "*avx2_pcmp3_7"
+ [(set (match_operand:VI_128_256  0 "register_operand")
+   (vec_merge:VI_128_256
+ (match_operand:VI_128_256 1 "const0_operand")
+ (match_operand:VI_128_256 2 "vector_all_ones_operand")
+ (unspec:
+   [(match_operand:VI_128_256 3 "nonimmediate_operand")
+(match_operand:VI_128_256 4 "nonimmediate_operand")
+(match_operand:SI 5 "const_0_to_7_operand")]
+UNSPEC_PCMP_ITER)))]
+  "TARGET_AVX512VL && ix86_pre_reload_split ()
+ /* NE is commutative.  */
+   && (INTVAL (operands[5]) == 4
+ /* LE, 3 must be register.  */
+   || INTVAL (operands[5]) == 2
+ /* NLT aka GE, 4 must be register and we swap operands.  */
+   || INTVAL (operands[5]) == 5)"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  if (INTVAL (operands[5]) == 5)
+std::swap (operands[3], operands[4]);
+
+  if (MEM_P (operands[3]))
+operands[3] = force_reg (mode, operands[3]);
+  enum rtx_code code = INTVAL (operands[5]) != 4 ? GT : EQ;
+  emit_move_insn (operands[0], gen_rtx_fmt_ee (code, mode,
+  operands[3], operands[4]));
+  DONE;
+})
+
 (define_expand "_eq3"
   [(set (match_operand: 0 "register_operand")
(unspec:
-- 
2.31.1



[PATCH 4/7] Add more splitter for mskmov with avx512 comparison.

2024-06-27 Thread liuhongt
gcc/ChangeLog:

PR target/115517
* config/i386/sse.md
(*_movmsk_lt_avx512): New
define_insn_and_split.
(*_movmsk_ext_lt_avx512):
Ditto.
(*_pmovmskb_lt_avx512): Ditto.
(*_pmovmskb_zext_lt_avx512): Ditto.
(*sse2_pmovmskb_ext_lt_avx512): Ditto.
(*pmovsk_kmask_v16qi_avx512): Ditto.
(*pmovsk_mask_v32qi_avx512): Ditto.
(*pmovsk_mask_cmp__avx512): Ditto.
(*pmovsk_ptest__avx512): Ditto.
---
 gcc/config/i386/sse.md | 232 +
 1 file changed, 209 insertions(+), 23 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 92f8b74999f..5996ad99606 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -10049,24 +10049,6 @@ (define_insn "*_cvtmask2"
   [(set_attr "prefix" "evex")
(set_attr "mode" "")])
 
-(define_insn_and_split "*_cvtmask2_not"
-  [(set (match_operand:VI12_AVX512VL 0 "register_operand")
-   (vec_merge:VI12_AVX512VL
- (match_operand:VI12_AVX512VL 2 "const0_operand")
- (match_operand:VI12_AVX512VL 3 "vector_all_ones_operand")
- (match_operand: 1 "register_operand")))]
-  "TARGET_AVX512BW && ix86_pre_reload_split ()"
-  "#"
-  "&& 1"
-  [(set (match_dup 4)
-   (not: (match_dup 1)))
-   (set (match_dup 0)
-   (vec_merge:VI12_AVX512VL
- (match_dup 3)
- (match_dup 2)
- (match_dup 4)))]
-  "operands[4] = gen_reg_rtx (mode);")
-
 (define_expand "_cvtmask2"
   [(set (match_operand:VI48_AVX512VL 0 "register_operand")
(vec_merge:VI48_AVX512VL
@@ -10106,10 +10088,10 @@ (define_insn_and_split 
"*_cvtmask2"
(set_attr "mode" "")])
 
 (define_insn_and_split "*_cvtmask2_not"
-  [(set (match_operand:VI48_AVX512VL 0 "register_operand")
-   (vec_merge:VI48_AVX512VL
- (match_operand:VI48_AVX512VL 2 "const0_operand")
- (match_operand:VI48_AVX512VL 3 "vector_all_ones_operand")
+  [(set (match_operand:VI1248_AVX512VLBW 0 "register_operand")
+   (vec_merge:VI1248_AVX512VLBW
+ (match_operand:VI1248_AVX512VLBW 2 "const0_operand")
+ (match_operand:VI1248_AVX512VLBW 3 "vector_all_ones_operand")
  (match_operand: 1 "register_operand")))]
   "TARGET_AVX512F && ix86_pre_reload_split ()"
   "#"
@@ -10117,7 +10099,7 @@ (define_insn_and_split 
"*_cvtmask2_not"
   [(set (match_dup 4)
(not: (match_dup 1)))
(set (match_dup 0)
-   (vec_merge:VI48_AVX512VL
+   (vec_merge:VI1248_AVX512VLBW
  (match_dup 3)
  (match_dup 2)
  (match_dup 4)))]
@@ -21753,6 +21735,30 @@ (define_insn_and_split 
"*_movmsk_lt"
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "")])
 
+(define_insn_and_split "*_movmsk_lt_avx512"
+  [(set (match_operand:SI 0 "register_operand" "=r,jr")
+   (unspec:SI
+ [(subreg:VF_128_256
+   (vec_merge:
+(match_operand: 3 "vector_all_ones_operand")
+(match_operand: 4 "const0_operand")
+(unspec:
+ [(match_operand: 1 "register_operand" "x,x")
+  (match_operand: 2 "const0_operand")
+  (const_int 1)]
+ UNSPEC_PCMP)) 0)]
+ UNSPEC_MOVMSK))]
+  "TARGET_SSE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+   (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK))]
+  "operands[1] = gen_lowpart (mode, operands[1]);"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "")])
+
 (define_insn_and_split "*_movmsk_ext_lt"
   [(set (match_operand:DI 0 "register_operand" "=r,jr")
(any_extend:DI
@@ -21772,6 +21778,31 @@ (define_insn_and_split 
"*_movmsk_ext_lt"
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "")])
 
+(define_insn_and_split 
"*_movmsk_ext_lt_avx512"
+  [(set (match_operand:DI 0 "register_operand" "=r,jr")
+   (any_extend:DI
+ (unspec:SI
+   [(subreg:VF_128_256
+ (vec_merge:
+  (match_operand: 3 "vector_all_ones_operand")
+  (match_operand: 4 "const0_operand")
+  (unspec:
+   [(match_operand: 1 "register_operand" "x,x")
+(match_operand: 2 "const0_operand")
+(const_int 1)]
+   UNSPEC_PCMP)) 0)]
+   UNSPEC_MOVMSK)))]
+  "TARGET_64BIT && TARGET_SSE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+   (any_extend:DI (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)))]
+  "operands[1] = gen_lowpart (mode, operands[1]);"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "")])
+
 (define_insn_and_split "*_movmsk_shift"
   [(set (match_operand:SI 0 "register_operand" "=r,jr")
(unspec:SI
@@ -21961,6 +21992,34 @@ (define_insn_and_split "*_pmovmskb_lt"
(set_attr "prefix" "maybe_vex")
(set_attr "mode" "SI")])
 
+(define_insn_and_split "*_pmovmskb_lt_avx512"
+  [(set (match_operand:SI 0 "regist

[PATCH 6/7] [x86] Optimize a < 0 ? -1 : 0 to (signed)a >> 31.

2024-06-27 Thread liuhongt
Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
and x < 0 ? 1 : 0 into (unsigned) x >> 31 (for 32-bit elements; in
general the shift count is the element precision minus one).

Add define_insn_and_split for the optimization previously done in
ix86_expand_int_vcond.
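
A minimal sketch of the transform for 32-bit elements (function names
are hypothetical):

typedef int v4si __attribute__ ((vector_size (16)));
typedef unsigned int v4su __attribute__ ((vector_size (16)));

v4si
all_ones_if_neg (v4si x)
{
  return x < 0;                 /* becomes x >> 31, i.e. psrad $31 */
}

v4su
one_if_neg (v4si x)
{
  return (v4su) (x < 0) & 1;    /* becomes (unsigned) x >> 31, i.e. psrld $31 */
}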

gcc/ChangeLog:

PR target/115517
* config/i386/sse.md ("*ashr3_1"): New
define_insn_and_split.
(*avx512_ashr3_1): Ditto.
(*avx2_lshr3_1): Ditto.
(*avx2_lshr3_2): Ditto and add 2 combine splitter after
it.
* config/i386/mmx.md (mmxscalarsize): New mode attribute.
(*mmx_ashr3_1): New define_insn_and_split.
("mmx_3): Add a combine splitter after it.
(*mmx_ashrv2hi3_1): New define_insn_and_split, also add a
combine splitter after it.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx2-pr115517.c: New test.
* gcc.target/i386/avx512-pr115517.c: New test.
* g++.target/i386/avx2-pr115517.C: New test.
* g++.target/i386/avx512-pr115517.C: New test.
* gcc.target/i386/pr111023-2.c: Adjust testcase.
* gcc.target/i386/vect-div-1.c: Ditto.
---
 gcc/config/i386/mmx.md| 52 
 gcc/config/i386/sse.md| 83 +++
 gcc/testsuite/g++.target/i386/avx2-pr115517.C | 60 ++
 .../g++.target/i386/avx512-pr115517.C | 70 
 gcc/testsuite/gcc.target/i386/avx2-pr115517.c | 33 
 .../gcc.target/i386/avx512-pr115517.c | 70 
 gcc/testsuite/gcc.target/i386/pr111023-2.c|  4 +-
 gcc/testsuite/gcc.target/i386/vect-div-1.c|  3 +-
 8 files changed, 372 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/avx2-pr115517.C
 create mode 100644 gcc/testsuite/g++.target/i386/avx512-pr115517.C
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-pr115517.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512-pr115517.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index ea53f516cbb..7262bf146c2 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -135,6 +135,14 @@ (define_mode_attr mmxscalarmodelower
(V4HI "hi") (V2HI "hi")
(V8QI "qi")])
 
+(define_mode_attr mmxscalarsize
+  [(V1DI "64")
+   (V2SI "32") (V2SF "32")
+   (V4HF "16") (V4BF "16")
+   (V2HF "16") (V2BF "16")
+   (V4HI "16") (V2HI "16")
+   (V8QI "8")])
+
 (define_mode_attr Yv_Yw
   [(V8QI "Yw") (V4HI "Yw") (V2SI "Yv") (V1DI "Yv") (V2SF "Yv")])
 
@@ -3608,6 +3616,17 @@ (define_insn "mmx_ashr3"
(const_string "0")))
(set_attr "mode" "DI,TI,TI")])
 
+(define_insn_and_split "*mmx_ashr3_1"
+  [(set (match_operand:MMXMODE24 0 "register_operand")
+   (lt:MMXMODE24
+ (match_operand:MMXMODE24 1 "register_operand")
+ (match_operand:MMXMODE24 2 "const0_operand")))]
+  "TARGET_MMX_WITH_SSE && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (ashiftrt:MMXMODE24 (match_dup 1) (match_dup 3)))]
+  "operands[3] = gen_int_mode ( - 1, DImode);")
+
 (define_expand "ashr3"
   [(set (match_operand:MMXMODE24 0 "register_operand")
 (ashiftrt:MMXMODE24
@@ -3634,6 +3653,17 @@ (define_insn "mmx_3"
(const_string "0")))
(set_attr "mode" "DI,TI,TI")])
 
+(define_split
+  [(set (match_operand:MMXMODE248 0 "register_operand")
+   (and:MMXMODE248
+ (lt:MMXMODE248
+   (match_operand:MMXMODE248 1 "register_operand")
+   (match_operand:MMXMODE248 2 "const0_operand"))
+ (match_operand:MMXMODE248 3 "const1_operand")))]
+  "TARGET_MMX_WITH_SSE && ix86_pre_reload_split ()"
+  [(set (match_dup 0) (lshiftrt:MMXMODE248 (match_dup 1) (match_dup 4)))]
+  "operands[4] = gen_int_mode ( - 1, DImode);")
+
 (define_expand "3"
   [(set (match_operand:MMXMODE24 0 "register_operand")
 (any_lshift:MMXMODE24
@@ -3675,6 +3705,28 @@ (define_insn "v2hi3"
(const_string "0")))
(set_attr "mode" "TI")])
 
+(define_insn_and_split "*mmx_ashrv2hi3_1"
+  [(set (match_operand:V2HI 0 "register_operand")
+   (lt:V2HI
+ (match_operand:V2HI 1 "register_operand")
+ (match_operand:V2HI 2 "const0_operand")))]
+  "TARGET_SSE2 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0) (ashiftrt:V2HI (match_dup 1) (match_dup 3)))]
+  "operands[3] = gen_int_mode (15, DImode);")
+
+(define_split
+  [(set (match_operand:V2HI 0 "register_operand")
+   (and:V2HI
+ (lt:V2HI
+   (match_operand:V2HI 1 "register_operand")
+   (match_operand:V2HI 2 "const0_operand"))
+ (match_operand:V2HI 3 "const1_operand")))]
+  "TARGET_SSE2 && ix86_pre_reload_split ()"
+  [(set (match_dup 0) (lshiftrt:V2HI (match_dup 1) (match_dup 4)))]
+  "operands[4] = gen_int_mode (15, DImode);")
+
 (define_expand "v8qi3"
   [(set (match_operand:V8QI 0 "register_operand")
(any_shift:V8QI (match_operand:V8QI 1 "register_operand")
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 5996ad99606..d86b6fa81c0 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1

[PATCH 1/7] [x86] Add more splitters to match (unspec [op1 op2 (gt op3 constm1_operand)] UNSPEC_BLENDV)

2024-06-27 Thread liuhongt
These define_insn_and_split patterns are needed after the vcond{,u,eq}
expanders become obsolete.

gcc/ChangeLog:

PR target/115517
* config/i386/sse.md
(*_blendv_gt): New
define_insn_and_split.
(*_blendv_gtint):
Ditto.
(*_blendv_not_gtint):
Ditto.
(*_pblendvb_gt): Ditto.
(*_pblendvb_gt_subreg_not): Ditto.
---
 gcc/config/i386/sse.md | 130 +
 1 file changed, 130 insertions(+)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 0be2dcd8891..1148ac84f3d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -23016,6 +23016,32 @@ (define_insn_and_split 
"*_blendv_lt"
(set_attr "btver2_decode" "vector,vector,vector") 
(set_attr "mode" "")])
 
+(define_insn_and_split "*_blendv_gt"
+  [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x")
+   (unspec:VF_128_256
+ [(match_operand:VF_128_256 1 "vector_operand" "Yrja,*xja,xjm")
+  (match_operand:VF_128_256 2 "register_operand" "0,0,x")
+  (gt:VF_128_256
+(match_operand: 3 "register_operand" "Yz,Yz,x")
+(match_operand: 4 "vector_all_ones_operand"))]
+ UNSPEC_BLENDV))]
+  "TARGET_SSE4_1"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+   (unspec:VF_128_256
+[(match_dup 2) (match_dup 1) (match_dup 3)] UNSPEC_BLENDV))]
+  "operands[3] = gen_lowpart (mode, operands[3]);"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "addr" "gpr16")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix_data16" "1,1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,orig,vex")
+   (set_attr "btver2_decode" "vector,vector,vector")
+   (set_attr "mode" "")])
+
 (define_mode_attr ssefltmodesuffix
   [(V2DI "pd") (V4DI "pd") (V4SI "ps") (V8SI "ps")
(V2DF "pd") (V4DF "pd") (V4SF "ps") (V8SF "ps")])
@@ -23055,6 +23081,38 @@ (define_insn_and_split 
"*_blendv_ltint"
(set_attr "btver2_decode" "vector,vector,vector") 
(set_attr "mode" "")])
 
+(define_insn_and_split 
"*_blendv_gtint"
+  [(set (match_operand: 0 "register_operand" "=Yr,*x,x")
+   (unspec:
+ [(match_operand: 1 "vector_operand" "Yrja,*xja,xjm")
+  (match_operand: 2 "register_operand" "0,0,x")
+  (subreg:
+(gt:VI48_AVX
+  (match_operand:VI48_AVX 3 "register_operand" "Yz,Yz,x")
+  (match_operand:VI48_AVX 4 "vector_all_ones_operand")) 0)]
+ UNSPEC_BLENDV))]
+  "TARGET_SSE4_1"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0)
+   (unspec:
+[(match_dup 2) (match_dup 1) (match_dup 3)] UNSPEC_BLENDV))]
+{
+  operands[0] = gen_lowpart (mode, operands[0]);
+  operands[1] = gen_lowpart (mode, operands[1]);
+  operands[2] = gen_lowpart (mode, operands[2]);
+  operands[3] = gen_lowpart (mode, operands[3]);
+}
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "addr" "gpr16")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix_data16" "1,1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,orig,vex")
+   (set_attr "btver2_decode" "vector,vector,vector")
+   (set_attr "mode" "")])
+
 ;; PR target/100738: Transform vpcmpeqd + vpxor + vblendvps to vblendvps for 
inverted mask;
 (define_insn_and_split 
"*_blendv_not_ltint"
   [(set (match_operand: 0 "register_operand")
@@ -23082,6 +23140,32 @@ (define_insn_and_split 
"*_blendv_not_lt
   operands[3] = gen_lowpart (mode, operands[3]);
 })
 
+(define_insn_and_split 
"*_blendv_not_gtint"
+  [(set (match_operand: 0 "register_operand")
+   (unspec:
+ [(match_operand: 1 "vector_operand")
+  (match_operand: 2 "register_operand")
+  (subreg:
+(gt:VI48_AVX
+  (subreg:VI48_AVX
+  (not:
+(match_operand: 3 "register_operand")) 0)
+  (match_operand:VI48_AVX 4 "vector_all_ones_operand")) 0)]
+ UNSPEC_BLENDV))]
+  "TARGET_SSE4_1 && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (unspec:
+[(match_dup 1) (match_dup 2) (match_dup 3)] UNSPEC_BLENDV))]
+{
+  operands[0] = gen_lowpart (mode, operands[0]);
+  operands[2] = gen_lowpart (mode, operands[2]);
+  operands[1] = force_reg (mode,
+  gen_lowpart (mode, operands[1]));
+  operands[3] = gen_lowpart (mode, operands[3]);
+})
+
 (define_insn "_dp"
   [(set (match_operand:VF_128_256 0 "register_operand" "=Yr,*x,x")
(unspec:VF_128_256
@@ -23236,6 +23320,30 @@ (define_insn_and_split "*_pblendvb_lt"
(set_attr "btver2_decode" "vector,vector,vector")
(set_attr "mode" "")])
 
+(define_insn_and_split "*_pblendvb_gt"
+  [(set (match_operand:VI1_AVX2 0 "register_operand" "=Yr,*x,x")
+   (unspec:VI1_AVX2
+ [(match_operand:VI1_AVX2 1 "vector_operand" "Yrja,*xja,xjm")
+  (match_operand:VI1_AVX2 2 "register_operand" "0,0,x")
+  (gt:VI1_AVX2 (match_operand:VI

[PATCH 0/7][x86] Remove vcond{,u,eq} expanders.

2024-06-27 Thread liuhongt
There are several regressions after obsoleting vcond{,u,eq}.
Some regressions are due to the direct optimizations in
ix86_expand_{fp,int}_vcond, e.g. ix86_expand_sse_fp_minmax.
Some regressions are due to optimizations that rely on canonicalization
in ix86_expand_{fp,int}_vcond.

This series adds define_split or define_insn_and_split patterns to
restore those optimizations at pass_combine. It fixes most regressions
in the GCC testsuite except for ones compiled w/o sse4.1. W/o sse4.1 it
takes 3 instructions for a vector conditional move, and pass_combine
only supports combining at most 4 instructions. One possible solution
is to add fake "ssemovcc" instructions to help combine, and split them
back to real instructions later. This series doesn't handle that, but
just adjusts the testcases to XFAIL.

I also tested performance on SPEC2017 with different option sets:
-march=sapphirerapids -O2
-march=x86-64-v3 -O2
-march=x86-64 -O2
-march=sapphirerapids -O2
No obvious performance change was observed; mostly the same binaries.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Any comments?

liuhongt (7):
  [x86] Add more splitters to match (unspec [op1 op2 (gt op3
constm1_operand)] UNSPEC_BLENDV)
  Lower AVX512 kmask comparison back to AVX2 comparison when
op_{true,false} is vector -1/0.
  [x86] Match IEEE min/max with UNSPEC_IEEE_{MIN,MAX}.
  Add more splitter for mskmov with avx512 comparison.
  Adjust testcase for the regressed testcases after obsolete of
vcond{,u,eq}.
  [x86] Optimize a < 0 ? -1 : 0 to (signed)a >> 31.
  Remove vcond{,u,eq} expanders since they will be obsolete.

 gcc/config/i386/mmx.md| 149 ++--
 gcc/config/i386/sse.md| 772 +-
 gcc/testsuite/g++.target/i386/avx2-pr115517.C |  60 ++
 .../g++.target/i386/avx512-pr115517.C |  70 ++
 gcc/testsuite/g++.target/i386/pr100637-1b.C   |   4 +-
 gcc/testsuite/g++.target/i386/pr100637-1w.C   |   4 +-
 gcc/testsuite/g++.target/i386/pr103861-1.C|   4 +-
 .../g++.target/i386/sse4_1-pr100637-1b.C  |  17 +
 .../g++.target/i386/sse4_1-pr100637-1w.C  |  17 +
 .../g++.target/i386/sse4_1-pr103861-1.C   |  17 +
 gcc/testsuite/gcc.target/i386/avx2-pr115517.c |  33 +
 .../gcc.target/i386/avx512-pr115517.c |  70 ++
 gcc/testsuite/gcc.target/i386/pr103941-2.c|   2 +-
 gcc/testsuite/gcc.target/i386/pr111023-2.c|   4 +-
 gcc/testsuite/gcc.target/i386/pr88540.c   |   4 +-
 .../gcc.target/i386/sse4_1-pr88540.c  |  10 +
 gcc/testsuite/gcc.target/i386/vect-div-1.c|   3 +-
 17 files changed, 918 insertions(+), 322 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/avx2-pr115517.C
 create mode 100644 gcc/testsuite/g++.target/i386/avx512-pr115517.C
 create mode 100644 gcc/testsuite/g++.target/i386/sse4_1-pr100637-1b.C
 create mode 100644 gcc/testsuite/g++.target/i386/sse4_1-pr100637-1w.C
 create mode 100644 gcc/testsuite/g++.target/i386/sse4_1-pr103861-1.C
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-pr115517.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512-pr115517.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse4_1-pr88540.c

-- 
2.31.1



[PATCH 3/7] [x86] Match IEEE min/max with UNSPEC_IEEE_{MIN,MAX}.

2024-06-27 Thread liuhongt
These versions of the min/max patterns implement exactly the operations
   min = (op1 < op2 ? op1 : op2)
   max = (!(op1 < op2) ? op1 : op2)
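
For readers unfamiliar with the SSE semantics: since the compare is a
plain '<', the result is asymmetric for NaNs (when either operand is a
NaN the compare is false and op2 is returned), which matches what the
hardware min/max instructions do. A minimal scalar sketch, illustration
only and not part of the patch:

  static inline float
  ieee_min (float op1, float op2)
  {
    /* With a NaN in either operand the compare is false, so op2 is
       returned; min (x, NaN) = NaN but min (NaN, x) = x.  */
    return op1 < op2 ? op1 : op2;
  }

  static inline float
  ieee_max (float op1, float op2)
  {
    return !(op1 < op2) ? op1 : op2;
  }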

gcc/ChangeLog:
PR target/115517
* config/i386/sse.md (*minmax3_1): New pre_reload
define_insn_and_split.
(*minmax3_2): Ditto.
---
 gcc/config/i386/sse.md | 63 ++
 1 file changed, 63 insertions(+)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 822159a869b..92f8b74999f 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -3064,6 +3064,69 @@ (define_insn "*3"
(set_attr "prefix" "")
(set_attr "mode" "")])
 
+(define_insn_and_split "*minmax3_1"
+  [(set (match_operand:VFH 0 "register_operand")
+   (vec_merge:VFH
+ (match_operand:VFH 1 "nonimmediate_operand")
+ (match_operand:VFH 2 "nonimmediate_operand")
+ (unspec:
+   [(match_operand:VFH 3 "nonimmediate_operand")
+(match_operand:VFH 4 "nonimmediate_operand")
+(match_operand:SI 5 "const_0_to_31_operand")]
+UNSPEC_PCMP)))]
+  "TARGET_SSE && ix86_pre_reload_split ()
+   && ((rtx_equal_p (operands[1], operands[3])
+   && rtx_equal_p (operands[2], operands[4]))
+   || (rtx_equal_p (operands[1], operands[4])
+  && rtx_equal_p (operands[2], operands[3])))
+   && (INTVAL (operands[5]) == 1 || INTVAL (operands[5]) == 14)"
+   "#"
+   "&& 1"
+   [(const_int 0)]
+ {
+   int u = UNSPEC_IEEE_MIN;
+   if ((INTVAL (operands[5]) == 1 && rtx_equal_p (operands[1], operands[4]))
+   || (INTVAL (operands[5]) == 14 && rtx_equal_p (operands[1], 
operands[3])))
+ u = UNSPEC_IEEE_MAX;
+
+   if (MEM_P (operands[1]))
+ operands[1] = force_reg (mode, operands[1]);
+   rtvec v = gen_rtvec (2, operands[1], operands[2]);
+   rtx tmp = gen_rtx_UNSPEC (mode, v, u);
+   emit_move_insn (operands[0], tmp);
+   DONE;
+ })
+
+(define_insn_and_split "*minmax3_2"
+  [(set (match_operand:VF_128_256 0 "register_operand")
+   (unspec:VF_128_256
+ [(match_operand:VF_128_256 1 "nonimmediate_operand")
+  (match_operand:VF_128_256 2 "nonimmediate_operand")
+  (lt:VF_128_256
+(match_operand:VF_128_256 3 "nonimmediate_operand")
+(match_operand:VF_128_256 4 "nonimmediate_operand"))]
+UNSPEC_BLENDV))]
+  "TARGET_SSE && ix86_pre_reload_split ()
+   && ((rtx_equal_p (operands[1], operands[3])
+   && rtx_equal_p (operands[2], operands[4]))
+   || (rtx_equal_p (operands[1], operands[4])
+  && rtx_equal_p (operands[2], operands[3])))"
+   "#"
+   "&& 1"
+   [(const_int 0)]
+ {
+   int u = UNSPEC_IEEE_MIN;
+   if (rtx_equal_p (operands[1], operands[3]))
+ u = UNSPEC_IEEE_MAX;
+
+   if (MEM_P (operands[2]))
+     operands[2] = force_reg (mode, operands[2]);
+   rtvec v = gen_rtvec (2, operands[2], operands[1]);
+   rtx tmp = gen_rtx_UNSPEC (mode, v, u);
+   emit_move_insn (operands[0], tmp);
+   DONE;
+ })
+
 ;; These versions of the min/max patterns implement exactly the operations
 ;;   min = (op1 < op2 ? op1 : op2)
 ;;   max = (!(op1 < op2) ? op1 : op2)
-- 
2.31.1



[PATCH V2] Fix wrong cost of MEM when addr is a lea.

2024-06-26 Thread liuhongt
> But rtx_cost invokes targetm.rtx_cost which allows to avoid that
> recursive processing at any level.  You're dealing with MEM [addr]
> here, so why's rtx_cost (addr, Pmode, MEM, 0, speed) not always
> the best way to deal with this?  Since this is the MEM [addr] case
> we know it's not LEA, no?
The patch restricts the MEM rtx_cost reduction to register_operand + disp only.


Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?


416.gamess regressed 4-6% on x86_64 since my r15-882-g1d6199e5f8c1c0.
The commit adjusted rtx_cost of MEM to reduce the cost of (add op0 disp).
But the cost of ADDR can be cheaper than that of XEXP (addr, 0) when the
address is a lea. That is the case in the PR, so the patch adjusts
rtx_cost to handle only reg + disp; the other forms are basically all
LEA, which doesn't have the additional cost of an ADD.
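
For illustration (the addresses below are invented), the distinction is:

  movl  16(%rdi), %eax          # reg + disp: materializing the address
                                # separately would need an extra add
  movl  (%rdi,%rax,4), %eax     # base + index (+ disp): formed by a
                                # single lea, so no extra add

so only the first form keeps the extra unit of cost.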

gcc/ChangeLog:

PR target/115462
* config/i386/i386.cc (ix86_rtx_costs): Make cost of MEM (reg +
disp) just a little bit more than MEM (reg).

gcc/testsuite/ChangeLog:
* gcc.target/i386/pr115462.c: New test.
---
 gcc/config/i386/i386.cc  |  5 -
 gcc/testsuite/gcc.target/i386/pr115462.c | 22 ++
 2 files changed, 26 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115462.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d4ccc24be6e..ef2a1e4f4f2 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22339,7 +22339,10 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
	    address_cost should be used, but it reduces the cost too much.
	    So the current solution is to make a constant disp as cheap as possible.  */
  if (GET_CODE (addr) == PLUS
- && x86_64_immediate_operand (XEXP (addr, 1), Pmode))
+ && x86_64_immediate_operand (XEXP (addr, 1), Pmode)
+	 /* Only handle (reg + disp) since other forms of addr are mostly LEA,
+	    there's no additional cost for the plus of disp.  */
+ && register_operand (XEXP (addr, 0), Pmode))
{
  *total += 1;
  *total += rtx_cost (XEXP (addr, 0), Pmode, PLUS, 0, speed);
diff --git a/gcc/testsuite/gcc.target/i386/pr115462.c 
b/gcc/testsuite/gcc.target/i386/pr115462.c
new file mode 100644
index 000..ad50a6382bc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115462.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -fno-tree-vectorize -fno-pic" } */
+/* { dg-final { scan-assembler-times {(?n)movl[ \t]+.*, p1\.0\+[0-9]*\(,} 3 } } */
+
+int
+foo (long indx, long indx2, long indx3, long indx4, long indx5, long indx6, 
long n, int* q)
+{
+  static int p1[1];
+  int* p2 = p1 + 1000;
+  int* p3 = p1 + 4000;
+  int* p4 = p1 + 8000;
+
+  for (long i = 0; i != n; i++)
+{
+  /* scan for movl	%edi, p1.0+3996(,%rax,4),
+     p1.0+3996 should be propagated into the loop.  */
+  p2[indx++] = q[indx++];
+  p3[indx2++] = q[indx2++];
+  p4[indx3++] = q[indx3++];
+}
+  return p1[indx6] + p1[indx5];
+}
-- 
2.31.1



[PATCH] Fix wrong cost of MEM when addr is a lea.

2024-06-25 Thread liuhongt
416.gamess regressed 4-6% on x86_64 since my r15-882-g1d6199e5f8c1c0.
The commit adjusted rtx_cost of MEM to reduce the cost of (add op0 disp).
But the cost of ADDR can be cheaper than that of XEXP (addr, 0) when the
address is a lea. That is the case in the PR; the patch uses the lower
cost to enable more simplification and fix the regression.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/115462
* config/i386/i386.cc (ix86_rtx_costs): Use cost of addr when
it's lower than rtx_cost (XEXP (addr, 0)) + 1.

gcc/testsuite/ChangeLog:
* gcc.target/i386/pr115462.c: New test.
---
 gcc/config/i386/i386.cc  |  9 +++--
 gcc/testsuite/gcc.target/i386/pr115462.c | 22 ++
 2 files changed, 29 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115462.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d4ccc24be6e..83dab8220dd 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22341,8 +22341,13 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
  if (GET_CODE (addr) == PLUS
  && x86_64_immediate_operand (XEXP (addr, 1), Pmode))
{
- *total += 1;
- *total += rtx_cost (XEXP (addr, 0), Pmode, PLUS, 0, speed);
+ /* PR115462: Cost of ADDR could be cheaper than XEXP (addr, 0)
+when it's a lea, use lower cost to enable more
+simplification.  */
+ unsigned cost1 = rtx_cost (addr, Pmode, MEM, 0, speed);
+ unsigned cost2 = rtx_cost (XEXP (addr, 0), Pmode,
+PLUS, 0, speed) + 1;
+ *total += MIN (cost1, cost2);
  return true;
}
}
diff --git a/gcc/testsuite/gcc.target/i386/pr115462.c 
b/gcc/testsuite/gcc.target/i386/pr115462.c
new file mode 100644
index 000..ad50a6382bc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115462.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -fno-tree-vectorize -fno-pic" } */
+/* { dg-final { scan-assembler-times {(?n)movl[ \t]+.*, p1\.0\+[0-9]*\(,} 3 } } */
+
+int
+foo (long indx, long indx2, long indx3, long indx4, long indx5, long indx6, 
long n, int* q)
+{
+  static int p1[1];
+  int* p2 = p1 + 1000;
+  int* p3 = p1 + 4000;
+  int* p4 = p1 + 8000;
+
+  for (long i = 0; i != n; i++)
+{
+  /* scan for movl	%edi, p1.0+3996(,%rax,4),
+     p1.0+3996 should be propagated into the loop.  */
+  p2[indx++] = q[indx++];
+  p3[indx2++] = q[indx2++];
+  p4[indx3++] = q[indx3++];
+}
+  return p1[indx6] + p1[indx5];
+}
-- 
2.31.1



[PATCH V3 Committed] [x86] Optimize a < 0 ? -1 : 0 to (signed)a >> 31.

2024-06-25 Thread liuhongt
Here's the patch committed.

Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
and x < 0 ? 1 : 0 into (unsigned) x >> 31.

Move the optimization done in ix86_expand_int_vcond to match.pd.
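
A scalar sketch of the two identities for 32-bit ints (illustration
only; the patch applies the same per-element transform to vectors):

  int
  all_ones_if_neg (int a)
  {
    /* a < 0 ? -1 : 0: the arithmetic shift smears the sign bit.  */
    return a >> 31;
  }

  int
  one_if_neg (int a)
  {
    /* a < 0 ? 1 : 0: the logical shift isolates the sign bit.  */
    return (int) ((unsigned) a >> 31);
  }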

gcc/ChangeLog:

PR target/114189
* match.pd: Simplify a < 0 ? -1 : 0 to (signed) a >> 31 and a <
0 ? 1 : 0 to (unsigned) a >> 31 for vector integer type.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx2-pr115517.c: New test.
* gcc.target/i386/avx512-pr115517.c: New test.
* g++.target/i386/avx2-pr115517.C: New test.
* g++.target/i386/avx512-pr115517.C: New test.
* g++.dg/tree-ssa/pr88152-1.C: Adjust testcase.
---
 gcc/match.pd  | 30 
 gcc/testsuite/g++.dg/tree-ssa/pr88152-1.C |  2 +-
 gcc/testsuite/g++.target/i386/avx2-pr115517.C | 60 
 .../g++.target/i386/avx512-pr115517.C | 70 +++
 gcc/testsuite/gcc.target/i386/avx2-pr115517.c | 33 +
 .../gcc.target/i386/avx512-pr115517.c | 70 +++
 6 files changed, 264 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/g++.target/i386/avx2-pr115517.C
 create mode 100644 gcc/testsuite/g++.target/i386/avx512-pr115517.C
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-pr115517.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512-pr115517.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 3d0689c9312..cf8a399a744 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5927,6 +5927,36 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(if (VECTOR_INTEGER_TYPE_P (type)
&& target_supports_op_p (type, MINMAX, optab_vector))
 (minmax @0 @1
+
+/* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
+   and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
+(simplify
+  (vec_cond (lt @0 integer_zerop) integer_all_onesp integer_zerop)
+   (if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (@0))
+   && !TYPE_UNSIGNED (TREE_TYPE (@0))
+   && tree_nop_conversion_p (type, TREE_TYPE (@0))
+   && target_supports_op_p (TREE_TYPE (@0), RSHIFT_EXPR, optab_scalar))
+(with
+  {
+   unsigned int prec = element_precision (TREE_TYPE (@0));
+  }
+(view_convert
+  (rshift @0 { build_int_cst (integer_type_node, prec - 1);})
+
+(simplify
+  (vec_cond (lt @0 integer_zerop) integer_onep integer_zerop)
+   (if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (@0))
+   && !TYPE_UNSIGNED (TREE_TYPE (@0))
+   && tree_nop_conversion_p (type, TREE_TYPE (@0)))
+(with
+ {
+   unsigned int prec = element_precision (TREE_TYPE (@0));
+   tree utype = unsigned_type_for (TREE_TYPE (@0));
+ }
+ (if (target_supports_op_p (utype, RSHIFT_EXPR, optab_scalar))
+  (view_convert
+   (rshift (view_convert:utype @0)
+   { build_int_cst (integer_type_node, prec - 1);}))
 #endif
 
 (for cnd (cond vec_cond)
diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr88152-1.C 
b/gcc/testsuite/g++.dg/tree-ssa/pr88152-1.C
index 423ec897c1d..21299b886f0 100644
--- a/gcc/testsuite/g++.dg/tree-ssa/pr88152-1.C
+++ b/gcc/testsuite/g++.dg/tree-ssa/pr88152-1.C
@@ -1,7 +1,7 @@
 // PR target/88152
 // { dg-do compile }
 // { dg-options "-O2 -std=c++14 -fdump-tree-forwprop1" }
-// { dg-final { scan-tree-dump-times " (?:<|>=) \{ 0\[, ]" 120 "forwprop1" } }
+// { dg-final { scan-tree-dump-times " (?:(?:<|>=) \{ 0\[, \]|>> 
(?:7|15|31|63))" 120 "forwprop1" } }
 
 template 
 using V [[gnu::vector_size (sizeof (T) * N)]] = T;
diff --git a/gcc/testsuite/g++.target/i386/avx2-pr115517.C 
b/gcc/testsuite/g++.target/i386/avx2-pr115517.C
new file mode 100644
index 000..ec000c57542
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/avx2-pr115517.C
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2" } */
+/* { dg-final { scan-assembler-times "vpsrlq" 2 } } */
+/* { dg-final { scan-assembler-times "vpsrld" 2 } } */
+/* { dg-final { scan-assembler-times "vpsrlw" 2 } } */
+
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef long long v2di __attribute__((vector_size(16)));
+typedef long long v4di __attribute__((vector_size(32)));
+
+v8hi
+foo (v8hi a)
+{
+  v8hi const1_op = __extension__(v8hi){1,1,1,1,1,1,1,1};
+  v8hi const0_op = __extension__(v8hi){0,0,0,0,0,0,0,0};
+  return a < const0_op ? const1_op : const0_op;
+}
+
+v16hi
+foo2 (v16hi a)
+{
+  v16hi const1_op = __extension__(v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
+  v16hi const0_op = __extension__(v16hi){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+  return a < const0_op ? const1_op : const0_op;
+}
+
+v4si
+foo3 (v4si a)
+{
+  v4si const1_op = __extension__(v4si){1,1,1,1};
+  v4si const0_op = __extension__(v4si){0,0,0,0};
+  return a < const0_op ? const1_op : const0_op;
+}
+
+v8si
+foo4 (v8si a)
+{
+  v8si const1_op = __extension__(v8si){1,1,1,1,1,1,1,1};
+  v8si const0_op = __extens

[PATCH V2] [x86] Optimize a < 0 ? -1 : 0 to (signed)a >> 31.

2024-06-23 Thread liuhongt
> I think the check for TYPE_UNSIGNED should be of TREE_TYPE (@0) rather
> than type here.

Changed

> Or maybe you need `types_match (type, TREE_TYPE (@0))` too.
And use tree_nop_conversion_p (type, TREE_TYPE (@0)) and add view_convert to 
rshift.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?


Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
and x < 0 ? 1 : 0 into (unsigned) x >> 31.

Move the optimization done in ix86_expand_int_vcond to match.pd.

gcc/ChangeLog:

PR target/114189
* match.pd: Simplify a < 0 ? -1 : 0 to (signed) a >> 31 and a <
0 ? 1 : 0 to (unsigned) a >> 31 for vector integer type.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx2-pr115517.c: New test.
* gcc.target/i386/avx512-pr115517.c: New test.
* g++.target/i386/avx2-pr115517.C: New test.
* g++.target/i386/avx512-pr115517.C: New test.
* g++.dg/tree-ssa/pr88152-1.C: Adjust testcase.
---
 gcc/match.pd  | 31 
 gcc/testsuite/g++.dg/tree-ssa/pr88152-1.C |  2 +-
 gcc/testsuite/g++.target/i386/avx2-pr115517.C | 60 
 .../g++.target/i386/avx512-pr115517.C | 70 +++
 gcc/testsuite/gcc.target/i386/avx2-pr115517.c | 33 +
 .../gcc.target/i386/avx512-pr115517.c | 70 +++
 6 files changed, 265 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/g++.target/i386/avx2-pr115517.C
 create mode 100644 gcc/testsuite/g++.target/i386/avx512-pr115517.C
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-pr115517.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512-pr115517.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 3d0689c9312..1d10451d0de 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5927,6 +5927,37 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(if (VECTOR_INTEGER_TYPE_P (type)
&& target_supports_op_p (type, MINMAX, optab_vector))
 (minmax @0 @1
+
+/* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
+   and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
+(simplify
+  (vec_cond (lt @0 integer_zerop) integer_all_onesp integer_zerop)
+   (if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (@0))
+   && !TYPE_UNSIGNED (TREE_TYPE (@0))
+   && tree_nop_conversion_p (type, TREE_TYPE (@0))
+   && target_supports_op_p (TREE_TYPE (@0), RSHIFT_EXPR, optab_scalar))
+(with
+  {
+   unsigned int prec = element_precision (TREE_TYPE (@0));
+  }
+(view_convert:type
+  (rshift @0 { build_int_cst (integer_type_node, prec - 1);})
+
+(simplify
+  (vec_cond (lt @0 integer_zerop) integer_onep integer_zerop)
+   (if (VECTOR_INTEGER_TYPE_P (TREE_TYPE (@0))
+   && !TYPE_UNSIGNED (TREE_TYPE (@0))
+   && tree_nop_conversion_p (type, TREE_TYPE (@0))
+   && target_supports_op_p (unsigned_type_for (TREE_TYPE (@0)),
+   RSHIFT_EXPR, optab_scalar))
+(with
+  {
+   unsigned int prec = element_precision (TREE_TYPE (@0));
+   tree utype = unsigned_type_for (TREE_TYPE (@0));
+  }
+(view_convert:type
+  (rshift (view_convert:utype @0)
+ { build_int_cst (integer_type_node, prec - 1);})
 #endif
 
 (for cnd (cond vec_cond)
diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr88152-1.C 
b/gcc/testsuite/g++.dg/tree-ssa/pr88152-1.C
index 423ec897c1d..21299b886f0 100644
--- a/gcc/testsuite/g++.dg/tree-ssa/pr88152-1.C
+++ b/gcc/testsuite/g++.dg/tree-ssa/pr88152-1.C
@@ -1,7 +1,7 @@
 // PR target/88152
 // { dg-do compile }
 // { dg-options "-O2 -std=c++14 -fdump-tree-forwprop1" }
-// { dg-final { scan-tree-dump-times " (?:<|>=) \{ 0\[, ]" 120 "forwprop1" } }
+// { dg-final { scan-tree-dump-times " (?:(?:<|>=) \{ 0\[, \]|>> 
(?:7|15|31|63))" 120 "forwprop1" } }
 
 template 
 using V [[gnu::vector_size (sizeof (T) * N)]] = T;
diff --git a/gcc/testsuite/g++.target/i386/avx2-pr115517.C 
b/gcc/testsuite/g++.target/i386/avx2-pr115517.C
new file mode 100644
index 000..ec000c57542
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/avx2-pr115517.C
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2" } */
+/* { dg-final { scan-assembler-times "vpsrlq" 2 } } */
+/* { dg-final { scan-assembler-times "vpsrld" 2 } } */
+/* { dg-final { scan-assembler-times "vpsrlw" 2 } } */
+
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef long long v2di __attribute__((vector_size(16)));
+typedef long long v4di __attribute__((vector_size(32)));
+
+v8hi
+foo (v8hi a)
+{
+  v8hi const1_op = __extension__(v8hi){1,1,1,1,1,1,1,1};
+  v8hi const0_op = __extension__(v8hi){0,0,0,0,0,0,0,0};
+  return a < const0_op ? const1_op : const0_op;
+}
+
+v16hi
+foo2 (v16hi a)
+{
+  v16hi const1_op = __extension__(v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
+  v16hi const0_op = __extension__(v16hi){0,0,0,0,0,0,0,0

[PATCH] [match.pd] Optimize a < 0 ? -1 : 0 to (signed)a >> 31.

2024-06-20 Thread liuhongt
Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
and x < 0 ? 1 : 0 into (unsigned) x >> 31.

Move the optimization done in ix86_expand_int_vcond to match.pd.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}, aarch64-linux-gnu.
Ok for trunk?

gcc/ChangeLog:

PR target/114189
* match.pd: Simplify a < 0 ? -1 : 0 to (signed) a >> 31 and a <
0 ? 1 : 0 to (unsigned) a >> 31 for vector integer type.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx2-pr115517.c: New test.
* gcc.target/i386/avx512-pr115517.c: New test.
* g++.target/i386/avx2-pr115517.C: New test.
* g++.target/i386/avx512-pr115517.C: New test.
* g++.dg/tree-ssa/pr88152-1.C: Adjust testcase.
---
 gcc/match.pd  | 28 
 gcc/testsuite/g++.dg/tree-ssa/pr88152-1.C |  2 +-
 gcc/testsuite/g++.target/i386/avx2-pr115517.C | 60 
 .../g++.target/i386/avx512-pr115517.C | 70 +++
 gcc/testsuite/gcc.target/i386/avx2-pr115517.c | 33 +
 .../gcc.target/i386/avx512-pr115517.c | 70 +++
 6 files changed, 262 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/g++.target/i386/avx2-pr115517.C
 create mode 100644 gcc/testsuite/g++.target/i386/avx512-pr115517.C
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-pr115517.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512-pr115517.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 3d0689c9312..41dd90493e7 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5927,6 +5927,34 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(if (VECTOR_INTEGER_TYPE_P (type)
&& target_supports_op_p (type, MINMAX, optab_vector))
 (minmax @0 @1
+
+/* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
+   and x < 0 ? 1 : 0 into (unsigned) x >> 31.  */
+(simplify
+  (vec_cond (lt @0 integer_zerop) integer_all_onesp integer_zerop)
+   (if (VECTOR_INTEGER_TYPE_P (type)
+   && !TYPE_UNSIGNED (type)
+   && target_supports_op_p (type, RSHIFT_EXPR, optab_scalar))
+(with
+  {
+   unsigned int prec = element_precision (type);
+  }
+(rshift @0 { build_int_cst (integer_type_node, prec - 1);}
+
+(simplify
+  (vec_cond (lt @0 integer_zerop) integer_onep integer_zerop)
+   (if (VECTOR_INTEGER_TYPE_P (type)
+   && !TYPE_UNSIGNED (type)
+   && target_supports_op_p (unsigned_type_for (type),
+RSHIFT_EXPR, optab_scalar))
+(with
+  {
+   unsigned int prec = element_precision (type);
+   tree utype = unsigned_type_for (type);
+  }
+(view_convert:type
+  (rshift (view_convert:utype @0)
+ { build_int_cst (integer_type_node, prec - 1);})
 #endif
 
 (for cnd (cond vec_cond)
diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr88152-1.C 
b/gcc/testsuite/g++.dg/tree-ssa/pr88152-1.C
index 423ec897c1d..21299b886f0 100644
--- a/gcc/testsuite/g++.dg/tree-ssa/pr88152-1.C
+++ b/gcc/testsuite/g++.dg/tree-ssa/pr88152-1.C
@@ -1,7 +1,7 @@
 // PR target/88152
 // { dg-do compile }
 // { dg-options "-O2 -std=c++14 -fdump-tree-forwprop1" }
-// { dg-final { scan-tree-dump-times " (?:<|>=) \{ 0\[, ]" 120 "forwprop1" } }
+// { dg-final { scan-tree-dump-times " (?:(?:<|>=) \{ 0\[, \]|>> 
(?:7|15|31|63))" 120 "forwprop1" } }
 
 template 
 using V [[gnu::vector_size (sizeof (T) * N)]] = T;
diff --git a/gcc/testsuite/g++.target/i386/avx2-pr115517.C 
b/gcc/testsuite/g++.target/i386/avx2-pr115517.C
new file mode 100644
index 000..ec000c57542
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/avx2-pr115517.C
@@ -0,0 +1,60 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx2 -O2" } */
+/* { dg-final { scan-assembler-times "vpsrlq" 2 } } */
+/* { dg-final { scan-assembler-times "vpsrld" 2 } } */
+/* { dg-final { scan-assembler-times "vpsrlw" 2 } } */
+
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef long long v2di __attribute__((vector_size(16)));
+typedef long long v4di __attribute__((vector_size(32)));
+
+v8hi
+foo (v8hi a)
+{
+  v8hi const1_op = __extension__(v8hi){1,1,1,1,1,1,1,1};
+  v8hi const0_op = __extension__(v8hi){0,0,0,0,0,0,0,0};
+  return a < const0_op ? const1_op : const0_op;
+}
+
+v16hi
+foo2 (v16hi a)
+{
+  v16hi const1_op = __extension__(v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
+  v16hi const0_op = __extension__(v16hi){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+  return a < const0_op ? const1_op : const0_op;
+}
+
+v4si
+foo3 (v4si a)
+{
+  v4si const1_op = __extension__(v4si){1,1,1,1};
+  v4si const0_op = __extension__(v4si){0,0,0,0};
+  return a < const0_op ? const1_op : const0_op;
+}
+
+v8si
+foo4 (v8si a)
+{
+  v8si const1_op = __extension__(v8si){1,1,1,1,1,1,1,1};
+  v8si const0_op = __extension__(v8si){0,0,0,0,0,0,0,0};
+  return a < const0_op ? const1_op : const0_op;
+}
+
+v2di
+foo3 (v2

[PATCH] Remove one_if_conv for latest Intel processors and Generic.

2024-06-13 Thread liuhongt
The tune was added by PR79390 for SciMark2 on Broadwell.
With the latest GCC, SciMark2 compiles to the same binary with and
without -mtune-ctrl=^one_if_conv_insn. And for SPEC2017 there's no
big impact on SKX/CLX/ICX, and small improvements on SPR and later.

gcc/ChangeLog:

* config/i386/x86-tune.def (X86_TUNE_ONE_IF_CONV_INSN): Remove
latest Intel processors and Generic.

Co-authored-by: Lingling Kong 
---
 gcc/config/i386/x86-tune.def | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 0fa1484b48d..66512992b7b 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -346,8 +346,8 @@ DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", 
m_BDVER3 | m_BDVER4)
 /* X86_TUNE_ONE_IF_CONV_INSNS: Restrict a number of cmov insns in
if-converted sequence to one.  */
 DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn",
- m_SILVERMONT | m_INTEL | m_CORE_ALL | m_GOLDMONT | m_GOLDMONT_PLUS
- | m_TREMONT | m_CORE_HYBRID | m_CORE_ATOM | m_ZHAOXIN | m_GENERIC)
+ m_SILVERMONT | m_HASWELL | m_SKYLAKE | m_GOLDMONT | m_GOLDMONT_PLUS
+ | m_TREMONT  | m_ZHAOXIN)
 
 /* X86_TUNE_AVOID_MFENCE: Use lock prefixed instructions instead of mfence.  */
 DEF_TUNE (X86_TUNE_AVOID_MFENCE, "avoid_mfence",
-- 
2.31.1



[PATCH Committed] Fix ICE due to REGNO of a SUBREG.

2024-06-12 Thread liuhongt
Use reg_or_subregno instead.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Committed as an obvious patch.

gcc/ChangeLog:

PR target/115452
* config/i386/i386-features.cc (scalar_chain::convert_op): Use
reg_or_subregno instead of REGNO to avoid ICE.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115452.c: New test.
---
 gcc/config/i386/i386-features.cc | 2 +-
 gcc/testsuite/gcc.target/i386/pr115452.c | 4 
 2 files changed, 5 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115452.c

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index e3e004d5526..607d1991460 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -1054,7 +1054,7 @@ scalar_chain::convert_op (rtx *op, rtx_insn *insn)
 
   if (dump_file)
fprintf (dump_file, "  Preloading operand for insn %d into r%d\n",
-INSN_UID (insn), REGNO (tmp));
+INSN_UID (insn), reg_or_subregno (tmp));
 }
   else if (REG_P (*op))
 *op = gen_rtx_SUBREG (vmode, *op, 0);
diff --git a/gcc/testsuite/gcc.target/i386/pr115452.c 
b/gcc/testsuite/gcc.target/i386/pr115452.c
new file mode 100644
index 000..6c7935feb9f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115452.c
@@ -0,0 +1,4 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2 -msse2 -mstv -mno-bmi -mno-stackrealign -fdump-rtl-stv2" 
} */
+
+#include "pr70322-2.c"
-- 
2.31.1



[PATCH] Adjust ix86_rtx_costs for pternlog_operand_p.

2024-06-12 Thread liuhongt
r15-1100-gec985bc97a0157 improves handling of ternlog instructions;
now GCC can recognize lots of pternlog_operand variants.

The patch adjusts rtx_costs for that, so pass_combine can
reasonably generate more optimal vpternlog instructions.

I.e., for avx512f-vpternlogd-3.c, with the patch, 2 vpternlog
instructions are combined into one.


<   vpternlogd  $168, %zmm1, %zmm0, %zmm2
<   vpternlogd  $0x55, %zmm2, %zmm2, %zmm2
>   vpternlogd  $87, %zmm1, %zmm0, %zmm2
<   vpand   %xmm0, %xmm1, %xmm0
<   vpternlogd  $0x55, %zmm0, %zmm0, %zmm0
>   vpternlogd  $63, %zmm1, %zmm0, %zmm1
>   vmovdqa %xmm1, %xmm0
<   vpternlogd  $188, %zmm2, %zmm0, %zmm1
<   vpternlogd  $0x55, %zmm1, %zmm1, %zmm1
>   vpternlogd  $37, %zmm0, %zmm2, %zmm1
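
As a hypothetical source-level example (type and function invented, not
from the testsuite) of what this enables, a vector nand can now be
costed so that combine keeps it as a single vpternlogd:

  typedef int v16si __attribute__ ((vector_size (64)));

  v16si
  vnand (v16si a, v16si b)
  {
    return ~(a & b);  /* one vpternlogd given reasonable rtx costs */
  }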

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

* config/i386/i386.cc (ix86_rtx_costs): Adjust rtx_cost for
pternlog_operand under AVX512, also adjust VEC_DUPLICATE
accordingly since vec_dup:mem can't be that cheap.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx2-pr98461.c: Scan either notl or
vpternlog.
* gcc.target/i386/avx512f-pr96891-3.c: Also scan for inversed
condition.
* gcc.target/i386/avx512f-vpternlogd-3.c: Adjust vpternlog
number to 673.
* gcc.target/i386/avx512f-vpternlogd-4.c: Ditto.
* gcc.target/i386/avx512f-vpternlogd-5.c: Ditto.
* gcc.target/i386/sse2-v1ti-vne.c: Add -mno-avx512f.
---
 gcc/config/i386/i386.cc   | 39 ++-
 gcc/testsuite/gcc.target/i386/avx2-pr98461.c  |  2 +-
 .../gcc.target/i386/avx512f-pr96891-3.c   |  2 +-
 .../gcc.target/i386/avx512f-vpternlogd-3.c|  2 +-
 .../gcc.target/i386/avx512f-vpternlogd-4.c|  2 +-
 .../gcc.target/i386/avx512f-vpternlogd-5.c|  2 +-
 gcc/testsuite/gcc.target/i386/sse2-v1ti-vne.c |  2 +-
 7 files changed, 44 insertions(+), 7 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 173db213d14..9fb1ae575dd 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21571,6 +21571,31 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
 = speed ? ix86_tune_cost : &ix86_size_cost;
   int src_cost;
 
+  /* Handling different vternlog variants.  */
+  if ((GET_MODE_SIZE (mode) == 64
+   ? (TARGET_AVX512F && TARGET_EVEX512)
+   : (TARGET_AVX512VL
+ || (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)))
+  && GET_MODE_SIZE (mode) >= 16
+  && outer_code_i == SET
+  && ternlog_operand (x, mode))
+{
+  rtx args[3];
+
+  args[0] = NULL_RTX;
+  args[1] = NULL_RTX;
+  args[2] = NULL_RTX;
+  int idx = ix86_ternlog_idx (x, args);
+  gcc_assert (idx >= 0);
+
+  *total = cost->sse_op;
+  for (int i = 0; i != 3; i++)
+   if (args[i])
+ *total += rtx_cost (args[i], GET_MODE (args[i]), UNSPEC, i, speed);
+  return true;
+}
+
+
   switch (code)
 {
 case SET:
@@ -22233,6 +22258,9 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
   else if (XINT (x, 1) == UNSPEC_VTERNLOG)
{
  *total = cost->sse_op;
+ *total += rtx_cost (XVECEXP (x, 0, 0), mode, code, 0, speed);
+ *total += rtx_cost (XVECEXP (x, 0, 1), mode, code, 1, speed);
+ *total += rtx_cost (XVECEXP (x, 0, 2), mode, code, 2, speed);
  return true;
}
   else if (XINT (x, 1) == UNSPEC_PTEST)
@@ -22260,12 +22288,21 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
 
 case VEC_SELECT:
 case VEC_CONCAT:
-case VEC_DUPLICATE:
   /* ??? Assume all of these vector manipulation patterns are
 recognizable.  In which case they all pretty much have the
 same cost.  */
  *total = cost->sse_op;
  return true;
+case VEC_DUPLICATE:
+  *total = rtx_cost (XEXP (x, 0),
+GET_MODE (XEXP (x, 0)),
+VEC_DUPLICATE, 0, speed);
+  /* It's a broadcast instruction, not embedded broadcasting.  */
+  if (outer_code == SET)
+   *total += cost->sse_op;
+
+ return true;
+
 case VEC_MERGE:
   mask = XEXP (x, 2);
   /* This is masked instruction, assume the same cost,
diff --git a/gcc/testsuite/gcc.target/i386/avx2-pr98461.c 
b/gcc/testsuite/gcc.target/i386/avx2-pr98461.c
index 15f49b864da..225f2ab00e5 100644
--- a/gcc/testsuite/gcc.target/i386/avx2-pr98461.c
+++ b/gcc/testsuite/gcc.target/i386/avx2-pr98461.c
@@ -2,7 +2,7 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx2 -masm=att" } */
 /* { dg-final { scan-assembler-times "\tvpmovmskb\t" 6 } } */
-/* { dg-final { scan-assembler-times "\tnotl\t" 6 } } */
+/* { dg-final { scan-assembler-times "\t(?:notl|vpternlog\[dq\])\t" 6 } } */
 /* { dg-final { scan-assembler-not "\tvpcmpeq" } } */
 /* { dg-final { scan-assembler-not "\

[PATCH V2] Fix ICE in rtl check due to CONST_WIDE_INT in CONST_VECTOR_DUPLICATE_P

2024-06-11 Thread liuhongt
>
> I think if you only handle CONST_INT_P, you should check just for that, and
> in both places where you check for CONST_VECTOR_DUPLICATE_P (there is one
> spot 2 lines above this).
> So add
> && CONST_INT_P (XVECEXP (XEXP (op0, 1), 0, 0))
> and
> && CONST_INT_P (XVECEXP (op1, 0, 0))
> tests right below those && CONST_VECTOR_DUPLICATE_P (something) tests.
Changed.

The patch adds extra checks to make sure the components of the
CONST_VECTOR are CONST_INT_P.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/115384
* simplify-rtx.cc (simplify_context::simplify_binary_operation_1):
Only do the simplification of (AND (ASHIFTRT A imm) mask)
to (LSHIFTRT A imm) when the component of const_vector is
CONST_INT_P.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115384.c: New test.
---
 gcc/simplify-rtx.cc  |  6 --
 gcc/testsuite/gcc.target/i386/pr115384.c | 12 
 2 files changed, 16 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115384.c

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 9bc3ef9ad9f..3ee95f74d3d 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -4072,9 +4072,11 @@ simplify_context::simplify_binary_operation_1 (rtx_code 
code,
   if (VECTOR_MODE_P (mode) && GET_CODE (op0) == ASHIFTRT
  && (CONST_INT_P (XEXP (op0, 1))
  || (GET_CODE (XEXP (op0, 1)) == CONST_VECTOR
- && CONST_VECTOR_DUPLICATE_P (XEXP (op0, 1
+ && CONST_VECTOR_DUPLICATE_P (XEXP (op0, 1))
+ && CONST_INT_P (XVECEXP (XEXP (op0, 1), 0, 0
  && GET_CODE (op1) == CONST_VECTOR
- && CONST_VECTOR_DUPLICATE_P (op1))
+ && CONST_VECTOR_DUPLICATE_P (op1)
+ && CONST_INT_P (XVECEXP (op1, 0, 0)))
{
  unsigned HOST_WIDE_INT shift_count
= (CONST_INT_P (XEXP (op0, 1))
diff --git a/gcc/testsuite/gcc.target/i386/pr115384.c 
b/gcc/testsuite/gcc.target/i386/pr115384.c
new file mode 100644
index 000..3ba7a0b8115
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115384.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O" } */
+
+typedef __attribute__((__vector_size__(sizeof(__int128 __int128 W;
+
+W w;
+
+void
+foo()
+{
+  w = w >> 4 & 18446744073709551600llu;
+}
-- 
2.31.1



[PATCH] Fix ICE in rtl check due to CONST_WIDE_INT in CONST_VECTOR_DUPLICATE_P

2024-06-10 Thread liuhongt
In theory, const_wide_int could also be handled with an extra check for
each component of the HOST_WIDE_INT array, and the check is needed for
both the shift and bit_and operands.
I assume the optimization opportunity is rare, so the patch just adds
an extra check to make sure GET_MODE_INNER (mode) fits into a
HOST_WIDE_INT.

gcc/ChangeLog:

PR target/115384
* simplify-rtx.cc (simplify_context::simplify_binary_operation_1):
Only do the simplification of (AND (ASHIFTRT A imm) mask)
to (LSHIFTRT A imm) when inner mode fits HOST_WIDE_INT.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115384.c: New test.
---
 gcc/simplify-rtx.cc  |  4 +++-
 gcc/testsuite/gcc.target/i386/pr115384.c | 12 
 2 files changed, 15 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115384.c

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 9bc3ef9ad9f..4992bee7506 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -4074,7 +4074,9 @@ simplify_context::simplify_binary_operation_1 (rtx_code 
code,
  || (GET_CODE (XEXP (op0, 1)) == CONST_VECTOR
  && CONST_VECTOR_DUPLICATE_P (XEXP (op0, 1
  && GET_CODE (op1) == CONST_VECTOR
- && CONST_VECTOR_DUPLICATE_P (op1))
+ && CONST_VECTOR_DUPLICATE_P (op1)
+ && (GET_MODE_PRECISION (GET_MODE_INNER (mode))
+ <= HOST_BITS_PER_WIDE_INT))
{
  unsigned HOST_WIDE_INT shift_count
= (CONST_INT_P (XEXP (op0, 1))
diff --git a/gcc/testsuite/gcc.target/i386/pr115384.c 
b/gcc/testsuite/gcc.target/i386/pr115384.c
new file mode 100644
index 000..3ba7a0b8115
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115384.c
@@ -0,0 +1,12 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O" } */
+
+typedef __attribute__((__vector_size__(sizeof(__int128 __int128 W;
+
+W w;
+
+void
+foo()
+{
+  w = w >> 4 & 18446744073709551600llu;
+}
-- 
2.31.1



[PATCH committed] Add additional option --param max-completely-peeled-insns=200 for power64*-*-*

2024-06-06 Thread liuhongt
gcc/testsuite/ChangeLog:

* gcc.dg/vect/pr112325.c: Add additional option --param
max-completely-peeled-insns=200 for powerpc64*-*-*.
---
 gcc/testsuite/gcc.dg/vect/pr112325.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gcc/testsuite/gcc.dg/vect/pr112325.c 
b/gcc/testsuite/gcc.dg/vect/pr112325.c
index dea6cca3b86..143903beab2 100644
--- a/gcc/testsuite/gcc.dg/vect/pr112325.c
+++ b/gcc/testsuite/gcc.dg/vect/pr112325.c
@@ -3,6 +3,7 @@
 /* { dg-require-effective-target vect_int } */
 /* { dg-require-effective-target vect_shift } */
 /* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */
+/* { dg-additional-options "--param max-completely-peeled-insns=200" { target powerpc64*-*-* } } */
 
 typedef unsigned short ggml_fp16_t;
 static float table_f32_f16[1 << 16];
-- 
2.31.1



[PATCH Committed] Refine testcase for power10.

2024-06-05 Thread liuhongt
For power10, there are 3 extra REG_EQUIV notes containing (fix:SI. To
avoid the failure, check that (fix:SI comes from the pattern, not from
a NOTE.

gcc/testsuite/ChangeLog:

PR target/115365
* gcc.dg/pr100927.c: Don't scan fix:SI from the note.
---
 gcc/testsuite/gcc.dg/pr100927.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/pr100927.c b/gcc/testsuite/gcc.dg/pr100927.c
index ea0e627befa..8a7d69c3831 100644
--- a/gcc/testsuite/gcc.dg/pr100927.c
+++ b/gcc/testsuite/gcc.dg/pr100927.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -ftrapping-math -fdump-tree-optimized -fdump-rtl-final" } */
 /* { dg-final { scan-tree-dump-times {(?n)= \(int\)} 3 "optimized" } }  */
-/* { dg-final { scan-rtl-dump-times {(?n)\(fix:SI} 3 "final" } }  */
+/* { dg-final { scan-rtl-dump-times {(?n)^[ \t]*\(fix:SI} 3 "final" } }  */
 
 int
 foo_ofr ()
-- 
2.31.1



[V2 PATCH] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode.

2024-06-04 Thread liuhongt
> Can you add a testcase for this?  I don't mind if it's x86 specific and
> does a bit of asm scanning.
>
> Also note that the context for this patch has changed, so it won't
> automatically apply.  So be extra careful when updating so that it goes
> into the right place (all the more reason to have a testcase validating
> that the optimization works correctly).
>
>
> I think the patch itself is fine.  So further review is just for the
> testcase and should be easy.
Rebased and added a testcase.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?


When the mask is (1 << (prec - imm)) - 1, which is used to clear the
upper bits of A, the AND can be simplified to LSHIFTRT.

I.e., simplify
(and:v8hi
  (ashiftrt:v8hi A 8)
  (const_vector 0xff x8))
to
(lshiftrt:v8hi A 8)
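
A scalar C analogue of the identity, sketch only, for a 16-bit element
and a shift count of 8:

  unsigned short
  srl_via_mask (short a)
  {
    /* (a >> 8) & 0xff masks away the sign-extended upper bits, which
       is exactly a logical right shift of the 16-bit element:
       equal to (unsigned short) a >> 8.  */
    return (unsigned short) ((a >> 8) & 0xff);
  }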

gcc/ChangeLog:

PR target/114428
* simplify-rtx.cc
(simplify_context::simplify_binary_operation_1):
Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for
specific mask.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114428-1.c: New test.
---
 gcc/simplify-rtx.cc| 25 ++
 gcc/testsuite/gcc.target/i386/pr114428-1.c | 39 ++
 2 files changed, 64 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr114428-1.c

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 5caf1dfd957..05d410898b3 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -4050,6 +4050,31 @@ simplify_context::simplify_binary_operation_1 (rtx_code 
code,
return tem;
}
 
+  /* (and:v4si
+  (ashiftrt:v4si A 16)
+	  (const_vector: 0xffff x4))
+     is just (lshiftrt:v4si A 16).  */
+  if (VECTOR_MODE_P (mode) && GET_CODE (op0) == ASHIFTRT
+ && (CONST_INT_P (XEXP (op0, 1))
+ || (GET_CODE (XEXP (op0, 1)) == CONST_VECTOR
+ && CONST_VECTOR_DUPLICATE_P (XEXP (op0, 1
+ && GET_CODE (op1) == CONST_VECTOR
+ && CONST_VECTOR_DUPLICATE_P (op1))
+   {
+ unsigned HOST_WIDE_INT shift_count
+   = (CONST_INT_P (XEXP (op0, 1))
+  ? UINTVAL (XEXP (op0, 1))
+  : UINTVAL (XVECEXP (XEXP (op0, 1), 0, 0)));
+ unsigned HOST_WIDE_INT inner_prec
+   = GET_MODE_PRECISION (GET_MODE_INNER (mode));
+
+ /* Avoid UD shift count.  */
+ if (shift_count < inner_prec
+ && (UINTVAL (XVECEXP (op1, 0, 0))
+ == (HOST_WIDE_INT_1U << (inner_prec - shift_count)) - 1))
+	   return simplify_gen_binary (LSHIFTRT, mode, XEXP (op0, 0), XEXP (op0, 1));
+   }
+
   tem = simplify_byte_swapping_operation (code, mode, op0, op1);
   if (tem)
return tem;
diff --git a/gcc/testsuite/gcc.target/i386/pr114428-1.c 
b/gcc/testsuite/gcc.target/i386/pr114428-1.c
new file mode 100644
index 000..927476f2269
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114428-1.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-final { scan-assembler-times "psrlw" 1 } } */
+/* { dg-final { scan-assembler-times "psrld" 1 } } */
+/* { dg-final { scan-assembler-times "psrlq" 1 { target { ! ia32 } } } } */
+
+
+#define SHIFTC 12
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef long long v2di __attribute__((vector_size(16)));
+
+v8hi
+foo1 (v8hi a)
+{
+  return
+(a >> (16 - SHIFTC)) & (__extension__(v8hi){(1<> (32 - SHIFTC)) & (__extension__(v4si){(1<> (long long)(64 - SHIFTC)) & (__extension__(v2di){(1ULL<

[PATCH] [x86] Adjust testcase for -march=cascadelake

2024-06-03 Thread liuhongt
Committed as an obvious patch.

gcc/testsuite/ChangeLog:

PR target/115299
* gcc.target/i386/pr86722.c: Also scan for blendvpd.
---
 gcc/testsuite/gcc.target/i386/pr86722.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr86722.c 
b/gcc/testsuite/gcc.target/i386/pr86722.c
index e266a1e56c2..95ddbd8ddb9 100644
--- a/gcc/testsuite/gcc.target/i386/pr86722.c
+++ b/gcc/testsuite/gcc.target/i386/pr86722.c
@@ -6,5 +6,5 @@ void f(double*d,double*e){
 *d=(*d<.5)?.7:0;
 }
 
-/* { dg-final { scan-assembler-times {(?n)(?:andnpd|andpd)} 1 } } */
+/* { dg-final { scan-assembler-times {(?n)(?:andnpd|andpd|blendvpd)} 1 } } */
 /* { dg-final { scan-assembler-not "orpd" } } */
-- 
2.31.1



[PATCH] [x86] Add some preference for floating point rtl ifcvt when sse4.1 is not available

2024-06-02 Thread liuhongt
W/o TARGET_SSE4_1, it takes 3 instructions (pand, pandn and por) for
movdfcc/movsfcc, which can possibly fail the cost comparison.
Increasing branch cost could hurt performance for other modes, so
instead specially add some preference for floating point ifcvt.
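
A sketch of the 3-instruction select with SSE2 intrinsics (illustration
only; with SSE4.1 the whole select is a single blendvpd):

  #include <emmintrin.h>

  static inline __m128d
  sel (__m128d mask, __m128d a, __m128d b)
  {
    return _mm_or_pd (_mm_and_pd (mask, a),      /* pand */
                      _mm_andnot_pd (mask, b));  /* pandn, then por */
  }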

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.cc (ix86_noce_conversion_profitable_p): Add
some preference for floating point ifcvt when SSE4.1 is not
available.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr115299.c: New test.
* gcc.target/i386/pr86722.c: Adjust testcase.
---
 gcc/config/i386/i386.cc  | 17 +
 gcc/testsuite/gcc.target/i386/pr115299.c | 10 ++
 gcc/testsuite/gcc.target/i386/pr86722.c  |  2 +-
 3 files changed, 28 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115299.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 1a0206ab573..271da127a89 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24879,6 +24879,23 @@ ix86_noce_conversion_profitable_p (rtx_insn *seq, 
struct noce_if_info *if_info)
return false;
}
 }
+
+  /* W/o TARGET_SSE4_1, it takes 3 instructions (pand, pandn and por)
+ for movdfcc/movsfcc, and could possibly fail cost comparison.
+ Increasing branch cost would hurt performance for other modes, so
+ specially add some preference for floating point ifcvt.  */
+  if (!TARGET_SSE4_1 && if_info->x
+  && GET_MODE_CLASS (GET_MODE (if_info->x)) == MODE_FLOAT
+  && if_info->speed_p)
+{
+  unsigned cost = seq_cost (seq, true);
+
+  if (cost <= if_info->original_cost)
+   return true;
+
+  return cost <= (if_info->max_seq_cost + COSTS_N_INSNS (2));
+}
+
   return default_noce_conversion_profitable_p (seq, if_info);
 }
 
diff --git a/gcc/testsuite/gcc.target/i386/pr115299.c 
b/gcc/testsuite/gcc.target/i386/pr115299.c
new file mode 100644
index 000..53c5899136a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115299.c
@@ -0,0 +1,10 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mno-sse4.1 -msse2" } */
+
+void f(double*d,double*e){
+  for(;d

[committed] [x86] Rename double_u to __double_u to avoid polluting the namespace.

2024-05-30 Thread liuhongt
Committed as an obvious patch.

gcc/ChangeLog:

* config/i386/emmintrin.h (__double_u): Rename from double_u.
(_mm_load_sd): Replace double_u with __double_u.
(_mm_store_sd): Ditto.
(_mm_loadh_pd): Ditto.
(_mm_loadl_pd): Ditto.
* config/i386/xmmintrin.h (__float_u): Rename from float_u.
(_mm_load_ss): Ditto.
(_mm_store_ss): Ditto.
---
 gcc/config/i386/emmintrin.h | 10 +-
 gcc/config/i386/xmmintrin.h |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index fa301103daf..356ca218fcb 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -56,7 +56,7 @@ typedef double __m128d __attribute__ ((__vector_size__ (16), 
__may_alias__));
 /* Unaligned version of the same types.  */
 typedef long long __m128i_u __attribute__ ((__vector_size__ (16), 
__may_alias__, __aligned__ (1)));
 typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, 
__aligned__ (1)));
-typedef double double_u __attribute__ ((__may_alias__, __aligned__ (1)));
+typedef double __double_u __attribute__ ((__may_alias__, __aligned__ (1)));
 
 /* Create a selector for use with the SHUFPD instruction.  */
 #define _MM_SHUFFLE2(fp1,fp0) \
@@ -146,7 +146,7 @@ _mm_load1_pd (double const *__P)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_load_sd (double const *__P)
 {
-  return __extension__ (__m128d) { *(double_u *)__P, 0.0 };
+  return __extension__ (__m128d) { *(__double_u *)__P, 0.0 };
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
@@ -181,7 +181,7 @@ _mm_storeu_pd (double *__P, __m128d __A)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_store_sd (double *__P, __m128d __A)
 {
-  *(double_u *)__P = ((__v2df)__A)[0] ;
+  *(__double_u *)__P = ((__v2df)__A)[0] ;
 }
 
 extern __inline double __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
@@ -974,13 +974,13 @@ _mm_unpacklo_pd (__m128d __A, __m128d __B)
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_loadh_pd (__m128d __A, double const *__B)
 {
-  return __extension__ (__m128d) { ((__v2df)__A)[0], *(double_u*)__B };
+  return __extension__ (__m128d) { ((__v2df)__A)[0], *(__double_u*)__B };
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_loadl_pd (__m128d __A, double const *__B)
 {
-  return __extension__ (__m128d) { *(double_u*)__B, ((__v2df)__A)[1] };
+  return __extension__ (__m128d) { *(__double_u*)__B, ((__v2df)__A)[1] };
 }
 
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h
index 87515ecb218..c90fc71331a 100644
--- a/gcc/config/i386/xmmintrin.h
+++ b/gcc/config/i386/xmmintrin.h
@@ -72,7 +72,7 @@ typedef float __m128 __attribute__ ((__vector_size__ (16), 
__may_alias__));
 
 /* Unaligned version of the same type.  */
 typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, 
__aligned__ (1)));
-typedef float float_u __attribute__ ((__may_alias__, __aligned__ (1)));
+typedef float __float_u __attribute__ ((__may_alias__, __aligned__ (1)));
 
 /* Internal data types for implementing the intrinsics.  */
 typedef float __v4sf __attribute__ ((__vector_size__ (16)));
@@ -910,7 +910,7 @@ _mm_set_ps1 (float __F)
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_load_ss (float const *__P)
 {
-  return __extension__ (__m128) (__v4sf){ *(float_u *)__P, 0.0f, 0.0f, 0.0f };
+  return __extension__ (__m128) (__v4sf){ *(__float_u *)__P, 0.0f, 0.0f, 0.0f 
};
 }
 
 /* Create a vector with all four elements equal to *P.  */
@@ -966,7 +966,7 @@ _mm_setr_ps (float __Z, float __Y, float __X, float __W)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_store_ss (float *__P, __m128 __A)
 {
-  *(float_u *)__P = ((__v4sf)__A)[0];
+  *(__float_u *)__P = ((__v4sf)__A)[0];
 }
 
 extern __inline float __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
-- 
2.31.1



[PATCH] [x86] Support vcond_mask_qiqi and friends.

2024-05-28 Thread liuhongt
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready to push to trunk.

gcc/ChangeLog:

* config/i386/sse.md (vcond_mask_): New expander.

gcc/testsuite/ChangeLog:
* gcc.target/i386/pr114125.c: New test.
---
 gcc/config/i386/sse.md   | 20 
 gcc/testsuite/gcc.target/i386/pr114125.c | 10 ++
 2 files changed, 30 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr114125.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 0f4fbcb2c5d..7cd912eeeb1 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -4807,6 +4807,26 @@ (define_expand "vcond_mask_"
   DONE;
 })
 
+(define_expand "vcond_mask_"
+  [(match_operand:SWI1248_AVX512BW 0 "register_operand")
+   (match_operand:SWI1248_AVX512BW 1 "register_operand")
+   (match_operand:SWI1248_AVX512BW 2 "register_operand")
+   (match_operand:SWI1248_AVX512BW 3 "register_operand")]
+  "TARGET_AVX512F"
+{
+  /* (operand[1] & operand[3]) | (operand[2] & ~operand[3])  */
+  rtx op1 = gen_reg_rtx (mode);
+  rtx op2 = gen_reg_rtx (mode);
+  rtx op3 = gen_reg_rtx (mode);
+
+  emit_insn (gen_and3 (op1, operands[1], operands[3]));
+  emit_insn (gen_one_cmpl2 (op3, operands[3]));
+  emit_insn (gen_and3 (op2, operands[2], op3));
+  emit_insn (gen_ior3 (operands[0], op1, op2));
+
+  DONE;
+})
+
 ;
 ;;
 ;; Parallel floating point logical operations
diff --git a/gcc/testsuite/gcc.target/i386/pr114125.c 
b/gcc/testsuite/gcc.target/i386/pr114125.c
new file mode 100644
index 000..e63fbffe965
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114125.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=x86-64-v4 -fdump-tree-forwprop3-raw " } */
+
+typedef long vec __attribute__((vector_size(16)));
+vec f(vec x){
+  vec y = x < 10;
+  return y & (y == 0);
+}
+
+/* { dg-final { scan-tree-dump-not "_expr" "forwprop3" } } */
-- 
2.31.1



[PATCH V2] Reduce cost of MEM (A + imm).

2024-05-28 Thread liuhongt
> IMO, there is no need for CONST_INT_P condition, we should also allow
> symbol_ref, label_ref and const (all allowed by
> x86_64_immediate_operand predicate), these all decay to an immediate
> value.

Changed.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

For MEM, rtx_cost iterates over each subrtx and adds up the costs,
so for MEM (reg) and MEM (reg + 4), the former costs 5 and
the latter costs 9, which is not accurate for x86. Ideally
address_cost should be used, but it reduces the cost too much.
So the current solution is to make a constant disp as cheap as
possible, as illustrated below.
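
Roughly, in the units used by the claim above (the post-patch number
follows from the ChangeLog rule "cost of MEM (A)" + 1):

  MEM (reg)      : 5
  MEM (reg + 4)  : 9 before the patch (subrtx costs summed up)
                   6 after the patch  (5 + 1)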

gcc/ChangeLog:

PR target/67325
* config/i386/i386.cc (ix86_rtx_costs): Reduce cost of MEM (A
+ imm) to "cost of MEM (A)" + 1.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr67325.c: New test.
---
 gcc/config/i386/i386.cc | 18 +-
 gcc/testsuite/gcc.target/i386/pr67325.c |  7 +++
 2 files changed, 24 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr67325.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 3e2a3a194f1..85d87b9f778 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22194,7 +22194,23 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
   /* An insn that accesses memory is slightly more expensive
  than one that does not.  */
   if (speed)
-*total += 1;
+   {
+ *total += 1;
+ rtx addr = XEXP (x, 0);
+	 /* For MEM, rtx_cost iterates each subrtx, and adds up the costs,
+	    so for MEM (reg) and MEM (reg + 4), the former costs 5 and
+	    the latter costs 9, which is not accurate for x86.  Ideally
+	    address_cost should be used, but it reduces the cost too much.
+	    So the current solution is to make a constant disp as cheap as possible.  */
+ if (GET_CODE (addr) == PLUS
+ && x86_64_immediate_operand (XEXP (addr, 1), Pmode))
+   {
+ *total += 1;
+ *total += rtx_cost (XEXP (addr, 0), Pmode, PLUS, 0, speed);
+ return true;
+   }
+   }
+
   return false;
 
 case ZERO_EXTRACT:
diff --git a/gcc/testsuite/gcc.target/i386/pr67325.c 
b/gcc/testsuite/gcc.target/i386/pr67325.c
new file mode 100644
index 000..c3c1e4c5b4d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr67325.c
@@ -0,0 +1,7 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-not "(?:sar|shr)" } } */
+
+int f(long*l){
+  return *l>>32;
+}
-- 
2.31.1



[PATCH][committed] [avx512] Fix predicate mismatch between vfcmaddcph's define_insn and define_expand.

2024-05-27 Thread liuhongt
When I applied Roger's patch [1], there was an ICE due to it.
The patch fixes the latent bug.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2024-May/651365.html

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Pushed to trunk.

gcc/ChangeLog:

* config/i386/sse.md
(___mask): Align
operands' predicate with corresponding expander.
(__):
Ditto.
---
 gcc/config/i386/sse.md | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index b59c988fc31..0f4fbcb2c5d 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -6867,9 +6867,9 @@ (define_insn 
"___mask"
   [(set (match_operand:VHF_AVX512VL 0 "register_operand" "=&v")
(vec_merge:VHF_AVX512VL
  (unspec:VHF_AVX512VL
-   [(match_operand:VHF_AVX512VL 1 "nonimmediate_operand" "v")
-(match_operand:VHF_AVX512VL 2 "nonimmediate_operand" 
"")
-(match_operand:VHF_AVX512VL 3 "register_operand" "0")]
+   [(match_operand:VHF_AVX512VL 1 "" 
"v")
+(match_operand:VHF_AVX512VL 2 "" 
"")
+(match_operand:VHF_AVX512VL 3 "" "0")]
 UNSPEC_COMPLEX_F_C_MA)
  (match_dup 1)
  (unspec:
@@ -6892,8 +6892,8 @@ (define_expand "cmul3"
 (define_insn "__"
   [(set (match_operand:VHF_AVX512VL 0 "register_operand" "=&v")
  (unspec:VHF_AVX512VL
-   [(match_operand:VHF_AVX512VL 1 "nonimmediate_operand" "v")
-(match_operand:VHF_AVX512VL 2 "nonimmediate_operand" 
"")]
+   [(match_operand:VHF_AVX512VL 1 "" 
"v")
+(match_operand:VHF_AVX512VL 2 "" 
"")]
 UNSPEC_COMPLEX_F_C_MUL))]
   "TARGET_AVX512FP16 && "
 {
-- 
2.31.1



[PATCH] Reduce cost of MEM (A + imm).

2024-05-27 Thread liuhongt
For MEM, rtx_cost iterates over each subrtx and adds up the costs,
so for MEM (reg) and MEM (reg + 4), the former costs 5 and
the latter costs 9, which is not accurate for x86. Ideally
address_cost should be used, but it reduces the cost too much.
So the current solution is to make a constant disp as cheap as possible.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/67325
* config/i386/i386.cc (ix86_rtx_costs): Reduce cost of MEM (A
+ imm) to "cost of MEM (A)" + 1.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr67325.c: New test.
---
 gcc/config/i386/i386.cc | 19 ++-
 gcc/testsuite/gcc.target/i386/pr67325.c |  7 +++
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr67325.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 3e2a3a194f1..3936223bd20 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22194,7 +22194,24 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
   /* An insn that accesses memory is slightly more expensive
  than one that does not.  */
   if (speed)
-*total += 1;
+   {
+ *total += 1;
+ rtx addr = XEXP (x, 0);
+	 /* For MEM, rtx_cost iterates each subrtx, and adds up the costs,
+	    so for MEM (reg) and MEM (reg + 4), the former costs 5 and
+	    the latter costs 9, which is not accurate for x86.  Ideally
+	    address_cost should be used, but it reduces the cost too much.
+	    So the current solution is to make a constant disp as cheap as possible.  */
+ if (GET_CODE (addr) == PLUS
+ && CONST_INT_P (XEXP (addr, 1))
+ && x86_64_immediate_operand (XEXP (addr, 1), Pmode))
+   {
+ *total += 1;
+ *total += rtx_cost (XEXP (addr, 0), Pmode, PLUS, 0, speed);
+ return true;
+   }
+   }
+
   return false;
 
 case ZERO_EXTRACT:
diff --git a/gcc/testsuite/gcc.target/i386/pr67325.c 
b/gcc/testsuite/gcc.target/i386/pr67325.c
new file mode 100644
index 000..c3c1e4c5b4d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr67325.c
@@ -0,0 +1,7 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-not "(?:sar|shr)" } } */
+
+int f(long*l){
+  return *l>>32;
+}
-- 
2.31.1



[PATCH] Don't simplify NAN/INF or out-of-range constant for FIX/UNSIGNED_FIX.

2024-05-26 Thread liuhongt
Update in V2:
Guard constant folding for overflow value in
fold_convert_const_int_from_real with flag_trapping_math.
Add -fno-trapping-math to related testcases which warn for overflow
in conversion from floating point to integer.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

According to the IEEE standard, for conversions from floating point to
integer: when a NaN or infinite operand cannot be represented in the
destination format and this cannot otherwise be indicated, the invalid
operation exception shall be signaled; when a numeric operand would
convert to an integer outside the range of the destination format, the
invalid operation exception shall be signaled if this situation cannot
otherwise be indicated.

The patch prevents simplification of the conversion from floating point
to integer for NaN/INF/out-of-range constants when flag_trapping_math.
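
A minimal sketch of the behavior being preserved (my example, not the
committed testcase): under -ftrapping-math the conversion below must
stay as a runtime cvttss2si so that FE_INVALID is raised.

#include <fenv.h>
#include <math.h>
#include <stdio.h>

int
main (void)
{
  float f = HUGE_VALF;  /* +inf cannot be represented as int.  */
  feclearexcept (FE_ALL_EXCEPT);
  int i = (int) f;      /* Must not be folded at compile time.  */
  /* On x86 the hardware result is INT_MIN and FE_INVALID is set.  */
  printf ("i=%d invalid=%d\n", i, fetestexcept (FE_INVALID) != 0);
  return 0;
}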

gcc/ChangeLog:

PR rtl-optimization/100927
PR rtl-optimization/115161
PR rtl-optimization/115115
* simplify-rtx.cc (simplify_const_unary_operation): Prevent
simplication of FIX/UNSIGNED_FIX for NAN/INF/out-of-range
constant when flag_trapping_math.
* fold-const.cc (fold_convert_const_int_from_real): Don't fold
for overflow values when flag_trapping_math.

gcc/testsuite/ChangeLog:

* gcc.dg/pr100927.c: New test.
* c-c++-common/Wconversion-1.c: Add -fno-trapping-math.
* c-c++-common/dfp/convert-int-saturate.c: Ditto.
* g++.dg/ubsan/pr63956.C: Ditto.
* g++.dg/warn/Wconversion-real-integer.C: Ditto.
* gcc.c-torture/execute/20031003-1.c: Ditto.
* gcc.dg/Wconversion-complex-c99.c: Ditto.
* gcc.dg/Wconversion-real-integer.c: Ditto.
* gcc.dg/c90-const-expr-11.c: Ditto.
* gcc.dg/overflow-warn-8.c: Ditto.
---
 gcc/fold-const.cc | 13 +++-
 gcc/simplify-rtx.cc   | 23 +++---
 gcc/testsuite/c-c++-common/Wconversion-1.c|  2 +-
 .../c-c++-common/dfp/convert-int-saturate.c   |  1 +
 gcc/testsuite/g++.dg/ubsan/pr63956.C  |  7 -
 .../g++.dg/warn/Wconversion-real-integer.C|  2 +-
 .../gcc.c-torture/execute/20031003-1.c|  2 ++
 .../gcc.dg/Wconversion-complex-c99.c  |  2 +-
 .../gcc.dg/Wconversion-real-integer.c |  2 +-
 gcc/testsuite/gcc.dg/c90-const-expr-11.c  |  2 +-
 gcc/testsuite/gcc.dg/overflow-warn-8.c|  1 +
 gcc/testsuite/gcc.dg/pr100927.c   | 31 +++
 12 files changed, 77 insertions(+), 11 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/pr100927.c

diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index 7b268964acc..0ba01984630 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -2246,7 +2246,18 @@ fold_convert_const_int_from_real (enum tree_code code, 
tree type, const_tree arg
   if (! overflow)
 val = real_to_integer (&r, &overflow, TYPE_PRECISION (type));
 
-  t = force_fit_type (type, val, -1, overflow | TREE_OVERFLOW (arg1));
+  /* According to IEEE standard, for conversions from floating point to
+ integer. When a NaN or infinite operand cannot be represented in the
+ destination format and this cannot otherwise be indicated, the invalid
+ operation exception shall be signaled. When a numeric operand would
+ convert to an integer outside the range of the destination format, the
+ invalid operation exception shall be signaled if this situation cannot
+ otherwise be indicated.  */
+  if (!flag_trapping_math || !overflow)
+t = force_fit_type (type, val, -1, overflow | TREE_OVERFLOW (arg1));
+  else
+t = NULL_TREE;
+
   return t;
 }
 
diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 53f54d1d392..b7a770dad60 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -2256,14 +2256,25 @@ simplify_const_unary_operation (enum rtx_code code, 
machine_mode mode,
   switch (code)
{
case FIX:
+ /* According to IEEE standard, for conversions from floating point to
+integer. When a NaN or infinite operand cannot be represented in
+the destination format and this cannot otherwise be indicated, the
+invalid operation exception shall be signaled. When a numeric
+operand would convert to an integer outside the range of the
+destination format, the invalid operation exception shall be
+signaled if this situation cannot otherwise be indicated.  */
  if (REAL_VALUE_ISNAN (*x))
-   return const0_rtx;
+   return flag_trapping_math ? NULL_RTX : const0_rtx;
+
+ if (REAL_VALUE_ISINF (*x) && flag_trapping_math)
+   return NULL_RTX;
 
  /* Test against the signed upper bound.  */
  wmax = wi::max_value (width, SIGNED);
  real_from_integer (&t, VOIDmode, wmax, SIGNED);
  if (real_less (&t, x))
-   return immed_wide_int_const (wmax, mode);
+   r

[PATCH] Fix typo in the testcase.

2024-05-24 Thread liuhongt
Committed as an obvious patch.

gcc/testsuite/ChangeLog:

PR target/114148
* gcc.target/i386/pr106010-7b.c: Refine testcase.
---
 gcc/testsuite/gcc.target/i386/pr106010-7b.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr106010-7b.c 
b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
index 26482cc10f5..917e56e45f7 100644
--- a/gcc/testsuite/gcc.target/i386/pr106010-7b.c
+++ b/gcc/testsuite/gcc.target/i386/pr106010-7b.c
@@ -34,11 +34,11 @@ avx_test (void)
 p_init[i] = i % 2 + 3;
 
   memcpy (pd_src, p_init, 2 * N * sizeof (double));
-  memcpy (ps_dst, p_init, 2 * N * sizeof (float));
-  memcpy (epi64_dst, p_init, 2 * N * sizeof (long long));
-  memcpy (epi32_dst, p_init, 2 * N * sizeof (int));
-  memcpy (epi16_dst, p_init, 2 * N * sizeof (short));
-  memcpy (epi8_dst, p_init, 2 * N * sizeof (char));
+  memcpy (ps_src, p_init, 2 * N * sizeof (float));
+  memcpy (epi64_src, p_init, 2 * N * sizeof (long long));
+  memcpy (epi32_src, p_init, 2 * N * sizeof (int));
+  memcpy (epi16_src, p_init, 2 * N * sizeof (short));
+  memcpy (epi8_src, p_init, 2 * N * sizeof (char));
 
   foo_pd (pd_dst, pd_src[0]);
   foo_ps (ps_dst, ps_src[0]);
-- 
2.31.1



[V3 PATCH] Don't reduce estimated unrolled size for innermost loop.

2024-05-24 Thread liuhongt
Update in V3:
> Since this was about vectorization can you instead add a testcase to
> gcc.dg/vect/ and check for
> vectorization to happen?
Move to vect/pr112325.c.
>
> I believe the if (unr_insn <= 0) check can go as well.
Removed.

> as said, you want to do
>
>   curolli = false;
>
> after the above since we are iterating and for a subsequent unrolling
> of an outer loop
> of an unrolled inner loop we _do_ want to apply the 2/3 reduction
> since there's likely
> inter-loop redundancies exposed (as happens in SPEC calculix for example).
>
> Not sure if that changes any of the testsuite outcome - it possibly avoids the
> gcc.dg/vect/pr69783.c FAIL?
Yes, it avoids that, cunrolli is set to false when CHANGED is true.

> Not sure about the arm fallout.
It's the same reason as pr69783.c: there's subsequent unrolling of an outer loop
of an unrolled inner loop, and since the inner loop is completely unrolled,
outer_loop->inner is false and it escapes the check.
The change also fixes 2 arm fallouts.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

For the innermost loop, complete loop unrolling will most likely
not be able to reduce the body size to 2/3. The current 2/3 reduction
makes some of the larger loops completely unrolled during
cunrolli, which then results in them not being able to be
vectorized. It also increases register pressure.

The patch moves the 2/3 reduction from estimated_unrolled_size to
tree_unroll_loops_completely.
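
A hedged numeric sketch of why the placement matters (numbers invented
for illustration, assuming --param max-completely-peeled-insns=200):

  inner loop: 9 insns per iteration, 32 iterations
  estimated unrolled size           : ~288 insns
  with the old 2/3 reduction        : ~192 <= 200, cunrolli unrolls it
  without the reduction at cunrolli : 288 > 200, the loop survives and
                                      can still be vectorized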

gcc/ChangeLog:

PR tree-optimization/112325
* tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Move the
2 / 3 loop body size reduction to ..
(try_unroll_loop_completely): .. here, add it for the check of
body size shrink, and the check of comparison against
param_max_completely_peeled_insns when
(!cunrolli || loop->inner).
(canonicalize_loop_induction_variables): Add new parameter
cunrolli and pass down.
(tree_unroll_loops_completely_1): Ditto.
(canonicalize_induction_variables): Pass cunrolli as false to
canonicalize_loop_induction_variables.
(tree_unroll_loops_completely): Set cunrolli to true at
beginning and set it to false after CHANGED is true.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/pr112325.c: New test.
---
 gcc/testsuite/gcc.dg/vect/pr112325.c | 59 
 gcc/tree-ssa-loop-ivcanon.cc | 46 +++---
 2 files changed, 83 insertions(+), 22 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr112325.c

diff --git a/gcc/testsuite/gcc.dg/vect/pr112325.c 
b/gcc/testsuite/gcc.dg/vect/pr112325.c
new file mode 100644
index 000..71cf4099253
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr112325.c
@@ -0,0 +1,59 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fdump-tree-vect-details" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */
+
+typedef unsigned short ggml_fp16_t;
+static float table_f32_f16[1 << 16];
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+unsigned short s;
+__builtin_memcpy(&s, &f, sizeof(unsigned short));
+return table_f32_f16[s];
+}
+
+typedef struct {
+ggml_fp16_t d;
+ggml_fp16_t m;
+unsigned char qh[4];
+unsigned char qs[32 / 2];
+} block_q5_1;
+
+typedef struct {
+float d;
+float s;
+char qs[32];
+} block_q8_1;
+
+void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * 
restrict vx, const void * restrict vy) {
+const int qk = 32;
+const int nb = n / qk;
+
+const block_q5_1 * restrict x = vx;
+const block_q8_1 * restrict y = vy;
+
+float sumf = 0.0;
+
+for (int i = 0; i < nb; i++) {
+unsigned qh;
+__builtin_memcpy(&qh, x[i].qh, sizeof(qh));
+
+int sumi = 0;
+
+for (int j = 0; j < qk/2; ++j) {
+const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
+
+const int x0 = (x[i].qs[j] & 0xF) | xh_0;
+const int x1 = (x[i].qs[j] >> 4) | xh_1;
+
+sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
+}
+
+sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + 
ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
+}
+
+*s = sumf;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index bf017137260..216e81ef15f 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -437,11 +437,7 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge 
edge_to_cancel,
It is (NUNROLL + 1) * size of loop body with taking into account
the fact that in last copy everything after exit conditional
is dead and that some instructions will be eliminated after
-   peeling.
-
-   L

[V2 PATCH] Don't reduce estimated unrolled size for innermost loop at cunrolli.

2024-05-21 Thread liuhongt
>> Hard to find a default value satisfying all testcases.
>> some require loop unroll with 7 insns increment, some don't want loop
>> unroll w/ 5 insn increment.
>> The original 2/3 reduction happened to meet all those testcases(or the
>> testcases are constructed based on the old 2/3).
>> Can we define the parameter as the size of the loop, below the size we
>> still do the reduction, so the small loop can be unrolled?

>Yeah, that's also a sensible possibility.  Does it work to have a parameter
>for the unrolled body size?  Thus, amend the existing
>--param max-completely-peeled-insns with a --param
>max-completely-peeled-insns-nogrowth?

Update V2:
It's still hard to find a default value for loop body size. So I move the
2 / 3 reduction from estimated_unrolled_size to try_unroll_loop_completely.
For the check of body size shrink, the 2 / 3 reduction is added, so small
loops can still be unrolled.
For the check of the comparison between body size and
param_max_completely_peeled_insns, 2 / 3 is conditionally added for
loop->inner || !cunrolli.
Then the patch avoids gcc testsuite regressions, and also prevents big
inner loops from being completely unrolled at cunrolli.

--

For the innermost loop, complete loop unrolling will most likely
not be able to reduce the body size to 2/3. The current 2/3 reduction
makes some of the larger loops completely unrolled during
cunrolli, which then results in them not being able to be
vectorized. It also increases register pressure. The patch moves
the 2/3 reduction from estimated_unrolled_size to
try_unroll_loop_completely.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR tree-optimization/112325
* tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Move the
2 / 3 loop body size reduction to ..
(try_unroll_loop_completely): .. here, add it for the check of
body size shrink, and the check of comparison against
param_max_completely_peeled_insns when
(!cunrolli || loop->inner).
(canonicalize_loop_induction_variables): Add new parameter
cunrolli and pass down.
(tree_unroll_loops_completely_1): Ditto.
(tree_unroll_loops_completely): Ditto.
(canonicalize_induction_variables): Handle new parameter.
(pass_complete_unrolli::execute): Ditto.
(pass_complete_unroll::execute): Ditto.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/pr112325.c: New test.
* gcc.dg/vect/pr69783.c: Add extra option --param
max-completely-peeled-insns=300.
---
 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c | 57 
 gcc/testsuite/gcc.dg/vect/pr69783.c  |  2 +-
 gcc/tree-ssa-loop-ivcanon.cc | 45 ++-
 3 files changed, 83 insertions(+), 21 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
new file mode 100644
index 000..14208b3e7f8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
@@ -0,0 +1,57 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-cunrolli-details" } */
+
+typedef unsigned short ggml_fp16_t;
+static float table_f32_f16[1 << 16];
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+unsigned short s;
+__builtin_memcpy(&s, &f, sizeof(unsigned short));
+return table_f32_f16[s];
+}
+
+typedef struct {
+ggml_fp16_t d;
+ggml_fp16_t m;
+unsigned char qh[4];
+unsigned char qs[32 / 2];
+} block_q5_1;
+
+typedef struct {
+float d;
+float s;
+char qs[32];
+} block_q8_1;
+
+void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * 
restrict vx, const void * restrict vy) {
+const int qk = 32;
+const int nb = n / qk;
+
+const block_q5_1 * restrict x = vx;
+const block_q8_1 * restrict y = vy;
+
+float sumf = 0.0;
+
+for (int i = 0; i < nb; i++) {
+unsigned qh;
+__builtin_memcpy(&qh, x[i].qh, sizeof(qh));
+
+int sumi = 0;
+
+for (int j = 0; j < qk/2; ++j) {
+const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
+
+const int x0 = (x[i].qs[j] & 0xF) | xh_0;
+const int x1 = (x[i].qs[j] >> 4) | xh_1;
+
+sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
+}
+
+sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + 
ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
+}
+
+*s = sumf;
+}
+
+/* { dg-final { scan-tree-dump {(?n)Not unrolling loop [1-9] \(--param max-completely-peel-times limit reached} "cunrolli"} } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr69783.c 
b/gcc/testsuite/gcc.dg/vect/pr69783.c
index 5df95d0ce4e..a1f75514d72 100644
--- a/gcc/testsuite/gcc.dg/vect/pr69783.c
+++ b/gcc/testsuite/gcc.dg/vect/pr69783.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-require-effective

[PATCH] Don't simplify NAN/INF or out-of-range constant for FIX/UNSIGNED_FIX.

2024-05-21 Thread liuhongt
According to the IEEE standard, for conversions from floating point to
integer: when a NaN or infinite operand cannot be represented in the
destination format and this cannot otherwise be indicated, the invalid
operation exception shall be signaled; when a numeric operand would
convert to an integer outside the range of the destination format, the
invalid operation exception shall be signaled if this situation cannot
otherwise be indicated.

The patch prevents simplification of the conversion from floating point
to integer for NaN/INF/out-of-range constants when flag_trapping_math.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ok for trunk?

gcc/ChangeLog:

PR rtl-optimization/100927
PR rtl-optimization/115161
PR rtl-optimization/115115
* simplify-rtx.cc (simplify_const_unary_operation): Prevent
simplication of FIX/UNSIGNED_FIX for NAN/INF/out-of-range
constant when flag_trapping_math.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr100927.c: New test.
---
 gcc/simplify-rtx.cc  | 23 
 gcc/testsuite/gcc.target/i386/pr100927.c | 27 
 2 files changed, 46 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr100927.c

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 53f54d1d392..b7a770dad60 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -2256,14 +2256,25 @@ simplify_const_unary_operation (enum rtx_code code, 
machine_mode mode,
   switch (code)
{
case FIX:
+ /* According to IEEE standard, for conversions from floating point to
+integer. When a NaN or infinite operand cannot be represented in
+the destination format and this cannot otherwise be indicated, the
+invalid operation exception shall be signaled. When a numeric
+operand would convert to an integer outside the range of the
+destination format, the invalid operation exception shall be
+signaled if this situation cannot otherwise be indicated.  */
  if (REAL_VALUE_ISNAN (*x))
-   return const0_rtx;
+   return flag_trapping_math ? NULL_RTX : const0_rtx;
+
+ if (REAL_VALUE_ISINF (*x) && flag_trapping_math)
+   return NULL_RTX;
 
  /* Test against the signed upper bound.  */
  wmax = wi::max_value (width, SIGNED);
  real_from_integer (&t, VOIDmode, wmax, SIGNED);
  if (real_less (&t, x))
-   return immed_wide_int_const (wmax, mode);
+   return (flag_trapping_math
+   ? NULL_RTX : immed_wide_int_const (wmax, mode));
 
  /* Test against the signed lower bound.  */
  wmin = wi::min_value (width, SIGNED);
@@ -2276,13 +2287,17 @@ simplify_const_unary_operation (enum rtx_code code, 
machine_mode mode,
 
case UNSIGNED_FIX:
  if (REAL_VALUE_ISNAN (*x) || REAL_VALUE_NEGATIVE (*x))
-   return const0_rtx;
+   return flag_trapping_math ? NULL_RTX : const0_rtx;
+
+ if (REAL_VALUE_ISINF (*x) && flag_trapping_math)
+   return NULL_RTX;
 
  /* Test against the unsigned upper bound.  */
  wmax = wi::max_value (width, UNSIGNED);
  real_from_integer (&t, VOIDmode, wmax, UNSIGNED);
  if (real_less (&t, x))
-   return immed_wide_int_const (wmax, mode);
+   return (flag_trapping_math
+   ? NULL_RTX : immed_wide_int_const (wmax, mode));
 
  return immed_wide_int_const (real_to_integer (x, &fail, width),
   mode);
diff --git a/gcc/testsuite/gcc.target/i386/pr100927.c 
b/gcc/testsuite/gcc.target/i386/pr100927.c
new file mode 100644
index 000..b137396c30f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr100927.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-msse2 -O2 -ftrapping-math" } */
+/* { dg-final { scan-assembler-times "cvttps2dq" 3 } }  */
+
+#include <immintrin.h>
+
+__m128i foo_ofr() {
+  const __m128i iv = _mm_set_epi32(0x4f000000, 0x4f000000, 0x4f000000, 0x4f000000);
+  const __m128  fv = _mm_castsi128_ps(iv);
+  const __m128i riv = _mm_cvttps_epi32(fv);
+  return riv;
+}
+
+__m128i foo_nan() {
+  const __m128i iv = _mm_set_epi32(0xff810000, 0xff810000, 0xff810000, 0xff810000);
+  const __m128  fv = _mm_castsi128_ps(iv);
+  const __m128i riv = _mm_cvttps_epi32(fv);
+  return riv;
+}
+
+__m128i foo_inf() {
+  const __m128i iv = _mm_set_epi32(0xff800000, 0xff800000, 0xff800000, 0xff800000);
+  const __m128  fv = _mm_castsi128_ps(iv);
+  const __m128i riv = _mm_cvttps_epi32(fv);
+  return riv;
+}
+
-- 
2.31.1



[PATCH 2/2] [x86] Adjust rtx_cost for MEM to enable more simplication

2024-05-20 Thread liuhongt
For a CONST_VECTOR_DUPLICATE_P in the constant pool, the load is just a
broadcast or one of its variants in ix86_vector_duplicate_simode_const.
Adjust the cost to COSTS_N_INSNS (2) + speed, which should be a little
bit larger than a broadcast.
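
For reference (my arithmetic, based on rtl.h defining COSTS_N_INSNS (N)
as (N) * 4): the new cost is COSTS_N_INSNS (2) + speed, i.e. 8 when
optimizing for size and 9 when optimizing for speed, slightly above a
plain broadcast as intended.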

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:
PR target/114428
* config/i386/i386.cc (ix86_rtx_costs): Adjust cost for
CONST_VECTOR_DUPLICATE_P in constant_pool.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114428.c: New test.
---
 gcc/config/i386/i386-expand.cc   |  2 +-
 gcc/config/i386/i386-protos.h|  1 +
 gcc/config/i386/i386.cc  | 13 +
 gcc/testsuite/gcc.target/i386/pr114428.c | 18 ++
 4 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr114428.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 4e16aedc5c1..d96c365e144 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -588,7 +588,7 @@ ix86_expand_move (machine_mode mode, rtx operands[])
 
 /* OP is a memref of CONST_VECTOR, return scalar constant mem
if CONST_VECTOR is a vec_duplicate, else return NULL.  */
-static rtx
+rtx
 ix86_broadcast_from_constant (machine_mode mode, rtx op)
 {
   int nunits = GET_MODE_NUNITS (mode);
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index dbc861fb1ea..90712769200 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -107,6 +107,7 @@ extern void ix86_expand_clear (rtx);
 extern void ix86_expand_move (machine_mode, rtx[]);
 extern void ix86_expand_vector_move (machine_mode, rtx[]);
 extern void ix86_expand_vector_move_misalign (machine_mode, rtx[]);
+extern rtx ix86_broadcast_from_constant (machine_mode, rtx);
 extern rtx ix86_fixup_binary_operands (enum rtx_code, machine_mode,
   rtx[], bool = false);
 extern void ix86_fixup_binary_operands_no_copy (enum rtx_code, machine_mode,
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index b4838b7939e..fdd9343e47a 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -22197,6 +22197,19 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
   return true;
 
 case MEM:
+  /* CONST_VECTOR_DUPLICATE_P in constant_pool is just broadcast.
+or variants in ix86_vector_duplicate_simode_const.  */
+
+  if (GET_MODE_SIZE (mode) >= 16
+ && VECTOR_MODE_P (mode)
+ && SYMBOL_REF_P (XEXP (x, 0))
+ && CONSTANT_POOL_ADDRESS_P (XEXP (x, 0))
+ && ix86_broadcast_from_constant (mode, x))
+   {
+ *total = COSTS_N_INSNS (2) + speed;
+ return true;
+   }
+
   /* An insn that accesses memory is slightly more expensive
  than one that does not.  */
   if (speed)
diff --git a/gcc/testsuite/gcc.target/i386/pr114428.c 
b/gcc/testsuite/gcc.target/i386/pr114428.c
new file mode 100644
index 000..bbbc5a080f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114428.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -mno-avx512f -O2" } */
+/* { dg-final { scan-assembler-not "vpsra[dw]" } } */
+
+void
+foo2 (char* __restrict a, short* b)
+{
+  for (int i = 0; i != 32; i++)
+a[i] = b[i] >> (short)8;
+}
+
+void
+foo3 (char* __restrict a, short* b)
+{
+  for (int i = 0; i != 16; i++)
+a[i] = b[i] >> (short)8;
+}
+
-- 
2.31.1



[PATCH 1/2] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode.

2024-05-20 Thread liuhongt
When the mask is ((1 << (prec - imm)) - 1), which is used to clear the
upper bits of A, the combination can be simplified to LSHIFTRT.

i.e. simplify
(and:v8hi
  (ashiftrt:v8hi A 8)
  (const_vector 0xff x8))
to
(lshiftrt:v8hi A 8)
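
A scalar sketch of the identity per 16-bit lane (my illustration, not
part of the patch):

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  for (int v = -32768; v <= 32767; v++)
    {
      int16_t a = (int16_t) v;
      /* (and (ashiftrt a 8) 0xff) ...  */
      uint16_t lhs = (uint16_t) ((a >> 8) & 0xff);
      /* ... equals (lshiftrt a 8).  */
      uint16_t rhs = (uint16_t) a >> 8;
      assert (lhs == rhs);
    }
  return 0;
}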

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/114428
* simplify-rtx.cc
(simplify_context::simplify_binary_operation_1):
Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for
specific mask.
---
 gcc/simplify-rtx.cc | 25 +
 1 file changed, 25 insertions(+)

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 53f54d1d392..6c91409200e 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -4021,6 +4021,31 @@ simplify_context::simplify_binary_operation_1 (rtx_code 
code,
return tem;
}
 
+  /* (and:v4si
+  (ashiftrt:v4si A 16)
+  (const_vector: 0x x4))
+is just (lshiftrt:v4si A 16).  */
+  if (VECTOR_MODE_P (mode) && GET_CODE (op0) == ASHIFTRT
+ && (CONST_INT_P (XEXP (op0, 1))
+ || (GET_CODE (XEXP (op0, 1)) == CONST_VECTOR
+ && CONST_VECTOR_DUPLICATE_P (XEXP (op0, 1
+ && GET_CODE (op1) == CONST_VECTOR
+ && CONST_VECTOR_DUPLICATE_P (op1))
+   {
+ unsigned HOST_WIDE_INT shift_count
+   = (CONST_INT_P (XEXP (op0, 1))
+  ? UINTVAL (XEXP (op0, 1))
+  : UINTVAL (XVECEXP (XEXP (op0, 1), 0, 0)));
+ unsigned HOST_WIDE_INT inner_prec
+   = GET_MODE_PRECISION (GET_MODE_INNER (mode));
+
+ /* Avoid UD shift count.  */
+ if (shift_count < inner_prec
+ && (UINTVAL (XVECEXP (op1, 0, 0))
+ == (HOST_WIDE_INT_1U << (inner_prec - shift_count)) - 1))
+   return simplify_gen_binary (LSHIFTRT, mode, XEXP (op0, 0), XEXP 
(op0, 1));
+   }
+
   tem = simplify_byte_swapping_operation (code, mode, op0, op1);
   if (tem)
return tem;
-- 
2.31.1



[PATCH] Use pblendw instead of pand to clear upper 16 bits.

2024-05-16 Thread liuhongt
For vec_pack_truncv8si/v4si w/o AVX512,
(const_vector:v4si (const_int 0xffff) x4) is used as a mask to clear the
upper 16 bits, but vpblendw with a zero vector can also be used, and a
zero vector is cheaper than (const_vector:v4si (const_int 0xffff) x4).
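
An intrinsics sketch of the equivalence (my example; the patch emits
the vec_merge RTL directly):

#include <immintrin.h>

/* Keep the low 16 bits of each dword lane with pand ...  */
__m128i
keep_lo16_pand (__m128i x)
{
  return _mm_and_si128 (x, _mm_set1_epi32 (0xffff));
}

/* ... or blend the odd words with a zero vector (SSE4.1 pblendw).  */
__m128i
keep_lo16_pblendw (__m128i x)
{
  return _mm_blend_epi16 (x, _mm_setzero_si128 (), 0xaa);
}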

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:
PR target/114427
* config/i386/i386-expand.cc (expand_vec_perm_even_odd_pack):
Use pblendw instead of pand to clear upper bits.

gcc/testsuite/ChangeLog:
* gcc.target/i386/pr114427.c: New test.
---
 gcc/config/i386/i386-expand.cc   | 34 +---
 gcc/testsuite/gcc.target/i386/pr114427.c | 18 +
 2 files changed, 48 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr114427.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 4e16aedc5c1..231e9321d81 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22918,6 +22918,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d 
*d)
 {
   rtx op, dop0, dop1, t;
   unsigned i, odd, c, s, nelt = d->nelt;
+  int pblendw_i = 0;
   bool end_perm = false;
   machine_mode half_mode;
   rtx (*gen_and) (rtx, rtx, rtx);
@@ -22939,6 +22940,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d 
*d)
   gen_and = gen_andv2si3;
   gen_pack = gen_mmx_packusdw;
   gen_shift = gen_lshrv2si3;
+  pblendw_i = 0x5;
   break;
 case E_V8HImode:
   /* Required for "pack".  */
@@ -22950,6 +22952,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d 
*d)
   gen_and = gen_andv4si3;
   gen_pack = gen_sse4_1_packusdw;
   gen_shift = gen_lshrv4si3;
+  pblendw_i = 0x55;
   break;
 case E_V8QImode:
   /* No check as all instructions are SSE2.  */
@@ -22978,6 +22981,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d 
*d)
   gen_and = gen_andv8si3;
   gen_pack = gen_avx2_packusdw;
   gen_shift = gen_lshrv8si3;
+  pblendw_i = 0x5555;
   end_perm = true;
   break;
 case E_V32QImode:
@@ -23013,10 +23017,32 @@ expand_vec_perm_even_odd_pack (struct 
expand_vec_perm_d *d)
   dop1 = gen_reg_rtx (half_mode);
   if (odd == 0)
 {
-  t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
-  t = force_reg (half_mode, t);
-  emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
-  emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
+  /* Use pblendw since const_vector 0 should be cheaper than
+const_vector 0xffff.  */
+  if (d->vmode == V4HImode
+ || d->vmode == E_V8HImode
+ || d->vmode == E_V16HImode)
+   {
+ rtx dop0_t = gen_reg_rtx (d->vmode);
+ rtx dop1_t = gen_reg_rtx (d->vmode);
+ t = gen_reg_rtx (d->vmode);
+ emit_move_insn (t, CONST0_RTX (d->vmode));
+
+ emit_move_insn (dop0_t, gen_rtx_VEC_MERGE (d->vmode, d->op0, t,
+GEN_INT (pblendw_i)));
+ emit_move_insn (dop1_t, gen_rtx_VEC_MERGE (d->vmode, d->op1, t,
+GEN_INT (pblendw_i)));
+
+ emit_move_insn (dop0, gen_lowpart (half_mode, dop0_t));
+ emit_move_insn (dop1, gen_lowpart (half_mode, dop1_t));
+   }
+  else
+   {
+ t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
+ t = force_reg (half_mode, t);
+ emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
+ emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
+   }
 }
   else
 {
diff --git a/gcc/testsuite/gcc.target/i386/pr114427.c 
b/gcc/testsuite/gcc.target/i386/pr114427.c
new file mode 100644
index 000..58b66db7fff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114427.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -O2 -mno-avx512f" } */
+/* { dg-final { scan-assembler-not "vpand" } } */
+/* { dg-final { scan-assembler-not "65535" } } */
+
+void
+foo (int* a, short* __restrict b, int* c)
+{
+for (int i = 0; i != 16; i++)
+  b[i] = c[i] + a[i];
+}
+
+void
+foo1 (int* a, short* __restrict b, int* c)
+{
+for (int i = 0; i != 8; i++)
+  b[i] = c[i] + a[i];
+}
-- 
2.31.1



[PATCH] [x86] Set d.one_operand_p to true when TARGET_SSSE3 in ix86_expand_vecop_qihi_partial.

2024-05-15 Thread liuhongt
pshufb is available under TARGET_SSSE3, so
ix86_expand_vec_perm_const_1 must return true when TARGET_SSSE3.
w/o TARGET_SSSE3, if we set one_operand_p to true, ix86_expand_vec_perm_const_1 
could return false.

With the patch under -march=x86-64-v2

v8qi
foo (v8qi a)
{
  return a >> 5;
}

<   pmovsxbw %xmm0, %xmm0
<   psraw    $5, %xmm0
<   pshufb   .LC0(%rip), %xmm0
---
>   movdqa   %xmm0, %xmm1
>   pcmpeqd  %xmm0, %xmm0
>   pmovsxbw %xmm1, %xmm1
>   psrlw    $8, %xmm0
>   psraw    $5, %xmm1
>   pand     %xmm1, %xmm0
>   packuswb %xmm0, %xmm0

Although there's a memory load from the constant pool, it should be
better when inside a loop: the load from the constant pool can be
hoisted out, and then it's 1 instruction vs 4 instructions.

<   pshufb  .LC0(%rip), %xmm0

vs.

>   pcmpeqd  %xmm0, %xmm0
>   psrlw    $8, %xmm0
>   pand     %xmm1, %xmm0
>   packuswb %xmm0, %xmm0
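
For instance, in a loop like the one below (my sketch, not from the
testsuite), the .LC0 mask load is loop-invariant and gets hoisted, so
only the pshufb remains in the body:

typedef char v8qi __attribute__((vector_size (8)));

void
foo_loop (v8qi *a, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = a[i] >> 5;
}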


Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

PR target/114514
* config/i386/i386-expand.cc (ix86_expand_vecop_qihi_partial):
Set d.one_operand_p to true when TARGET_SSSE3.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114514-shufb.c: New test.
---
 gcc/config/i386/i386-expand.cc|  2 +-
 .../gcc.target/i386/pr114514-shufb.c  | 35 +++
 2 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr114514-shufb.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index ab6631f51e3..ae2e9ab4e05 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -24394,7 +24394,7 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx 
dest, rtx op1, rtx op2)
   d.op0 = d.op1 = qres;
   d.vmode = V16QImode;
   d.nelt = 16;
-  d.one_operand_p = false;
+  d.one_operand_p = TARGET_SSSE3;
   d.testing_p = false;
 
   for (i = 0; i < d.nelt; ++i)
diff --git a/gcc/testsuite/gcc.target/i386/pr114514-shufb.c 
b/gcc/testsuite/gcc.target/i386/pr114514-shufb.c
new file mode 100644
index 000..71fdc9d8daf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114514-shufb.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-msse4.1 -O2 -mno-avx512f" } */
+/* { dg-final { scan-assembler-not "packuswb" } }  */
+/* { dg-final { scan-assembler-times "pshufb" 4 { target { ! ia32 } } } }  */
+/* { dg-final { scan-assembler-times "pshufb" 6 { target  ia32 } } }  */
+
+typedef unsigned char v8uqi __attribute__((vector_size(8)));
+typedef  char v8qi __attribute__((vector_size(8)));
+typedef unsigned char v4uqi __attribute__((vector_size(4)));
+typedef  char v4qi __attribute__((vector_size(4)));
+
+v8qi
+foo (v8qi a)
+{
+  return a >> 5;
+}
+
+v8uqi
+foo1 (v8uqi a)
+{
+  return a >> 5;
+}
+
+v4qi
+foo2 (v4qi a)
+{
+  return a >> 5;
+}
+
+v4uqi
+foo3 (v4uqi a)
+{
+  return a >> 5;
+}
+
-- 
2.31.1



[PATCH] [x86] Optimize ashift >> 7 to vpcmpgtb for vector int8.

2024-05-14 Thread liuhongt
Since there is no corresponding instruction, the shift operation for
vector int8 is implemented using the instructions for vector int16,
but for some special shift counts, it can be transformed into vpcmpgtb.
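
A scalar sketch of why a shift count of 7 is special for int8 (my
illustration): the arithmetic shift broadcasts the sign bit, which is
exactly a greater-than comparison against zero.

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  for (int v = -128; v <= 127; v++)
    {
      int8_t x = (int8_t) v;
      int8_t shifted = (int8_t) (x >> 7);      /* 0 or -1.  */
      int8_t cmp = (int8_t) (0 > x ? -1 : 0);  /* vpcmpgtb zero, x.  */
      assert (shifted == cmp);
    }
  return 0;
}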

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

PR target/114514
* config/i386/i386-expand.cc
(ix86_expand_vec_shift_qihi_constant): Optimize ashift >> 7 to
vpcmpgtb.
(ix86_expand_vecop_qihi_partial): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114514-shift.c: New test.
---
 gcc/config/i386/i386-expand.cc| 32 
 .../gcc.target/i386/pr114514-shift.c  | 49 +++
 2 files changed, 81 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr114514-shift.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 1ab22fe7973..ab6631f51e3 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -24182,6 +24182,28 @@ ix86_expand_vec_shift_qihi_constant (enum rtx_code 
code,
 return false;
 
   gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+
+
+  if (shift_amount == 7
+  && code == ASHIFTRT)
+{
+  if (qimode == V16QImode
+ || qimode == V32QImode)
+   {
+ rtx zero = gen_reg_rtx (qimode);
+ emit_move_insn (zero, CONST0_RTX (qimode));
+ emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1));
+   }
+  else
+   {
+ gcc_assert (qimode == V64QImode);
+ rtx kmask = gen_reg_rtx (DImode);
+ emit_insn (gen_avx512bw_cvtb2maskv64qi (kmask, op1));
+ emit_insn (gen_avx512bw_cvtmask2bv64qi (dest, kmask));
+   }
+  return true;
+}
+
   /* Record sign bit.  */
   xor_constant = 1 << (8 - shift_amount - 1);
 
@@ -24292,6 +24314,16 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, 
rtx dest, rtx op1, rtx op2)
   return;
 }
 
+  if (CONST_INT_P (op2)
+  && code == ASHIFTRT
+  && INTVAL (op2) == 7)
+{
+  rtx zero = gen_reg_rtx (qimode);
+  emit_move_insn (zero, CONST0_RTX (qimode));
+  emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1));
+  return;
+}
+
   switch (code)
 {
 case MULT:
diff --git a/gcc/testsuite/gcc.target/i386/pr114514-shift.c 
b/gcc/testsuite/gcc.target/i386/pr114514-shift.c
new file mode 100644
index 000..cf8b32b3b1d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114514-shift.c
@@ -0,0 +1,49 @@
+/* { dg-do compile  } */
+/* { dg-options "-mavx512vl -mavx512bw -O2" } */
+/* { dg-final { scan-assembler-times "vpxor" 4 } } */
+/* { dg-final { scan-assembler-times "vpcmpgtb" 4 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "vpcmpgtb" 5 { target  ia32 } } } */
+/* { dg-final { scan-assembler-times "vpmovb2m" 1 } } */
+/* { dg-final { scan-assembler-times "vpmovm2b" 1 } } */
+
+
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef char v64qi __attribute__((vector_size(64)));
+typedef char v8qi __attribute__((vector_size(8)));
+typedef char v4qi __attribute__((vector_size(4)));
+
+v4qi
+__attribute__((noipa))
+foo1 (v4qi a)
+{
+  return a >> 7;
+}
+
+v8qi
+__attribute__((noipa))
+foo2 (v8qi a)
+{
+  return a >> 7;
+}
+
+v16qi
+__attribute__((noipa))
+foo3 (v16qi a)
+{
+  return a >> 7;
+}
+
+v32qi
+__attribute__((noipa))
+foo4 (v32qi a)
+{
+  return a >> 7;
+}
+
+v64qi
+__attribute__((noipa))
+foo5 (v64qi a)
+{
+  return a >> 7;
+}
-- 
2.31.1



[PATCH] Don't reduce estimated unrolled size for innermost loop.

2024-05-12 Thread liuhongt
As the testcase in the PR shows, O3 cunrolli may prevent vectorization of the
innermost loop and increase register pressure.
The patch removes the 1/3 reduction of unr_insns for the innermost loop for
UL_ALL. ul != UL_ALL is needed since complete unrolling of some small loops
at O2 relies on the reduction.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
No big impact for SPEC2017.
Ok for trunk?

gcc/ChangeLog:

PR tree-optimization/112325
* tree-ssa-loop-ivcanon.cc (estimated_unrolled_size): Add 2
new parameters: loop and ul, and remove unr_insns reduction
for innermost loop.
(try_unroll_loop_completely): Pass loop and ul to
estimated_unrolled_size.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/pr112325.c: New test.
* gcc.dg/vect/pr69783.c: Add extra option --param
max-completely-peeled-insns=300.
---
 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c | 57 
 gcc/testsuite/gcc.dg/vect/pr69783.c  |  2 +-
 gcc/tree-ssa-loop-ivcanon.cc | 16 +--
 3 files changed, 71 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr112325.c

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
new file mode 100644
index 000..14208b3e7f8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr112325.c
@@ -0,0 +1,57 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-cunrolli-details" } */
+
+typedef unsigned short ggml_fp16_t;
+static float table_f32_f16[1 << 16];
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+unsigned short s;
+__builtin_memcpy(&s, &f, sizeof(unsigned short));
+return table_f32_f16[s];
+}
+
+typedef struct {
+ggml_fp16_t d;
+ggml_fp16_t m;
+unsigned char qh[4];
+unsigned char qs[32 / 2];
+} block_q5_1;
+
+typedef struct {
+float d;
+float s;
+char qs[32];
+} block_q8_1;
+
+void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * 
restrict vx, const void * restrict vy) {
+const int qk = 32;
+const int nb = n / qk;
+
+const block_q5_1 * restrict x = vx;
+const block_q8_1 * restrict y = vy;
+
+float sumf = 0.0;
+
+for (int i = 0; i < nb; i++) {
+unsigned qh;
+__builtin_memcpy(&qh, x[i].qh, sizeof(qh));
+
+int sumi = 0;
+
+for (int j = 0; j < qk/2; ++j) {
+const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
+
+const int x0 = (x[i].qs[j] & 0xF) | xh_0;
+const int x1 = (x[i].qs[j] >> 4) | xh_1;
+
+sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
+}
+
+sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + 
ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
+}
+
+*s = sumf;
+}
+
+/* { dg-final { scan-tree-dump {(?n)Not unrolling loop [1-9] \(--param max-completely-peel-times limit reached} "cunrolli"} } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr69783.c 
b/gcc/testsuite/gcc.dg/vect/pr69783.c
index 5df95d0ce4e..a1f75514d72 100644
--- a/gcc/testsuite/gcc.dg/vect/pr69783.c
+++ b/gcc/testsuite/gcc.dg/vect/pr69783.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target vect_float } */
-/* { dg-additional-options "-Ofast -funroll-loops" } */
+/* { dg-additional-options "-Ofast -funroll-loops --param 
max-completely-peeled-insns=300" } */
 
 #define NXX 516
 #define NYY 516
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index bf017137260..5e0eca647a1 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -444,7 +444,9 @@ tree_estimate_loop_size (class loop *loop, edge exit, edge 
edge_to_cancel,
 
 static unsigned HOST_WIDE_INT
 estimated_unrolled_size (struct loop_size *size,
-unsigned HOST_WIDE_INT nunroll)
+unsigned HOST_WIDE_INT nunroll,
+enum unroll_level ul,
+class loop* loop)
 {
   HOST_WIDE_INT unr_insns = ((nunroll)
 * (HOST_WIDE_INT) (size->overall
@@ -453,7 +455,15 @@ estimated_unrolled_size (struct loop_size *size,
 unr_insns = 0;
   unr_insns += size->last_iteration - 
size->last_iteration_eliminated_by_peeling;
 
-  unr_insns = unr_insns * 2 / 3;
+  /* For the innermost loop, the loop body is not likely to be simplified
+ by as much as 1/3, and unrolling may increase register pressure a lot.
+ UL != UL_ALL is needed to unroll small loops at O2.  */
+  class loop *loop_father = loop_outer (loop);
+  if (loop->inner || !loop_father
+  || loop_father->latch == EXIT_BLOCK_PTR_FOR_FN (cfun)
+  || ul != UL_ALL)
+unr_insns = unr_insns * 2 / 3;
+
   if (unr_insns <= 0)
 unr_insns = 1;
 
@@ -837,7 +847,7 @@ try_unroll_loop_completely (class loop *loop,
 
  unsigned HOST_WIDE_INT ninsns = size.overall;
  unsigned HOST_WIDE_INT unr_insns
-   = es

[PATCH] Don't assert for IFN_COND_{MIN, MAX} in vect_transform_reduction

2024-04-29 Thread liuhongt
The Fortran standard does not specify what the result of the MAX
and MIN intrinsics is if one of the arguments is a NaN. So it
should be ok to transform a reduction for IFN_COND_MIN with vectorized
COND_MIN and REDUC_MIN.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk and backport to GCC14?

gcc/ChangeLog:

PR 114883
* tree-vect-loop.cc (vect_transform_reduction): Don't assert
for IFN_COND_{MIN, MAX}.

gcc/testsuite/ChangeLog:

* gfortran.dg/pr114883.f90: New test.
---
 gcc/testsuite/gfortran.dg/pr114883.f90 | 191 +
 gcc/tree-vect-loop.cc  |   3 +-
 2 files changed, 193 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gfortran.dg/pr114883.f90

diff --git a/gcc/testsuite/gfortran.dg/pr114883.f90 
b/gcc/testsuite/gfortran.dg/pr114883.f90
new file mode 100644
index 000..86b664a521e
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/pr114883.f90
@@ -0,0 +1,191 @@
+! { dg-do compile }
+! { dg-options "-O3" }
+! { dg-additional-options "-march=x86-64-v4" { target { x86_64-*-* i?86-*-* } } }
+
+module ndrop
+
+
+  implicit none
+
+  private
+  save
+
+  public dropmixnuc
+
+  real(8) :: npv(1011) ! number per volume concentration
+  real(8) :: alogsig(1011) ! natl log of geometric standard dev of aerosol
+
+  type qqcw_type
+ real(8), pointer :: fldcw(:,:)
+  end type qqcw_type
+
+contains
+
+  subroutine dropmixnuc(lchnk, ncol, temp,  &
+   cldn,cldo, &
+   raer, dtmicro   &
+   ) 
+implicit none
+
+! input
+
+integer, intent(in) :: lchnk! chunk identifier
+integer, intent(in) :: ncol ! number of columns
+!  type(physics_state), intent(in) :: state  ! Physics state 
variables
+real(8), intent(in) :: dtmicro ! time step for microphysics (s)
+real(8), intent(in) :: temp(1,1011)! temperature (K)
+real(8), intent(in) :: cldo(1,1011)! cloud fraction on previous time 
step
+real(8), intent(in) :: cldn(1,1011)! cloud fraction
+real(8), intent(in) :: raer(1,1011,1011) ! aerosol mass, number mixing 
ratios
+
+
+type(qqcw_type) :: QQCW(1011)
+
+real(8) depvel(1,1011)! deposition velocity for droplets (m/s)
+real(8) wtke(1,1011) ! turbulent vertical velocity at base of layer k 
(m/s)
+real(8) wtke_cen(1,1011) ! turbulent vertical velocity at center of layer 
k (m/s)
+real(8) zn(1011) ! g/pdel (m2/g) for layer
+real(8) zs(1011) ! inverse of distance between levels (m)
+real(8), parameter :: zkmin=0.01_8,zkmax=100._8
+real(8) cs(1,1011)  ! air density (kg/m3)
+real(8) dz(1,1011)  ! geometric thickness of layers (m)
+real(8) zero
+
+real(8) wdiab   ! diabatic vertical velocity
+real(8), parameter :: wmixmin = 0.1 ! minimum turbulence vertical velocity 
(m/s)
+!   real(8), parameter :: wmixmin = 0.2 ! minimum turbulence vertical 
velocity (m/s)
+!  real(8), parameter :: wmixmin = 1.0 ! minimum turbulence vertical 
velocity (m/s)
+real(8) ekk(0:1011)   ! density*diffusivity for droplets (kg/m3 m2/s)
+real(8), parameter :: sq2pi=2.5066283_8
+real(8) dtinv
+
+integer km1,kp1
+real(8) wbar,wmix,wmin,wmax
+real(8) dumc
+real(8) fac_srflx
+real(8) surfrate(1011) ! surface exchange rate (/s)
+real(8) surfratemax  ! max surfrate for all species treated here
+real(8) dtmin,tinv,dtt
+integer nsubmix,nsubmix_bnd
+integer i,k,m
+real(8) dtmix
+real(8) pi
+integer nnew,nsav,ntemp
+real(8) ekkp(1011),ekkm(1011) ! zn*zs*density*diffusivity
+integer count_submix(100)
+save count_submix
+real(8) nsource(1,1011)! droplet number source (#/kg/s)
+real(8) ndropmix(1,1011)   ! droplet number mixing (#/kg/s)
+real(8) ndropcol(1)   ! column droplet number (#/m2)
+
+real(8) na(1),va(1),hy(1)
+real(8) naermod(1011) ! (/m3)
+real(8) hygro(1011)  ! hygroscopicity of aerosol mode
+real(8) vaerosol(1011) ! interstit+activated aerosol volume conc (cm3/cm3)
+real(8) :: taumix_internal_1011_inv ! 1/(internal mixing time scale for 
k=1011) (1/s)
+real(8) :: cldo_tmp, cldn_tmp
+real(8) :: tau_cld_regenerate
+
+integer ixndrop, l
+integer, parameter :: psat=6 ! number of supersaturations to calc ccn 
concentration
+real(8)  :: supersat(psat)= & ! supersaturation (%) to determine ccn 
concentration
+ (/0.02,0.05,0.1,0.2,0.5,1.0/)
+real(8) ccn(1,1011,psat)! number conc of aerosols activated at 
supersat
+character(len=8), dimension(psat) :: ccn_name(psat)= &
+ (/'CCN1','CCN2','CCN3','CCN4','CCN5','CCN6'/)
+real(8) arg
+integer phase ! phase of aerosol
+
+
+
+arg = 1.0_8
+zero=0._8
+
+
+pi = 4._8*atan(1.0_8)
+dtinv=1./dtmicro
+
+depvel(:,:) = 0.0_8! droplet number is done in pkg_cld_sediment, 
aerosols 

[PATCH] [x86] Optimize 64-bit vector permutation with punpcklqdq + 128-bit vector pshuf.

2024-04-27 Thread liuhongt
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}
Ready push to trunk.

gcc/ChangeLog:

PR target/113090
* config/i386/i386-expand.cc
(expand_vec_perm_punpckldq_pshuf): New function.
(ix86_expand_vec_perm_const_1): Try
expand_vec_perm_punpckldq_pshuf for sequence of 2
instructions.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr113090.c: New test.
---
 gcc/config/i386/i386-expand.cc   | 71 
 gcc/testsuite/gcc.target/i386/pr113090.c | 25 +
 2 files changed, 96 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr113090.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 8bb8f21e686..fd49d866004 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -20813,6 +20813,74 @@ expand_vec_perm_pshuflw_pshufhw (struct 
expand_vec_perm_d *d)
   return true;
 }
 
+/* Try to permute 2 64-bit vectors by punpckldq + 128-bit vector shuffle.  */
+static bool
+expand_vec_perm_punpckldq_pshuf (struct expand_vec_perm_d *d)
+{
+  if (GET_MODE_BITSIZE (d->vmode) != 64
+  || !TARGET_MMX_WITH_SSE
+  || d->one_operand_p)
+return false;
+
+  machine_mode widen_vmode;
+  switch (d->vmode)
+{
+/* pshufd.  */
+case E_V2SImode:
+  widen_vmode = V4SImode;
+  break;
+
+/* pshufd.  */
+case E_V2SFmode:
+  widen_vmode = V4SFmode;
+  break;
+
+case E_V4HImode:
+  widen_vmode = V8HImode;
+  /* pshufb.  */
+  if (!TARGET_SSSE3)
+   return false;
+  break;
+
+case E_V8QImode:
+  /* pshufb.  */
+  widen_vmode = V16QImode;
+  if (!TARGET_SSSE3)
+   return false;
+  break;
+
+default:
+  return false;
+}
+
+  if (d->testing_p)
+return true;
+
+  struct expand_vec_perm_d dperm;
+  dperm.target = gen_reg_rtx (widen_vmode);
+  rtx op0 = gen_reg_rtx (widen_vmode);
+  emit_move_insn (op0, gen_rtx_VEC_CONCAT (widen_vmode, d->op0, d->op1));
+  dperm.op0 = op0;
+  dperm.op1 = op0;
+  dperm.vmode = widen_vmode;
+  unsigned nelt = GET_MODE_NUNITS (widen_vmode);
+  dperm.nelt = nelt;
+  dperm.one_operand_p = true;
+  dperm.testing_p = false;
+
+  for (unsigned i = 0; i != nelt / 2; i++)
+{
+  dperm.perm[i] = d->perm[i];
+  dperm.perm[i + nelt / 2] = d->perm[i];
+}
+
+  gcc_assert (expand_vec_perm_1 (&dperm));
+  emit_move_insn (d->target, lowpart_subreg (d->vmode,
+dperm.target,
+dperm.vmode));
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
the permutation using the SSSE3 palignr instruction.  This succeeds
when all of the elements in PERM fit within one vector and we merely
@@ -23325,6 +23393,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d 
*d)
   if (expand_vec_perm_shufps_shufps (d))
 return true;
 
+  if (expand_vec_perm_punpckldq_pshuf (d))
+return true;
+
   /* Try sequences of three instructions.  */
 
   if (expand_vec_perm_even_odd_pack (d))
diff --git a/gcc/testsuite/gcc.target/i386/pr113090.c 
b/gcc/testsuite/gcc.target/i386/pr113090.c
new file mode 100644
index 000..0f0b7cc0084
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr113090.c
@@ -0,0 +1,25 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -msse4.1" } */
+/* { dg-final { scan-assembler-times "pshufd" 3 } } */
+
+typedef int v2si __attribute__((vector_size(8)));
+typedef short v4hi __attribute__((vector_size(8)));
+typedef char v8qi __attribute__((vector_size(8)));
+
+v2si
+foo (v2si a, v2si b)
+{
+return __builtin_shufflevector (a, b, 1, 2);
+}
+
+v4hi
+foo1 (v4hi a, v4hi b)
+{
+  return __builtin_shufflevector (a, b, 2, 3, 4, 5);
+}
+
+v8qi
+foo2 (v8qi a, v8qi b)
+{
+  return __builtin_shufflevector (a, b, 4, 5, 6, 7, 8, 9, 10, 11);
+}
-- 
2.31.1



[PATCH 1/2] [x86] Support dot_prod optabs for 64-bit vector.

2024-04-27 Thread liuhongt
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

PR target/113079
* config/i386/mmx.md (usdot_prodv8qi): New expander.
(sdot_prodv8qi): Ditto.
(udot_prodv8qi): Ditto.
(usdot_prodv4hi): Ditto.
(udot_prodv4hi): Ditto.
(sdot_prodv4hi): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr113079.c: New test.
* gcc.target/i386/pr113079-2.c: New test.
* gcc.target/i386/sse4-pr113079-2.c: New test.
---
 gcc/config/i386/mmx.md| 195 ++
 gcc/testsuite/gcc.target/i386/pr113079-2.c| 161 +++
 gcc/testsuite/gcc.target/i386/pr113079.c  |  57 +
 .../gcc.target/i386/sse4-pr113079-2.c | 158 ++
 4 files changed, 571 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr113079-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr113079.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse4-pr113079-2.c

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 9a8d6030d8b..5f342497885 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -6342,6 +6342,201 @@ (define_expand "usadv8qi"
   DONE;
 })
 
+(define_expand "usdot_prodv8qi"
+  [(match_operand:V2SI 0 "register_operand")
+   (match_operand:V8QI 1 "register_operand")
+   (match_operand:V8QI 2 "register_operand")
+   (match_operand:V2SI 3 "register_operand")]
+  "TARGET_MMX_WITH_SSE && TARGET_SSE4_1"
+{
+  operands[1] = force_reg (V8QImode, operands[1]);
+  operands[2] = force_reg (V8QImode, operands[2]);
+  operands[3] = force_reg (V2SImode, operands[3]);
+
+  if ((TARGET_AVX512VNNI && TARGET_AVX512VL)
+ || TARGET_AVXVNNI)
+{
+  rtx op1 = lowpart_subreg (V16QImode, operands[1], V8QImode);
+  rtx op2 = lowpart_subreg (V16QImode, operands[2], V8QImode);
+  rtx op3 = lowpart_subreg (V4SImode, operands[3], V2SImode);
+  rtx op0 = gen_reg_rtx (V4SImode);
+
+  emit_insn (gen_usdot_prodv16qi (op0, op1, op2, op3));
+  emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, V4SImode));
+ }
+   else
+ {
+  rtx op1 = gen_reg_rtx (V8HImode);
+  rtx op2 = gen_reg_rtx (V8HImode);
+  rtx op3 = gen_reg_rtx (V4SImode);
+  rtx op0 = gen_reg_rtx (V4SImode);
+  rtx op0_1 = gen_reg_rtx (V4SImode);
+
+  emit_move_insn (op3, CONST0_RTX (V4SImode));
+  emit_insn (gen_zero_extendv8qiv8hi2 (op1, operands[1]));
+  emit_insn (gen_extendv8qiv8hi2 (op2, operands[2]));
+  emit_insn (gen_sdot_prodv8hi (op0, op1, op2, op3));
+
+  /* vec_perm (op0, 2, 3, 0, 1);  */
+  emit_insn (gen_sse2_pshufd (op0_1, op0, GEN_INT (78)));
+  emit_insn (gen_addv4si3 (op0, op0, op0_1));
+  emit_insn (gen_addv2si3 (operands[0], operands[3],
+  lowpart_subreg (V2SImode, op0, V4SImode)));
+ }
+DONE;
+})
+
+(define_expand "sdot_prodv8qi"
+  [(match_operand:V2SI 0 "register_operand")
+   (match_operand:V8QI 1 "register_operand")
+   (match_operand:V8QI 2 "register_operand")
+   (match_operand:V2SI 3 "register_operand")]
+  "TARGET_MMX_WITH_SSE && TARGET_SSE4_1"
+{
+  operands[1] = force_reg (V8QImode, operands[1]);
+  operands[2] = force_reg (V8QImode, operands[2]);
+  operands[3] = force_reg (V2SImode, operands[3]);
+
+  if (TARGET_AVXVNNIINT8)
+{
+  rtx op1 = lowpart_subreg (V16QImode, operands[1], V8QImode);
+  rtx op2 = lowpart_subreg (V16QImode, operands[2], V8QImode);
+  rtx op3 = lowpart_subreg (V4SImode, operands[3], V2SImode);
+  rtx op0 = gen_reg_rtx (V4SImode);
+
+  emit_insn (gen_sdot_prodv16qi (op0, op1, op2, op3));
+  emit_move_insn (operands[0], lowpart_subreg (V2SImode, op0, V4SImode));
+}
+  else
+{
+  rtx op1 = gen_reg_rtx (V8HImode);
+  rtx op2 = gen_reg_rtx (V8HImode);
+  rtx op3 = gen_reg_rtx (V4SImode);
+  rtx op0 = gen_reg_rtx (V4SImode);
+  rtx op0_1 = gen_reg_rtx (V4SImode);
+
+  emit_move_insn (op3, CONST0_RTX (V4SImode));
+  emit_insn (gen_extendv8qiv8hi2 (op1, operands[1]));
+  emit_insn (gen_extendv8qiv8hi2 (op2, operands[2]));
+  emit_insn (gen_sdot_prodv8hi (op0, op1, op2, op3));
+
+  /* vec_perm (op0, 2, 3, 0, 1);  */
+  emit_insn (gen_sse2_pshufd (op0_1, op0, GEN_INT (78)));
+  emit_insn (gen_addv4si3 (op0, op0, op0_1));
+  emit_insn (gen_addv2si3 (operands[0], operands[3],
+  lowpart_subreg (V2SImode, op0, V4SImode)));
+}
+  DONE;
+
+})
+
+(define_expand "udot_prodv8qi"
+  [(match_operand:V2SI 0 "register_operand")
+   (match_operand:V8QI 1 "register_operand")
+   (match_operand:V8QI 2 "register_operand")
+   (match_operand:V2SI 3 "register_operand")]
+  "TARGET_MMX_WITH_SSE && TARGET_SSE4_1"
+{
+  operands[1] = force_reg (V8QImode, operands[1]);
+  operands[2] = force_reg (V8QImode, operands[2]);
+  operands[3] = force_reg (V2SImode, operands[3]);
+
+  if (TARGET_AVXVNNIINT8)
+{
+  rtx op1 =

[PATCH 2/2] Extend usdot_prodv*qi with vpmaddwd when AVXVNNI/AVX512VNNI is not available.

2024-04-27 Thread liuhongt
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ready push to trunk.

gcc/ChangeLog:

* config/i386/sse.md (usdot_prodv*qi): Extend to VI1_AVX512
with vpmaddwd when avxvnni/avx512vnni is not available.
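
For reference, a scalar model of the usdot_prod semantics being
emulated (my sketch): each 32-bit accumulator lane adds four
unsigned-by-signed byte products, which is what the unpack-to-HImode
plus vpmaddwd (sdot_prodv8hi) sequence computes.

#include <stdint.h>

void
usdot_ref (int32_t *acc, const uint8_t *a, const int8_t *b, int nlanes)
{
  for (int i = 0; i < nlanes; i++)
    for (int k = 0; k < 4; k++)
      acc[i] += (int32_t) a[4 * i + k] * (int32_t) b[4 * i + k];
}
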
---
 gcc/config/i386/sse.md | 55 +++---
 1 file changed, 41 insertions(+), 14 deletions(-)

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 1bf50726e83..f57f36ae380 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -29955,21 +29955,48 @@ (define_insn "vpshldv__maskz_1"
 
 (define_expand "usdot_prod"
   [(match_operand: 0 "register_operand")
-   (match_operand:VI1_AVX512VNNI 1 "register_operand")
-   (match_operand:VI1_AVX512VNNI 2 "register_operand")
+   (match_operand:VI1_AVX512 1 "register_operand")
+   (match_operand:VI1_AVX512 2 "register_operand")
(match_operand: 3 "register_operand")]
-  "(( == 64 && TARGET_EVEX512)
-|| ((TARGET_AVX512VNNI && TARGET_AVX512VL)
-   || TARGET_AVXVNNI))"
-{
-  operands[1] = lowpart_subreg (mode,
-   force_reg (mode, operands[1]),
-   mode);
-  operands[2] = lowpart_subreg (mode,
-   force_reg (mode, operands[2]),
-   mode);
-  emit_insn (gen_vpdpbusd_ (operands[0], operands[3],
- operands[1], operands[2]));
+  "TARGET_SSE2"
+{
+  if ( == 64
+ ? TARGET_AVX512VNNI
+ : ((TARGET_AVX512VNNI && TARGET_AVX512VL) || TARGET_AVXVNNI))
+{
+  operands[1] = lowpart_subreg (mode,
+   force_reg (mode, operands[1]),
+   mode);
+  operands[2] = lowpart_subreg (mode,
+   force_reg (mode, operands[2]),
+   mode);
+  emit_insn (gen_vpdpbusd_ (operands[0], operands[3],
+ operands[1], operands[2]));
+}
+  else
+{
+  /* Emulate with vpdpwssd.  */
+  rtx op1_lo = gen_reg_rtx (mode);
+  rtx op1_hi = gen_reg_rtx (mode);
+  rtx op2_lo = gen_reg_rtx (mode);
+  rtx op2_hi = gen_reg_rtx (mode);
+
+  emit_insn (gen_vec_unpacku_lo_ (op1_lo, operands[1]));
+  emit_insn (gen_vec_unpacks_lo_ (op2_lo, operands[2]));
+  emit_insn (gen_vec_unpacku_hi_ (op1_hi, operands[1]));
+  emit_insn (gen_vec_unpacks_hi_ (op2_hi, operands[2]));
+
+  rtx res1 = gen_reg_rtx (mode);
+  rtx res2 = gen_reg_rtx (mode);
+  rtx sum = gen_reg_rtx (mode);
+
+  emit_move_insn (sum, CONST0_RTX (mode));
+  emit_insn (gen_sdot_prod (res1, op1_lo,
+   op2_lo, sum));
+  emit_insn (gen_sdot_prod (res2, op1_hi,
+   op2_hi, operands[3]));
+  emit_insn (gen_add3 (operands[0], res1, res2));
+}
   DONE;
 })
 
-- 
2.31.1


