[PATCH 3/4] x86: Properly handle USE_VECTOR_FP_CONVERTS/USE_VECTOR_CONVERTS

2021-09-15 Thread lili.cui--- via Gcc-patches
From: "H.J. Lu" 

Check TARGET_USE_VECTOR_FP_CONVERTS or TARGET_USE_VECTOR_CONVERTS when
handling avx_partial_xmm_update attribute.  Don't convert AVX partial
XMM register update if vector packed SSE conversion should be used.

gcc/

PR target/101900
* config/i386/i386-features.c (remove_partial_avx_dependency):
Check TARGET_USE_VECTOR_FP_CONVERTS and TARGET_USE_VECTOR_CONVERTS
before generating vxorps.

gcc/testsuite/

PR target/101900
* gcc.target/i386/pr101900-1.c: New test.
* gcc.target/i386/pr101900-2.c: Likewise.
* gcc.target/i386/pr101900-3.c: Likewise.
---
 gcc/config/i386/i386-features.c| 21 ++---
 gcc/testsuite/gcc.target/i386/pr101900-1.c | 18 ++
 gcc/testsuite/gcc.target/i386/pr101900-2.c | 18 ++
 gcc/testsuite/gcc.target/i386/pr101900-3.c | 19 +++
 4 files changed, 73 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101900-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101900-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101900-3.c

diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c
index 5a99ea7c046..ae5ea02a002 100644
--- a/gcc/config/i386/i386-features.c
+++ b/gcc/config/i386/i386-features.c
@@ -2210,15 +2210,30 @@ remove_partial_avx_dependency (void)
  != AVX_PARTIAL_XMM_UPDATE_TRUE)
continue;
 
- if (!v4sf_const0)
-   v4sf_const0 = gen_reg_rtx (V4SFmode);
-
  /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF,
 SI -> SF, SI -> DF, DI -> SF, DI -> DF, to vec_dup and
 vec_merge with subreg.  */
  rtx src = SET_SRC (set);
  rtx dest = SET_DEST (set);
  machine_mode dest_mode = GET_MODE (dest);
+ machine_mode src_mode;
+
+ if (TARGET_USE_VECTOR_FP_CONVERTS)
+   {
+ src_mode = GET_MODE (XEXP (src, 0));
+ if (src_mode == E_SFmode || src_mode == E_DFmode)
+   continue;
+   }
+
+ if (TARGET_USE_VECTOR_CONVERTS)
+   {
+ src_mode = GET_MODE (XEXP (src, 0));
+ if (src_mode == E_SImode || src_mode == E_DImode)
+   continue;
+   }
+
+ if (!v4sf_const0)
+   v4sf_const0 = gen_reg_rtx (V4SFmode);
 
  rtx zero;
  machine_mode dest_vecmode;
diff --git a/gcc/testsuite/gcc.target/i386/pr101900-1.c 
b/gcc/testsuite/gcc.target/i386/pr101900-1.c
new file mode 100644
index 000..0a45f8e340a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101900-1.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mfpmath=sse 
-mtune-ctrl=use_vector_fp_converts" } */
+
+extern float f;
+extern double d;
+extern int i;
+
+void
+foo (void)
+{
+  d = f;
+  f = i;
+}
+
+/* { dg-final { scan-assembler "vcvtps2pd" } } */
+/* { dg-final { scan-assembler "vcvtsi2ssl" } } */
+/* { dg-final { scan-assembler-not "vcvtss2sd" } } */
+/* { dg-final { scan-assembler-times "vxorps\[^\n\r\]*xmm\[0-9\]" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr101900-2.c 
b/gcc/testsuite/gcc.target/i386/pr101900-2.c
new file mode 100644
index 000..c8b2d1da5ae
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101900-2.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mfpmath=sse 
-mtune-ctrl=use_vector_converts" } */
+
+extern float f;
+extern double d;
+extern int i;
+
+void
+foo (void)
+{
+  d = f;
+  f = i;
+}
+
+/* { dg-final { scan-assembler "vcvtss2sd" } } */
+/* { dg-final { scan-assembler "vcvtdq2ps" } } */
+/* { dg-final { scan-assembler-not "vcvtsi2ssl" } } */
+/* { dg-final { scan-assembler-times "vxorps\[^\n\r\]*xmm\[0-9\]" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr101900-3.c 
b/gcc/testsuite/gcc.target/i386/pr101900-3.c
new file mode 100644
index 000..6ee565b5bd4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101900-3.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=skylake -mfpmath=sse 
-mtune-ctrl=use_vector_fp_converts,use_vector_converts" } */
+
+extern float f;
+extern double d;
+extern int i;
+
+void
+foo (void)
+{
+  d = f;
+  f = i;
+}
+
+/* { dg-final { scan-assembler "vcvtps2pd" } } */
+/* { dg-final { scan-assembler "vcvtdq2ps" } } */
+/* { dg-final { scan-assembler-not "vcvtss2sd" } } */
+/* { dg-final { scan-assembler-not "vcvtsi2ssl" } } */
+/* { dg-final { scan-assembler-not "vxorps" } } */
-- 
2.17.1



[PATCH 2/4] x86: Update memcpy/memset inline strategies for -mtune=tremont

2021-09-15 Thread lili.cui--- via Gcc-patches
From: "H.J. Lu" 

Simplify memcpy and memset inline strategies to avoid branches for
-mtune=tremont:

1. Create Tremont cost model from generic cost model.
2. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
   load and store for up to 16 * 16 (256) bytes when the data size is
   fixed and known.
3. Inline only if data size is known to be <= 256.
   a. Use "rep movsb/stosb" with simple code sequence if the data size
  is a constant.
   b. Use loop if data size is not a constant.
4. Use memcpy/memset library function if data size is unknown or > 256.

* config/i386/i386-options.c (processor_cost_table): Use
tremont_cost for Tremont.
* config/i386/x86-tune-costs.h (tremont_memcpy): New.
(tremont_memset): Likewise.
(tremont_cost): Likewise.
* config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
Enable for Tremont.
---
 gcc/config/i386/i386-options.c   |   2 +-
 gcc/config/i386/x86-tune-costs.h | 124 +++
 gcc/config/i386/x86-tune.def |   2 +-
 3 files changed, 126 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
index c0006b3674b..e7a3bd4aaea 100644
--- a/gcc/config/i386/i386-options.c
+++ b/gcc/config/i386/i386-options.c
@@ -724,7 +724,7 @@ static const struct processor_costs *processor_cost_table[] 
=
   _cost,
   _cost,
   _cost,
-  _cost,
+  _cost,
   _cost,
   _cost,
   _cost,
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index ffe810f2bcb..93644be9cb3 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2734,6 +2734,130 @@ struct processor_costs slm_cost = {
   "16",/* Func alignment.  */
 };
 
+static stringop_algs tremont_memcpy[2] = {
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+{256, loop, false},
+{-1, libcall, false}}},
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+{256, loop, false},
+{-1, libcall, false}}}};
+static stringop_algs tremont_memset[2] = {
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+{256, loop, false},
+{-1, libcall, false}}},
+  {libcall,
+   {{256, rep_prefix_1_byte, true},
+{256, loop, false},
+{-1, libcall, false}}}};
+static const
+struct processor_costs tremont_cost = {
+  {
+  /* Start of register allocator costs.  integer->integer move cost is 2. */
+  6,/* cost for loading QImode using movzbl */
+  {6, 6, 6},   /* cost of loading integer registers
+  in QImode, HImode and SImode.
+  Relative to reg-reg move (2).  */
+  {6, 6, 6},   /* cost of storing integer registers */
+  4,   /* cost of reg,reg fld/fst */
+  {6, 6, 12},  /* cost of loading fp registers
+  in SFmode, DFmode and XFmode */
+  {6, 6, 12},  /* cost of storing fp registers
+  in SFmode, DFmode and XFmode */
+  2,   /* cost of moving MMX register */
+  {6, 6},  /* cost of loading MMX registers
+  in SImode and DImode */
+  {6, 6},  /* cost of storing MMX registers
+  in SImode and DImode */
+  2, 3, 4, /* cost of moving XMM,YMM,ZMM register 
*/
+  {6, 6, 6, 10, 15},   /* cost of loading SSE registers
+  in 32,64,128,256 and 512-bit */
+  {6, 6, 6, 10, 15},   /* cost of storing SSE registers
+  in 32,64,128,256 and 512-bit */
+  6, 6,/* SSE->integer and integer->SSE moves 
*/
+  6, 6,/* mask->integer and integer->mask 
moves */
+  {6, 6, 6},   /* cost of loading mask register
+  in QImode, HImode, SImode.  */
+  {6, 6, 6},   /* cost if storing mask register
+  in QImode, HImode, SImode.  */
+  2,   /* cost of moving mask register.  */
+  /* End of register allocator costs.  */
+  },
+
+  COSTS_N_INSNS (1),   /* cost of an add instruction */
+  /* Setting cost to 2 makes our current implementation of synth_mult result in
+ use of unnecessary temporary registers causing regression on several
+ SPECfp benchmarks.  */
+  COSTS_N_INSNS (1) + 1,   /* cost of a lea instruction */
+  COSTS_N_INSNS (1),   /* variable shift costs */
+  COSTS_N_INSNS (1),   /* constant shift costs */
+  

[PATCH 4/4] x86: Add TARGET_SSE_PARTIAL_REG_[FP_]CONVERTS_DEPENDENCY

2021-09-15 Thread lili.cui--- via Gcc-patches
From: "H.J. Lu" 

1. Replace TARGET_SSE_PARTIAL_REG_DEPENDENCY with
TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY in SSE FP to FP splitters.
2. Replace TARGET_SSE_PARTIAL_REG_DEPENDENCY with
TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY in SSE INT to FP splitters.
3. Also check TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY and
TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY when handling avx_partial_xmm_update
attribute.  Don't convert AVX partial XMM register update if there is no
partial SSE register dependency for SSE conversion.

gcc/

* config/i386/i386-features.c (remove_partial_avx_dependency):
Also check TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY and
TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY before generating
vxorps.
* config/i386/i386.h (TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY):
New.
(TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY): Likewise.
* config/i386/i386.md (SSE FP to FP splitters): Replace
TARGET_SSE_PARTIAL_REG_DEPENDENCY with
TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY.
(SSE INT to FP splitter): Replace TARGET_SSE_PARTIAL_REG_DEPENDENCY
with TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY.
* config/i386/x86-tune.def
(X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY): New.
(X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY): Likewise.

gcc/testsuite/

* gcc.target/i386/avx-covert-1.c: New file.
* gcc.target/i386/avx-fp-covert-1.c: Likewise.
* gcc.target/i386/avx-int-covert-1.c: Likewise.
* gcc.target/i386/sse-covert-1.c: Likewise.
* gcc.target/i386/sse-fp-covert-1.c: Likewise.
* gcc.target/i386/sse-int-covert-1.c: Likewise.
---
 gcc/config/i386/i386-features.c   |  6 --
 gcc/config/i386/i386.h|  4 
 gcc/config/i386/i386.md   |  9 ++---
 gcc/config/i386/x86-tune.def  | 15 +++
 gcc/testsuite/gcc.target/i386/avx-covert-1.c  | 19 +++
 .../gcc.target/i386/avx-fp-covert-1.c | 15 +++
 .../gcc.target/i386/avx-int-covert-1.c| 14 ++
 gcc/testsuite/gcc.target/i386/sse-covert-1.c  | 19 +++
 .../gcc.target/i386/sse-fp-covert-1.c | 15 +++
 .../gcc.target/i386/sse-int-covert-1.c| 14 ++
 10 files changed, 125 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx-covert-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx-fp-covert-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx-int-covert-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse-covert-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse-fp-covert-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse-int-covert-1.c

diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c
index ae5ea02a002..91bfa06d4bf 100644
--- a/gcc/config/i386/i386-features.c
+++ b/gcc/config/i386/i386-features.c
@@ -2218,14 +2218,16 @@ remove_partial_avx_dependency (void)
  machine_mode dest_mode = GET_MODE (dest);
  machine_mode src_mode;
 
- if (TARGET_USE_VECTOR_FP_CONVERTS)
+ if (TARGET_USE_VECTOR_FP_CONVERTS
+ || !TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY)
{
  src_mode = GET_MODE (XEXP (src, 0));
  if (src_mode == E_SFmode || src_mode == E_DFmode)
continue;
}
 
- if (TARGET_USE_VECTOR_CONVERTS)
+ if (TARGET_USE_VECTOR_CONVERTS
+ || !TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY)
{
  src_mode = GET_MODE (XEXP (src, 0));
  if (src_mode == E_SImode || src_mode == E_DImode)
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index e76bb55c080..ec60b89753e 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -334,6 +334,10 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_PARTIAL_REG_DEPENDENCY]
 #define TARGET_SSE_PARTIAL_REG_DEPENDENCY \
ix86_tune_features[X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY]
+#define TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY \
+   ix86_tune_features[X86_TUNE_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY]
+#define TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY \
+   ix86_tune_features[X86_TUNE_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY]
 #define TARGET_SSE_UNALIGNED_LOAD_OPTIMAL \
ix86_tune_features[X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL]
 #define TARGET_SSE_UNALIGNED_STORE_OPTIMAL \
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 13f6f57cdcc..c82a9dc1f67 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -4535,7 +4535,8 @@
 (float_extend:DF
   (match_operand:SF 1 "nonimmediate_operand")))]
   "!TARGET_AVX
-   && TARGET_SSE_PARTIAL_REG_DEPENDENCY && epilogue_completed
+   && 

[PATCH 1/4] x86: Update -mtune=tremont

2021-09-15 Thread lili.cui--- via Gcc-patches
From: "H.J. Lu" 

Initial -mtune=tremont update

1. Use Haswell scheduling model.
2. Assume that stack engine allows to execute push instructions in
parallel.
3. Prepare for scheduling pass as -mtune=generic.
4. Use the same issue rate as -mtune=generic.
5. Enable partial_reg_dependency.
6. Disable accumulate_outgoing_args
7. Enable use_leave
8. Enable push_memory
9. Disable four_jump_limit
10. Disable opt_agu
11. Disable avoid_lea_for_addr
12. Disable avoid_mem_opnd_for_cmove
13. Enable misaligned_move_string_pro_epilogues
14. Enable use_cltd
16. Enable avoid_false_dep_for_bmi
17. Enable avoid_mfence
18. Disable expand_abs
19. Enable sse_typeless_stores
20. Enable sse_load0_by_pxor
21. Disable split_mem_opnd_for_fp_converts
22. Disable slow_pshufb
23. Enable sse_partial_reg_dependency

This is the first patch to tune for Tremont.  With all patches applied,
performance impacts on SPEC CPU 2017 are:

500.perlbench_r 1.81%
502.gcc_r   0.57%
505.mcf_r   1.16%
520.omnetpp_r   0.00%
523.xalancbmk_r 0.00%
525.x264_r  4.55%
531.deepsjeng_r 0.00%
541.leela_r 0.39%
548.exchange2_r 1.13%
557.xz_r0.00%
geomean for intrate 0.95%
503.bwaves_r0.00%
507.cactuBSSN_r 6.94%
508.namd_r  12.37%
510.parest_r1.01%
511.povray_r3.70%
519.lbm_r   36.61%
521.wrf_r   8.79%
526.blender_r   2.91%
527.cam4_r  6.23%
538.imagick_r   0.28%
544.nab_r   21.99%
549.fotonik3d_r 3.63%
554.roms_r  -1.20%
geomean for fprate  7.50%

gcc/ChangeLog

* common/config/i386/i386-common.c: Use Haswell scheduling model
for Tremont.
* config/i386/i386.c (ix86_sched_init_global): Prepare for Tremont
scheduling pass.
* config/i386/x86-tune-sched.c (ix86_issue_rate): Change Tremont
issue rate to 4.
(ix86_adjust_cost): Handle Tremont.
* config/i386/x86-tune.def (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY):
Enable for Tremont.
(X86_TUNE_USE_LEAVE): Likewise.
(X86_TUNE_PUSH_MEMORY): Likewise.
(X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES): Likewise.
(X86_TUNE_USE_CLTD): Likewise.
(X86_TUNE_AVOID_FALSE_DEP_FOR_BMI): Likewise.
(X86_TUNE_AVOID_MFENCE): Likewise.
(X86_TUNE_SSE_TYPELESS_STORES): Likewise.
(X86_TUNE_SSE_LOAD0_BY_PXOR): Likewise.
(X86_TUNE_ACCUMULATE_OUTGOING_ARGS): Disable for Tremont.
(X86_TUNE_FOUR_JUMP_LIMIT): Likewise.
(X86_TUNE_OPT_AGU): Likewise.
(X86_TUNE_AVOID_LEA_FOR_ADDR): Likewise.
(X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE): Likewise.
(X86_TUNE_EXPAND_ABS): Likewise.
(X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS): Likewise.
(X86_TUNE_SLOW_PSHUFB): Likewise.
---
 gcc/common/config/i386/i386-common.c |  2 +-
 gcc/config/i386/i386.c   |  1 +
 gcc/config/i386/x86-tune-sched.c |  2 ++
 gcc/config/i386/x86-tune.def | 37 ++--
 4 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/gcc/common/config/i386/i386-common.c 
b/gcc/common/config/i386/i386-common.c
index 00c65ba15ab..2c9e1ccbc6e 100644
--- a/gcc/common/config/i386/i386-common.c
+++ b/gcc/common/config/i386/i386-common.c
@@ -1935,7 +1935,7 @@ const pta processor_alias_table[] =
 M_CPU_TYPE (INTEL_GOLDMONT), P_PROC_SSE4_2},
   {"goldmont-plus", PROCESSOR_GOLDMONT_PLUS, CPU_GLM, PTA_GOLDMONT_PLUS,
 M_CPU_TYPE (INTEL_GOLDMONT_PLUS), P_PROC_SSE4_2},
-  {"tremont", PROCESSOR_TREMONT, CPU_GLM, PTA_TREMONT,
+  {"tremont", PROCESSOR_TREMONT, CPU_HASWELL, PTA_TREMONT,
 M_CPU_TYPE (INTEL_TREMONT), P_PROC_SSE4_2},
   {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL,
 M_CPU_TYPE (INTEL_KNL), P_PROC_AVX512F},
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 7b173bc0beb..2927e2884c9 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -16976,6 +16976,7 @@ ix86_sched_init_global (FILE *, int, int)
 case PROCESSOR_NEHALEM:
 case PROCESSOR_SANDYBRIDGE:
 case PROCESSOR_HASWELL:
+case PROCESSOR_TREMONT:
 case PROCESSOR_GENERIC:
   /* Do not perform multipass scheduling for pre-reload schedule
  to save compile time.  */
diff --git a/gcc/config/i386/x86-tune-sched.c b/gcc/config/i386/x86-tune-sched.c
index 2e5ee4e..56ada99a450 100644
--- a/gcc/config/i386/x86-tune-sched.c
+++ b/gcc/config/i386/x86-tune-sched.c
@@ -71,6 +71,7 @@ ix86_issue_rate (void)
 case PROCESSOR_NEHALEM:
 case PROCESSOR_SANDYBRIDGE:
 case PROCESSOR_HASWELL:
+case PROCESSOR_TREMONT:
 case PROCESSOR_GENERIC:
   return 4;
 
@@ -429,6 +430,7 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn 
*dep_insn, int cost,
 case PROCESSOR_NEHALEM:
 case PROCESSOR_SANDYBRIDGE:
 case PROCESSOR_HASWELL:
+case PROCESSOR_TREMONT:
 case 

[PATCH 0/4] Update mtune=tremont

2021-09-15 Thread lili.cui--- via Gcc-patches
From: "Cui,Lili" 

Hi,

I have four patches for tremont tuning.  With all patches applied,
performance impacts on SPEC CPU 2017 are:

500.perlbench_r 1.81%
502.gcc_r   0.57%
505.mcf_r   1.16%
520.omnetpp_r   0.00%
523.xalancbmk_r 0.00%
525.x264_r  4.55%
531.deepsjeng_r 0.00%
541.leela_r 0.39%
548.exchange2_r 1.13%
557.xz_r0.00%
geomean for intrate 0.95%
503.bwaves_r0.00%
507.cactuBSSN_r 6.94%
508.namd_r  12.37%
510.parest_r1.01%
511.povray_r3.70%
519.lbm_r   36.61%
521.wrf_r   8.79%
526.blender_r   2.91%
527.cam4_r  6.23%
538.imagick_r   0.28%
544.nab_r   21.99%
549.fotonik3d_r 3.63%
554.roms_r  -1.20%
geomean for fprate  7.50%

Bootstrapped and regtested on x86_64-linux-gnu{-m32,-m64}.
Ok for master?

  x86: Update -mtune=tremont
  x86: Update memcpy/memset inline strategies for -mtune=tremont
  x86: Properly handle USE_VECTOR_FP_CONVERTS/USE_VECTOR_CONVERTS
  x86: Add TARGET_SSE_PARTIAL_REG_[FP_]CONVERTS_DEPENDENCY

 gcc/common/config/i386/i386-common.c  |   2 +-
 gcc/config/i386/i386-features.c   |  23 +++-
 gcc/config/i386/i386-options.c|   2 +-
 gcc/config/i386/i386.c|   1 +
 gcc/config/i386/i386.h|   4 +
 gcc/config/i386/i386.md   |   9 +-
 gcc/config/i386/x86-tune-costs.h  | 124 ++
 gcc/config/i386/x86-tune-sched.c  |   2 +
 gcc/config/i386/x86-tune.def  |  52 +---
 gcc/testsuite/gcc.target/i386/avx-covert-1.c  |  19 +++
 .../gcc.target/i386/avx-fp-covert-1.c |  15 +++
 .../gcc.target/i386/avx-int-covert-1.c|  14 ++
 gcc/testsuite/gcc.target/i386/pr101900-1.c|  18 +++
 gcc/testsuite/gcc.target/i386/pr101900-2.c|  18 +++
 gcc/testsuite/gcc.target/i386/pr101900-3.c|  19 +++
 gcc/testsuite/gcc.target/i386/sse-covert-1.c  |  19 +++
 .../gcc.target/i386/sse-fp-covert-1.c |  15 +++
 .../gcc.target/i386/sse-int-covert-1.c|  14 ++
 18 files changed, 344 insertions(+), 26 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx-covert-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx-fp-covert-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx-int-covert-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101900-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101900-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101900-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse-covert-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse-fp-covert-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/sse-int-covert-1.c

-- 
2.17.1

Thanks,
Lili.