Re: [PATCH 1/4] x86: Update -mtune=tremont
On Wed, Sep 15, 2021 at 10:09 AM wrote: > > From: "H.J. Lu" > > Initial -mtune=tremont update > > 1. Use Haswell scheduling model. > 2. Assume that stack engine allows to execute push&pop instructions in > parallel. > 3. Prepare for scheduling pass as -mtune=generic. > 4. Use the same issue rate as -mtune=generic. > 5. Enable partial_reg_dependency. > 6. Disable accumulate_outgoing_args > 7. Enable use_leave > 8. Enable push_memory > 9. Disable four_jump_limit > 10. Disable opt_agu > 11. Disable avoid_lea_for_addr > 12. Disable avoid_mem_opnd_for_cmove > 13. Enable misaligned_move_string_pro_epilogues > 14. Enable use_cltd > 16. Enable avoid_false_dep_for_bmi > 17. Enable avoid_mfence > 18. Disable expand_abs > 19. Enable sse_typeless_stores > 20. Enable sse_load0_by_pxor > 21. Disable split_mem_opnd_for_fp_converts > 22. Disable slow_pshufb > 23. Enable partial_reg_dependency > > This is the first patch to tune for Tremont. With all patches applied, > performance impacts on SPEC CPU 2017 are: > > 500.perlbench_r 1.81% > 502.gcc_r 0.57% > 505.mcf_r 1.16% > 520.omnetpp_r 0.00% > 523.xalancbmk_r 0.00% > 525.x264_r 4.55% > 531.deepsjeng_r 0.00% > 541.leela_r 0.39% > 548.exchange2_r 1.13% > 557.xz_r 0.00% > geomean for intrate 0.95% > 503.bwaves_r 0.00% > 507.cactuBSSN_r 6.94% > 508.namd_r 12.37% > 510.parest_r 1.01% > 511.povray_r 3.70% > 519.lbm_r 36.61% > 521.wrf_r 8.79% > 526.blender_r 2.91% > 527.cam4_r 6.23% > 538.imagick_r 0.28% > 544.nab_r 21.99% > 549.fotonik3d_r 3.63% > 554.roms_r -1.20% > geomean for fprate 7.50% > > gcc/ChangeLog > > * common/config/i386/i386-common.c: Use Haswell scheduling model > for Tremont. > * config/i386/i386.c (ix86_sched_init_global): Prepare for Tremont > scheduling pass. > * config/i386/x86-tune-sched.c (ix86_issue_rate): Change Tremont > issue rate to 4. > (ix86_adjust_cost): Handle Tremont. > * config/i386/x86-tune.def (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY): > Enable for Tremont. > (X86_TUNE_USE_LEAVE): Likewise. 
> (X86_TUNE_PUSH_MEMORY): Likewise. > (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES): Likewise. > (X86_TUNE_USE_CLTD): Likewise. > (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI): Likewise. > (X86_TUNE_AVOID_MFENCE): Likewise. > (X86_TUNE_SSE_TYPELESS_STORES): Likewise. > (X86_TUNE_SSE_LOAD0_BY_PXOR): Likewise. > (X86_TUNE_ACCUMULATE_OUTGOING_ARGS): Disable for Tremont. > (X86_TUNE_FOUR_JUMP_LIMIT): Likewise. > (X86_TUNE_OPT_AGU): Likewise. > (X86_TUNE_AVOID_LEA_FOR_ADDR): Likewise. > (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE): Likewise. > (X86_TUNE_EXPAND_ABS): Likewise. > (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS): Likewise. > (X86_TUNE_SLOW_PSHUFB): Likewise. OK. (Tuning patches are kind of obvious). Thanks, Uros. > --- > gcc/common/config/i386/i386-common.c | 2 +- > gcc/config/i386/i386.c | 1 + > gcc/config/i386/x86-tune-sched.c | 2 ++ > gcc/config/i386/x86-tune.def | 37 ++-- > 4 files changed, 23 insertions(+), 19 deletions(-) > > diff --git a/gcc/common/config/i386/i386-common.c > b/gcc/common/config/i386/i386-common.c > index 00c65ba15ab..2c9e1ccbc6e 100644 > --- a/gcc/common/config/i386/i386-common.c > +++ b/gcc/common/config/i386/i386-common.c > @@ -1935,7 +1935,7 @@ const pta processor_alias_table[] = > M_CPU_TYPE (INTEL_GOLDMONT), P_PROC_SSE4_2}, >{"goldmont-plus", PROCESSOR_GOLDMONT_PLUS, CPU_GLM, PTA_GOLDMONT_PLUS, > M_CPU_TYPE (INTEL_GOLDMONT_PLUS), P_PROC_SSE4_2}, > - {"tremont", PROCESSOR_TREMONT, CPU_GLM, PTA_TREMONT, > + {"tremont", PROCESSOR_TREMONT, CPU_HASWELL, PTA_TREMONT, > M_CPU_TYPE (INTEL_TREMONT), P_PROC_SSE4_2}, >{"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL, > M_CPU_TYPE (INTEL_KNL), P_PROC_AVX512F}, > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index 7b173bc0beb..2927e2884c9 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -16976,6 +16976,7 @@ ix86_sched_init_global (FILE *, int, int) > case PROCESSOR_NEHALEM: > case PROCESSOR_SANDYBRIDGE: > case PROCESSOR_HASWELL: > +case PROCESSOR_TREMONT: > case PROCESSOR_GENERIC: 
>/* Do not perform multipass scheduling for pre-reload schedule > to save compile time. */ > diff --git a/gcc/config/i386/x86-tune-sched.c > b/gcc/config/i386/x86-tune-sched.c > index 2e5ee4e..56ada99a450 100644 > --- a/gcc/config/i386/x86-tune-sched.c > +++ b/gcc/config/i386/x86-tune-sched.c > @@ -71,6 +71,7 @@ ix86_issue_rate (void) > case PROCESSOR_NEHALEM: > case PROCESSOR_SANDYBRIDGE: >
[PATCH 1/4] x86: Update -mtune=tremont
From: "H.J. Lu" Initial -mtune=tremont update 1. Use Haswell scheduling model. 2. Assume that stack engine allows to execute push&pop instructions in parallel. 3. Prepare for scheduling pass as -mtune=generic. 4. Use the same issue rate as -mtune=generic. 5. Enable partial_reg_dependency. 6. Disable accumulate_outgoing_args 7. Enable use_leave 8. Enable push_memory 9. Disable four_jump_limit 10. Disable opt_agu 11. Disable avoid_lea_for_addr 12. Disable avoid_mem_opnd_for_cmove 13. Enable misaligned_move_string_pro_epilogues 14. Enable use_cltd 16. Enable avoid_false_dep_for_bmi 17. Enable avoid_mfence 18. Disable expand_abs 19. Enable sse_typeless_stores 20. Enable sse_load0_by_pxor 21. Disable split_mem_opnd_for_fp_converts 22. Disable slow_pshufb 23. Enable partial_reg_dependency This is the first patch to tune for Tremont. With all patches applied, performance impacts on SPEC CPU 2017 are: 500.perlbench_r 1.81% 502.gcc_r 0.57% 505.mcf_r 1.16% 520.omnetpp_r 0.00% 523.xalancbmk_r 0.00% 525.x264_r 4.55% 531.deepsjeng_r 0.00% 541.leela_r 0.39% 548.exchange2_r 1.13% 557.xz_r 0.00% geomean for intrate 0.95% 503.bwaves_r 0.00% 507.cactuBSSN_r 6.94% 508.namd_r 12.37% 510.parest_r 1.01% 511.povray_r 3.70% 519.lbm_r 36.61% 521.wrf_r 8.79% 526.blender_r 2.91% 527.cam4_r 6.23% 538.imagick_r 0.28% 544.nab_r 21.99% 549.fotonik3d_r 3.63% 554.roms_r -1.20% geomean for fprate 7.50% gcc/ChangeLog * common/config/i386/i386-common.c: Use Haswell scheduling model for Tremont. * config/i386/i386.c (ix86_sched_init_global): Prepare for Tremont scheduling pass. * config/i386/x86-tune-sched.c (ix86_issue_rate): Change Tremont issue rate to 4. (ix86_adjust_cost): Handle Tremont. * config/i386/x86-tune.def (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY): Enable for Tremont. (X86_TUNE_USE_LEAVE): Likewise. (X86_TUNE_PUSH_MEMORY): Likewise. (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES): Likewise. (X86_TUNE_USE_CLTD): Likewise. (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI): Likewise. 
(X86_TUNE_AVOID_MFENCE): Likewise. (X86_TUNE_SSE_TYPELESS_STORES): Likewise. (X86_TUNE_SSE_LOAD0_BY_PXOR): Likewise. (X86_TUNE_ACCUMULATE_OUTGOING_ARGS): Disable for Tremont. (X86_TUNE_FOUR_JUMP_LIMIT): Likewise. (X86_TUNE_OPT_AGU): Likewise. (X86_TUNE_AVOID_LEA_FOR_ADDR): Likewise. (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE): Likewise. (X86_TUNE_EXPAND_ABS): Likewise. (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS): Likewise. (X86_TUNE_SLOW_PSHUFB): Likewise. --- gcc/common/config/i386/i386-common.c | 2 +- gcc/config/i386/i386.c | 1 + gcc/config/i386/x86-tune-sched.c | 2 ++ gcc/config/i386/x86-tune.def | 37 ++-- 4 files changed, 23 insertions(+), 19 deletions(-) diff --git a/gcc/common/config/i386/i386-common.c b/gcc/common/config/i386/i386-common.c index 00c65ba15ab..2c9e1ccbc6e 100644 --- a/gcc/common/config/i386/i386-common.c +++ b/gcc/common/config/i386/i386-common.c @@ -1935,7 +1935,7 @@ const pta processor_alias_table[] = M_CPU_TYPE (INTEL_GOLDMONT), P_PROC_SSE4_2}, {"goldmont-plus", PROCESSOR_GOLDMONT_PLUS, CPU_GLM, PTA_GOLDMONT_PLUS, M_CPU_TYPE (INTEL_GOLDMONT_PLUS), P_PROC_SSE4_2}, - {"tremont", PROCESSOR_TREMONT, CPU_GLM, PTA_TREMONT, + {"tremont", PROCESSOR_TREMONT, CPU_HASWELL, PTA_TREMONT, M_CPU_TYPE (INTEL_TREMONT), P_PROC_SSE4_2}, {"knl", PROCESSOR_KNL, CPU_SLM, PTA_KNL, M_CPU_TYPE (INTEL_KNL), P_PROC_AVX512F}, diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 7b173bc0beb..2927e2884c9 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -16976,6 +16976,7 @@ ix86_sched_init_global (FILE *, int, int) case PROCESSOR_NEHALEM: case PROCESSOR_SANDYBRIDGE: case PROCESSOR_HASWELL: +case PROCESSOR_TREMONT: case PROCESSOR_GENERIC: /* Do not perform multipass scheduling for pre-reload schedule to save compile time. 
*/ diff --git a/gcc/config/i386/x86-tune-sched.c b/gcc/config/i386/x86-tune-sched.c index 2e5ee4e..56ada99a450 100644 --- a/gcc/config/i386/x86-tune-sched.c +++ b/gcc/config/i386/x86-tune-sched.c @@ -71,6 +71,7 @@ ix86_issue_rate (void) case PROCESSOR_NEHALEM: case PROCESSOR_SANDYBRIDGE: case PROCESSOR_HASWELL: +case PROCESSOR_TREMONT: case PROCESSOR_GENERIC: return 4; @@ -429,6 +430,7 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, case PROCESSOR_NEHALEM: case PROCESSOR_SANDYBRIDGE: case PROCESSOR_HASWELL: +case PROCESSOR_TREMONT: case PROCESSOR_GENE