On Wed, Sep 15, 2021 at 10:10 AM <lili....@intel.com> wrote:
>
> From: "H.J. Lu" <hjl.to...@gmail.com>
>
> Simplify memcpy and memset inline strategies to avoid branches for
> -mtune=tremont:
>
> 1. Create Tremont cost model from generic cost model.
> 2. With MOVE_RATIO and CLEAR_RATIO == 17, GCC will use integer/vector
>    load and store for up to 16 * 16 (256) bytes when the data size is
>    fixed and known.
> 3. Inline only if data size is known to be <= 256.
>    a. Use "rep movsb/stosb" with simple code sequence if the data size
>       is a constant.
>    b. Use loop if data size is not a constant.
> 4. Use the memcpy/memset library function if the data size is unknown or > 256.
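>
> As an illustration only (not part of the patch), here is a rough sketch of
> the intended behavior when compiling with -O2 -mtune=tremont; the function
> names are made up for this example:
>
>     #include <string.h>
>
>     void
>     copy_const (char *dst, const char *src)
>     {
>       /* Size is a compile-time constant <= 256: expected to be expanded
>          inline, e.g. with integer/vector moves or a short "rep movsb"
>          sequence, with no size checks or branches.  */
>       memcpy (dst, src, 200);
>     }
>
>     void
>     copy_var (char *dst, const char *src, size_t n)
>     {
>       /* Size is not known at compile time: expected to remain a call to
>          the memcpy library function.  */
>       memcpy (dst, src, n);
>     }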
>
>         * config/i386/i386-options.c (processor_cost_table): Use
>         tremont_cost for Tremont.
>         * config/i386/x86-tune-costs.h (tremont_memcpy): New.
>         (tremont_memset): Likewise.
>         (tremont_cost): Likewise.
>         * config/i386/x86-tune.def (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
>         Enable for Tremont.

OK, and also obvious as a tuning patch.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386-options.c   |   2 +-
>  gcc/config/i386/x86-tune-costs.h | 124 +++++++++++++++++++++++++++++++
>  gcc/config/i386/x86-tune.def     |   2 +-
>  3 files changed, 126 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
> index c0006b3674b..e7a3bd4aaea 100644
> --- a/gcc/config/i386/i386-options.c
> +++ b/gcc/config/i386/i386-options.c
> @@ -724,7 +724,7 @@ static const struct processor_costs *processor_cost_table[] =
>    &slm_cost,
>    &slm_cost,
>    &slm_cost,
> -  &slm_cost,
> +  &tremont_cost,
>    &slm_cost,
>    &slm_cost,
>    &skylake_cost,
> diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> index ffe810f2bcb..93644be9cb3 100644
> --- a/gcc/config/i386/x86-tune-costs.h
> +++ b/gcc/config/i386/x86-tune-costs.h
> @@ -2734,6 +2734,130 @@ struct processor_costs slm_cost = {
>    "16",                                        /* Func alignment.  */
>  };
>
> +static stringop_algs tremont_memcpy[2] = {
> +  {libcall,
> +   {{256, rep_prefix_1_byte, true},
> +    {256, loop, false},
> +    {-1, libcall, false}}},
> +  {libcall,
> +   {{256, rep_prefix_1_byte, true},
> +    {256, loop, false},
> +    {-1, libcall, false}}}};
> +static stringop_algs tremont_memset[2] = {
> +  {libcall,
> +   {{256, rep_prefix_1_byte, true},
> +    {256, loop, false},
> +    {-1, libcall, false}}},
> +  {libcall,
> +   {{256, rep_prefix_1_byte, true},
> +    {256, loop, false},
> +    {-1, libcall, false}}}};
> +static const
> +struct processor_costs tremont_cost = {
> +  {
> +  /* Start of register allocator costs.  integer->integer move cost is 2. */
> +  6,                                /* cost for loading QImode using movzbl */
> +  {6, 6, 6},                           /* cost of loading integer registers
> +                                          in QImode, HImode and SImode.
> +                                          Relative to reg-reg move (2).  */
> +  {6, 6, 6},                           /* cost of storing integer registers */
> +  4,                                   /* cost of reg,reg fld/fst */
> +  {6, 6, 12},                          /* cost of loading fp registers
> +                                          in SFmode, DFmode and XFmode */
> +  {6, 6, 12},                          /* cost of storing fp registers
> +                                          in SFmode, DFmode and XFmode */
> +  2,                                   /* cost of moving MMX register */
> +  {6, 6},                              /* cost of loading MMX registers
> +                                          in SImode and DImode */
> +  {6, 6},                              /* cost of storing MMX registers
> +                                          in SImode and DImode */
> +  2, 3, 4,                             /* cost of moving XMM,YMM,ZMM register */
> +  {6, 6, 6, 10, 15},                   /* cost of loading SSE registers
> +                                          in 32,64,128,256 and 512-bit */
> +  {6, 6, 6, 10, 15},                   /* cost of storing SSE registers
> +                                          in 32,64,128,256 and 512-bit */
> +  6, 6,                                /* SSE->integer and integer->SSE moves */
> +  6, 6,                                /* mask->integer and integer->mask moves */
> +  {6, 6, 6},                           /* cost of loading mask register
> +                                          in QImode, HImode, SImode.  */
> +  {6, 6, 6},                   /* cost of storing mask register
> +                                          in QImode, HImode, SImode.  */
> +  2,                                   /* cost of moving mask register.  */
> +  /* End of register allocator costs.  */
> +  },
> +
> +  COSTS_N_INSNS (1),                   /* cost of an add instruction */
> +  /* Setting cost to 2 makes our current implementation of synth_mult result in
> +     use of unnecessary temporary registers causing regression on several
> +     SPECfp benchmarks.  */
> +  COSTS_N_INSNS (1) + 1,               /* cost of a lea instruction */
> +  COSTS_N_INSNS (1),                   /* variable shift costs */
> +  COSTS_N_INSNS (1),                   /* constant shift costs */
> +  {COSTS_N_INSNS (3),                  /* cost of starting multiply for QI */
> +   COSTS_N_INSNS (4),                  /*                               HI */
> +   COSTS_N_INSNS (3),                  /*                               SI */
> +   COSTS_N_INSNS (4),                  /*                               DI */
> +   COSTS_N_INSNS (4)},                 /*                            other */
> +  0,                                   /* cost of multiply per each bit set */
> +  {COSTS_N_INSNS (16),                 /* cost of a divide/mod for QI */
> +   COSTS_N_INSNS (22),                 /*                          HI */
> +   COSTS_N_INSNS (30),                 /*                          SI */
> +   COSTS_N_INSNS (74),                 /*                          DI */
> +   COSTS_N_INSNS (74)},                /*                          other */
> +  COSTS_N_INSNS (1),                   /* cost of movsx */
> +  COSTS_N_INSNS (1),                   /* cost of movzx */
> +  8,                                   /* "large" insn */
> +  17,                                  /* MOVE_RATIO */
> +  17,                                  /* CLEAR_RATIO */
> +  {6, 6, 6},                           /* cost of loading integer registers
> +                                          in QImode, HImode and SImode.
> +                                          Relative to reg-reg move (2).  */
> +  {6, 6, 6},                           /* cost of storing integer registers */
> +  {6, 6, 6, 10, 15},                   /* cost of loading SSE register
> +                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
> +  {6, 6, 6, 10, 15},                   /* cost of storing SSE register
> +                                          in 32bit, 64bit, 128bit, 256bit and 512bit */
> +  {6, 6, 6, 10, 15},                   /* cost of unaligned loads.  */
> +  {6, 6, 6, 10, 15},                   /* cost of unaligned stores.  */
> +  2, 3, 4,                             /* cost of moving XMM,YMM,ZMM register */
> +  6,                                   /* cost of moving SSE register to integer.  */
> +  18, 6,                               /* Gather load static, per_elt.  */
> +  18, 6,                               /* Gather store static, per_elt.  */
> +  32,                                  /* size of l1 cache.  */
> +  512,                                 /* size of l2 cache.  */
> +  64,                                  /* size of prefetch block */
> +  6,                                   /* number of parallel prefetches */
> +  /* Benchmarks show large regressions on the K8 sixtrack benchmark when this
> +     value is increased to a perhaps more appropriate value of 5.  */
> +  3,                                   /* Branch cost */
> +  COSTS_N_INSNS (3),                   /* cost of FADD and FSUB insns.  */
> +  COSTS_N_INSNS (5),                   /* cost of FMUL instruction.  */
> +  COSTS_N_INSNS (17),                  /* cost of FDIV instruction.  */
> +  COSTS_N_INSNS (1),                   /* cost of FABS instruction.  */
> +  COSTS_N_INSNS (1),                   /* cost of FCHS instruction.  */
> +  COSTS_N_INSNS (14),                  /* cost of FSQRT instruction.  */
> +
> +  COSTS_N_INSNS (1),                   /* cost of cheap SSE instruction.  */
> +  COSTS_N_INSNS (3),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
> +  COSTS_N_INSNS (4),                   /* cost of MULSS instruction.  */
> +  COSTS_N_INSNS (5),                   /* cost of MULSD instruction.  */
> +  COSTS_N_INSNS (5),                   /* cost of FMA SS instruction.  */
> +  COSTS_N_INSNS (5),                   /* cost of FMA SD instruction.  */
> +  COSTS_N_INSNS (13),                  /* cost of DIVSS instruction.  */
> +  COSTS_N_INSNS (17),                  /* cost of DIVSD instruction.  */
> +  COSTS_N_INSNS (14),                  /* cost of SQRTSS instruction.  */
> +  COSTS_N_INSNS (18),                  /* cost of SQRTSD instruction.  */
> +  1, 4, 3, 3,                          /* reassoc int, fp, vec_int, vec_fp.  */
> +  tremont_memcpy,
> +  tremont_memset,
> +  COSTS_N_INSNS (4),                   /* cond_taken_branch_cost.  */
> +  COSTS_N_INSNS (2),                   /* cond_not_taken_branch_cost.  */
> +  "16:11:8",                           /* Loop alignment.  */
> +  "16:11:8",                           /* Jump alignment.  */
> +  "0:0:8",                             /* Label alignment.  */
> +  "16",                                        /* Func alignment.  */
> +};
> +
>  static stringop_algs intel_memcpy[2] = {
>    {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
>    {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
> diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> index 385e275bbd9..088edb6c4ca 100644
> --- a/gcc/config/i386/x86-tune.def
> +++ b/gcc/config/i386/x86-tune.def
> @@ -273,7 +273,7 @@ DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA)
>     move/set sequences of bytes with known size.  */
>  DEF_TUNE (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB,
>           "prefer_known_rep_movsb_stosb",
> -         m_SKYLAKE | m_ALDERLAKE | m_CORE_AVX512)
> +         m_SKYLAKE | m_ALDERLAKE | m_TREMONT | m_CORE_AVX512)
>
>  /* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of
>     compact prologues and epilogues by issuing a misaligned moves.  This
> --
> 2.17.1
>
