Re: Zen4 tuning part 1 - cost tables

Richard Biener via Gcc-patches Tue, 06 Dec 2022 02:29:18 -0800

On Tue, Dec 6, 2022 at 11:01 AM Jan Hubicka via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi
> this patch updates cost of znver4 mostly based on data measured by Agner Fog.
> Compared to previous generations x87 became a bit slower, which is probably not
> a big deal (and we have minimal benchmarking coverage for it).  One interesting
> improvement is reduction of FMA cost.  I also updated costs of AVX256
> loads/stores  based on latencies (not throughput which is twice of avx256).
> Overall AVX512 vectorization seems to improve noticeably some of TSVC
> benchmarks but since internally 512 vectors are split to 256 vectors it is
> somewhat risky and does not win in SPEC scores (mostly by regressing benchmarks
> with loops that have small trip counts like x264 and exchange), so for now I am
> going to set AVX256_OPTIMAL tune but I am still playing with it.  We improved
> since ZNVER1 on choosing vectorization size and also have vectorized
> prologues/epilogues so it may be possible to make avx512 small win overall.
>
> In general I would like to keep cost tables latency based unless we have
> a good reason to not do so.  There are some interesting differences in
> znver3 tables that I also patched and seems performance neutral.  I will
> send that separately.
>
> Bootstrapped/regtested x86_64-linux, also benchmarked on SPEC2017 along
> with AVX512 tuning.  I plan to commit it tomorrow unless there are some
> comments.
>
> Honza
>
>         * x86-tune-costs.h (znver4_cost): Update costs of FP and SSE moves,
>         division, multiplication, gathers, L2 cache size, and more complex
>         FP instructions.
> diff --git a/gcc/config/i386/x86-tune-costs.h 
> b/gcc/config/i386/x86-tune-costs.h
> index f01b8ee9eef..3a6ce02f093 100644
> --- a/gcc/config/i386/x86-tune-costs.h
> +++ b/gcc/config/i386/x86-tune-costs.h
> @@ -1867,9 +1868,9 @@ struct processor_costs znver4_cost = {
>    {8, 8, 8},                           /* cost of storing integer
>                                            registers.  */
>    2,                                   /* cost of reg,reg fld/fst.  */
> -  {6, 6, 16},                          /* cost of loading fp registers
> +  {14, 14, 17},                                /* cost of loading fp 
> registers
>                                            in SFmode, DFmode and XFmode.  */
> -  {8, 8, 16},                          /* cost of storing fp registers
> +  {12, 12, 16},                                /* cost of storing fp 
> registers
>                                            in SFmode, DFmode and XFmode.  */
>    2,                                   /* cost of moving MMX register.  */
>    {6, 6},                              /* cost of loading MMX registers
> @@ -1878,13 +1879,13 @@ struct processor_costs znver4_cost = {
>                                            in SImode and DImode.  */
>    2, 2, 3,                             /* cost of moving XMM,YMM,ZMM
>                                            register.  */
> -  {6, 6, 6, 6, 12},                    /* cost of loading SSE registers
> +  {6, 6, 10, 10, 12},                  /* cost of loading SSE registers
>                                            in 32,64,128,256 and 512-bit.  */
> -  {8, 8, 8, 8, 16},                    /* cost of storing SSE registers
> +  {8, 8, 8, 12, 12},                   /* cost of storing SSE registers
>                                            in 32,64,128,256 and 512-bit.  */
> -  6, 6,                                        /* SSE->integer and 
> integer->SSE
> +  6, 8,                                        /* SSE->integer and 
> integer->SSE
>                                            moves.  */
> -  8, 8,                                /* mask->integer and integer->mask 
> moves */
> +  8, 8,                                        /* mask->integer and 
> integer->mask moves */
>    {6, 6, 6},                           /* cost of loading mask register
>                                            in QImode, HImode, SImode.  */
>    {8, 8, 8},                           /* cost if storing mask register
> @@ -1894,6 +1895,7 @@ struct processor_costs znver4_cost = {
>    },
>
>    COSTS_N_INSNS (1),                   /* cost of an add instruction.  */
> +  /* TODO: Lea with 3 components has cost 2.  */
>    COSTS_N_INSNS (1),                   /* cost of a lea instruction.  */
>    COSTS_N_INSNS (1),                   /* variable shift costs.  */
>    COSTS_N_INSNS (1),                   /* constant shift costs.  */
> @@ -1904,11 +1906,11 @@ struct processor_costs znver4_cost = {
>     COSTS_N_INSNS (3)},                 /*                      other.  */
>    0,                                   /* cost of multiply per each bit
>                                            set.  */
> -  {COSTS_N_INSNS (9),                  /* cost of a divide/mod for QI.  */
> -   COSTS_N_INSNS (10),                 /*                          HI.  */
> -   COSTS_N_INSNS (12),                 /*                          SI.  */
> -   COSTS_N_INSNS (17),                 /*                          DI.  */
> -   COSTS_N_INSNS (17)},                        /*                          
> other.  */
> +  {COSTS_N_INSNS (12),                 /* cost of a divide/mod for QI.  */
> +   COSTS_N_INSNS (13),                 /*                          HI.  */
> +   COSTS_N_INSNS (13),                 /*                          SI.  */
> +   COSTS_N_INSNS (18),                 /*                          DI.  */
> +   COSTS_N_INSNS (18)},                        /*                          
> other.  */
>    COSTS_N_INSNS (1),                   /* cost of movsx.  */
>    COSTS_N_INSNS (1),                   /* cost of movzx.  */
>    8,                                   /* "large" insn.  */
> @@ -1919,22 +1921,22 @@ struct processor_costs znver4_cost = {
>                                            Relative to reg-reg move (2).  */
>    {8, 8, 8},                           /* cost of storing integer
>                                            registers.  */
> -  {6, 6, 6, 6, 12},                    /* cost of loading SSE registers
> +  {6, 6, 10, 10, 12},                  /* cost of loading SSE registers
>                                            in 32bit, 64bit, 128bit, 256bit 
> and 512bit */
> -  {8, 8, 8, 8, 16},                    /* cost of storing SSE register
> +  {8, 8, 8, 12, 12},                   /* cost of storing SSE register
>                                            in 32bit, 64bit, 128bit, 256bit 
> and 512bit */
> -  {6, 6, 6, 6, 12},                    /* cost of unaligned loads.  */
> -  {8, 8, 8, 8, 16},                    /* cost of unaligned stores.  */
> -  2, 2, 3,                             /* cost of moving XMM,YMM,ZMM
> +  {6, 6, 6, 6, 6},                     /* cost of unaligned loads.  */
> +  {8, 8, 8, 8, 8},                     /* cost of unaligned stores.  */
> +  2, 2, 2,                             /* cost of moving XMM,YMM,ZMM
>                                            register.  */
>    6,                                   /* cost of moving SSE register to 
> integer.  */
> -  /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
> -     throughput 9.  Approx 7 uops do not depend on vector size and every load
> -     is 4 uops.  */
> -  14, 8,                               /* Gather load static, per_elt.  */
> -  14, 10,                              /* Gather store static, per_elt.  */
> +  /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops,
> +     throughput 5.  Approx 7 uops do not depend on vector size and every load
> +     is 5 uops.  */
> +  14, 10,                              /* Gather load static, per_elt.  */
> +  14, 20,                              /* Gather store static, per_elt.  */
>    32,                                  /* size of l1 cache.  */
> -  512,                                 /* size of l2 cache.  */
> +  1024,                                        /* size of l2 cache.  */
>    64,                                  /* size of prefetch block.  */
>    /* New AMD processors never drop prefetches; if they cannot be performed
>       immediately, they are queued.  We set number of simultaneous prefetches
> @@ -1943,26 +1945,26 @@ struct processor_costs znver4_cost = {
>       time).  */
>    100,                                 /* number of parallel prefetches.  */
>    3,                                   /* Branch cost.  */
> -  COSTS_N_INSNS (5),                   /* cost of FADD and FSUB insns.  */
> -  COSTS_N_INSNS (5),                   /* cost of FMUL instruction.  */
> +  COSTS_N_INSNS (7),                   /* cost of FADD and FSUB insns.  */
> +  COSTS_N_INSNS (7),                   /* cost of FMUL instruction.  */
>    /* Latency of fdiv is 8-15.  */
>    COSTS_N_INSNS (15),                  /* cost of FDIV instruction.  */
>    COSTS_N_INSNS (1),                   /* cost of FABS instruction.  */
>    COSTS_N_INSNS (1),                   /* cost of FCHS instruction.  */
>    /* Latency of fsqrt is 4-10.  */


The above comment ("Latency of fsqrt is 4-10") looks like it needs updating as
well, since the FSQRT cost below changes from 10 to 25.

> -  COSTS_N_INSNS (10),                  /* cost of FSQRT instruction.  */
> +  COSTS_N_INSNS (25),                  /* cost of FSQRT instruction.  */
>
>    COSTS_N_INSNS (1),                   /* cost of cheap SSE instruction.  */
>    COSTS_N_INSNS (3),                   /* cost of ADDSS/SD SUBSS/SD insns.  
> */
>    COSTS_N_INSNS (3),                   /* cost of MULSS instruction.  */
>    COSTS_N_INSNS (3),                   /* cost of MULSD instruction.  */
> -  COSTS_N_INSNS (5),                   /* cost of FMA SS instruction.  */
> -  COSTS_N_INSNS (5),                   /* cost of FMA SD instruction.  */
> -  COSTS_N_INSNS (10),                  /* cost of DIVSS instruction.  */
> +  COSTS_N_INSNS (4),                   /* cost of FMA SS instruction.  */
> +  COSTS_N_INSNS (4),                   /* cost of FMA SD instruction.  */
> +  COSTS_N_INSNS (13),                  /* cost of DIVSS instruction.  */
>    /* 9-13.  */
>    COSTS_N_INSNS (13),                  /* cost of DIVSD instruction.  */
> -  COSTS_N_INSNS (10),                  /* cost of SQRTSS instruction.  */
> -  COSTS_N_INSNS (15),                  /* cost of SQRTSD instruction.  */
> +  COSTS_N_INSNS (15),                  /* cost of SQRTSS instruction.  */
> +  COSTS_N_INSNS (21),                  /* cost of SQRTSD instruction.  */
>    /* Zen can execute 4 integer operations per cycle.  FP operations
>       take 3 cycles and it can execute 2 integer additions and 2
>       multiplications thus reassociation may make sense up to with of 6.

Re: Zen4 tuning part 1 - cost tables

Reply via email to