On Tue, Dec 6, 2022 at 11:01 AM Jan Hubicka via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > Hi > this patch updates cost of znver4 mostly based on data measured by Agner Fog. > Compared to previous generations x87 became a bit slower which is probably not > a big deal (and we have minimal benchmarking coverage for it). One interesting > improvement is reduction of FMA cost. I also updated costs of AVX256 > loads/stores based on latencies (not throughput which is twice of avx256). > Overall AVX512 vectorization seems to improve noticeably some of TSVC > benchmarks but since internally 512 vectors are split to 256 vectors it is > somewhat risky and does not win in SPEC scores (mostly by regressing > benchmarks > with loops that have small trip counts like x264 and exchange), so for now I am > going to set AVX256_OPTIMAL tune but I am still playing with it. We improved > since ZNVER1 on choosing vectorization size and also have vectorized > prologues/epilogues so it may be possible to make avx512 a small win overall. > > In general I would like to keep cost tables latency based unless we have > a good reason to not do so. There are some interesting differences in > znver3 tables that I also patched and seem performance neutral. I will > send that separately. > > Bootstrapped/regtested x86_64-linux, also benchmarked on SPEC2017 along > with AVX512 tuning. I plan to commit it tomorrow unless there are some > comments. > > Honza > > * x86-tune-costs.h (znver4_cost): Update costs of FP and SSE moves, > division, multiplication, gathers, L2 cache size, and more complex > FP instructions. > diff --git a/gcc/config/i386/x86-tune-costs.h > b/gcc/config/i386/x86-tune-costs.h > index f01b8ee9eef..3a6ce02f093 100644 > --- a/gcc/config/i386/x86-tune-costs.h > +++ b/gcc/config/i386/x86-tune-costs.h > @@ -1867,9 +1868,9 @@ struct processor_costs znver4_cost = { > {8, 8, 8}, /* cost of storing integer > registers. */ > 2, /* cost of reg,reg fld/fst. 
*/ > - {6, 6, 16}, /* cost of loading fp registers > + {14, 14, 17}, /* cost of loading fp > registers > in SFmode, DFmode and XFmode. */ > - {8, 8, 16}, /* cost of storing fp registers > + {12, 12, 16}, /* cost of storing fp > registers > in SFmode, DFmode and XFmode. */ > 2, /* cost of moving MMX register. */ > {6, 6}, /* cost of loading MMX registers > @@ -1878,13 +1879,13 @@ struct processor_costs znver4_cost = { > in SImode and DImode. */ > 2, 2, 3, /* cost of moving XMM,YMM,ZMM > register. */ > - {6, 6, 6, 6, 12}, /* cost of loading SSE registers > + {6, 6, 10, 10, 12}, /* cost of loading SSE registers > in 32,64,128,256 and 512-bit. */ > - {8, 8, 8, 8, 16}, /* cost of storing SSE registers > + {8, 8, 8, 12, 12}, /* cost of storing SSE registers > in 32,64,128,256 and 512-bit. */ > - 6, 6, /* SSE->integer and > integer->SSE > + 6, 8, /* SSE->integer and > integer->SSE > moves. */ > - 8, 8, /* mask->integer and integer->mask > moves */ > + 8, 8, /* mask->integer and > integer->mask moves */ > {6, 6, 6}, /* cost of loading mask register > in QImode, HImode, SImode. */ > {8, 8, 8}, /* cost if storing mask register > @@ -1894,6 +1895,7 @@ struct processor_costs znver4_cost = { > }, > > COSTS_N_INSNS (1), /* cost of an add instruction. */ > + /* TODO: Lea with 3 components has cost 2. */ > COSTS_N_INSNS (1), /* cost of a lea instruction. */ > COSTS_N_INSNS (1), /* variable shift costs. */ > COSTS_N_INSNS (1), /* constant shift costs. */ > @@ -1904,11 +1906,11 @@ struct processor_costs znver4_cost = { > COSTS_N_INSNS (3)}, /* other. */ > 0, /* cost of multiply per each bit > set. */ > - {COSTS_N_INSNS (9), /* cost of a divide/mod for QI. */ > - COSTS_N_INSNS (10), /* HI. */ > - COSTS_N_INSNS (12), /* SI. */ > - COSTS_N_INSNS (17), /* DI. */ > - COSTS_N_INSNS (17)}, /* > other. */ > + {COSTS_N_INSNS (12), /* cost of a divide/mod for QI. */ > + COSTS_N_INSNS (13), /* HI. */ > + COSTS_N_INSNS (13), /* SI. */ > + COSTS_N_INSNS (18), /* DI. 
*/ > + COSTS_N_INSNS (18)}, /* > other. */ > COSTS_N_INSNS (1), /* cost of movsx. */ > COSTS_N_INSNS (1), /* cost of movzx. */ > 8, /* "large" insn. */ > @@ -1919,22 +1921,22 @@ struct processor_costs znver4_cost = { > Relative to reg-reg move (2). */ > {8, 8, 8}, /* cost of storing integer > registers. */ > - {6, 6, 6, 6, 12}, /* cost of loading SSE registers > + {6, 6, 10, 10, 12}, /* cost of loading SSE registers > in 32bit, 64bit, 128bit, 256bit > and 512bit */ > - {8, 8, 8, 8, 16}, /* cost of storing SSE register > + {8, 8, 8, 12, 12}, /* cost of storing SSE register > in 32bit, 64bit, 128bit, 256bit > and 512bit */ > - {6, 6, 6, 6, 12}, /* cost of unaligned loads. */ > - {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ > - 2, 2, 3, /* cost of moving XMM,YMM,ZMM > + {6, 6, 6, 6, 6}, /* cost of unaligned loads. */ > + {8, 8, 8, 8, 8}, /* cost of unaligned stores. */ > + 2, 2, 2, /* cost of moving XMM,YMM,ZMM > register. */ > 6, /* cost of moving SSE register to > integer. */ > - /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops, > - throughput 9. Approx 7 uops do not depend on vector size and every load > - is 4 uops. */ > - 14, 8, /* Gather load static, per_elt. */ > - 14, 10, /* Gather store static, per_elt. */ > + /* VGATHERDPD is 17 uops and throughput is 4, VGATHERDPS is 24 uops, > + throughput 5. Approx 7 uops do not depend on vector size and every load > + is 5 uops. */ > + 14, 10, /* Gather load static, per_elt. */ > + 14, 20, /* Gather store static, per_elt. */ > 32, /* size of l1 cache. */ > - 512, /* size of l2 cache. */ > + 1024, /* size of l2 cache. */ > 64, /* size of prefetch block. */ > /* New AMD processors never drop prefetches; if they cannot be performed > immediately, they are queued. We set number of simultaneous prefetches > @@ -1943,26 +1945,26 @@ struct processor_costs znver4_cost = { > time). */ > 100, /* number of parallel prefetches. */ > 3, /* Branch cost. */ > - COSTS_N_INSNS (5), /* cost of FADD and FSUB insns. 
*/ > - COSTS_N_INSNS (5), /* cost of FMUL instruction. */ > + COSTS_N_INSNS (7), /* cost of FADD and FSUB insns. */ > + COSTS_N_INSNS (7), /* cost of FMUL instruction. */ > /* Latency of fdiv is 8-15. */ > COSTS_N_INSNS (15), /* cost of FDIV instruction. */ > COSTS_N_INSNS (1), /* cost of FABS instruction. */ > COSTS_N_INSNS (1), /* cost of FCHS instruction. */ > /* Latency of fsqrt is 4-10. */
The comment above ("Latency of fsqrt is 4-10") looks like it needs updating as well, since the FSQRT cost below changes from 10 to 25. > - COSTS_N_INSNS (10), /* cost of FSQRT instruction. */ > + COSTS_N_INSNS (25), /* cost of FSQRT instruction. */ > > COSTS_N_INSNS (1), /* cost of cheap SSE instruction. */ > COSTS_N_INSNS (3), /* cost of ADDSS/SD SUBSS/SD insns. > */ > COSTS_N_INSNS (3), /* cost of MULSS instruction. */ > COSTS_N_INSNS (3), /* cost of MULSD instruction. */ > - COSTS_N_INSNS (5), /* cost of FMA SS instruction. */ > - COSTS_N_INSNS (5), /* cost of FMA SD instruction. */ > - COSTS_N_INSNS (10), /* cost of DIVSS instruction. */ > + COSTS_N_INSNS (4), /* cost of FMA SS instruction. */ > + COSTS_N_INSNS (4), /* cost of FMA SD instruction. */ > + COSTS_N_INSNS (13), /* cost of DIVSS instruction. */ > /* 9-13. */ > COSTS_N_INSNS (13), /* cost of DIVSD instruction. */ > - COSTS_N_INSNS (10), /* cost of SQRTSS instruction. */ > - COSTS_N_INSNS (15), /* cost of SQRTSD instruction. */ > + COSTS_N_INSNS (15), /* cost of SQRTSS instruction. */ > + COSTS_N_INSNS (21), /* cost of SQRTSD instruction. */ > /* Zen can execute 4 integer operations per cycle. FP operations > take 3 cycles and it can execute 2 integer additions and 2 > multiplications thus reassociation may make sense up to with of 6.