On Thu, 18 Oct 2018, Jan Hubicka wrote: > > > > So like the following which removes the use of ix86_vec_cost > > for SSE loads and stores since we have per-mode costs already. > > I've applied the relevant factor to the individual cost tables > > (noting that for X86_TUNE_SSE_SPLIT_REGS we only apply the > > multiplication for size == 128, not size >= 128 ...) > > > > There's a ??? hunk in inline_memory_move_cost where we > > failed to apply the scaling thus in that place we'd now have > > a behavior change. Alternatively I could leave the cost > > tables unaltered if that costing part is more critical than > > the vectorizer one. > > Changing the behaviour (applying the scale there) seems like > right way to go to me... > > > > I've also spotted, when reviewing ix86_vec_cost uses, a bug > > in ix86_rtx_cost which keys on SFmode which doesn't work > > for SSE modes, thus use GET_MODE_INNER. > > > > Also I've changed X86_TUNE_AVX128_OPTIMAL to also apply > > to BTVER1 - everywhere else we glob BTVER1 and BTVER2 so > > this must surely be a omission. > > BTVER1 did not have AVX :)
Yeah - noticed that as well ;) > > > > Honza - is a patch like this OK? > > Looks OK to me. Splitting up individual changes is up to you. > I think it is not that dramatic change so hopefully regressions > won't be that hard to analyze. I'll commit the ix86_rtx_costs change separately from the load/store cost changes just to make bisection easier. > > > > Should I split out individual fixes to make bisection possible? > > > > Should I update the cost tables or instead change the vectorizer > > costing when considering the inline_memory_move_cost "issue"? > > Looks like memory move cost should do the right thing now after your patch? > Having larger loads/stores more expensive seems correct to me. Yes. > Patch is OK, without the ??? comment ;) Thanks, Richard. > Honza > > > > Thanks, > > Richard. > > > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > > index 0cf4152acb2..f5392232f61 100644 > > --- a/gcc/config/i386/i386.c > > +++ b/gcc/config/i386/i386.c > > @@ -39432,6 +39432,7 @@ inline_memory_move_cost (machine_mode mode, enum > > reg_class regclass, > > int index = sse_store_index (mode); > > if (index == -1) > > return 100; > > + /* ??? */ > > if (in == 2) > > return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store > > [index]); > > return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store > > [index]; > > @@ -40183,7 +40181,8 @@ ix86_rtx_costs (rtx x, machine_mode mode, int > > outer_code_i, int opno, > > gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F); > > > > *total = ix86_vec_cost (mode, > > - mode == SFmode ? cost->fmass : cost->fmasd, > > + GET_MODE_INNER (mode) == SFmode > > + ? cost->fmass : cost->fmasd, > > true); > > *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed); > > > > @@ -45122,18 +45121,14 @@ ix86_builtin_vectorization_cost (enum > > vect_cost_for_stmt type_of_cost, > > /* See PR82713 - we may end up being called on non-vector type. */ > > if (index < 0) > > index = 2; > > - return ix86_vec_cost (mode, > > - COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2, > > - true); > > + return COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2; > > > > case vector_store: > > index = sse_store_index (mode); > > /* See PR82713 - we may end up being called on non-vector type. */ > > if (index < 0) > > index = 2; > > - return ix86_vec_cost (mode, > > - COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2, > > - true); > > + return COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2; > > > > case vec_to_scalar: > > case scalar_to_vec: > > @@ -45146,20 +45141,14 @@ ix86_builtin_vectorization_cost (enum > > vect_cost_for_stmt type_of_cost, > > /* See PR82713 - we may end up being called on non-vector type. */ > > if (index < 0) > > index = 2; > > - return ix86_vec_cost (mode, > > - COSTS_N_INSNS > > - (ix86_cost->sse_unaligned_load[index]) / 2, > > - true); > > + return COSTS_N_INSNS (ix86_cost->sse_unaligned_load[index]) / 2; > > > > case unaligned_store: > > index = sse_store_index (mode); > > /* See PR82713 - we may end up being called on non-vector type. */ > > if (index < 0) > > index = 2; > > - return ix86_vec_cost (mode, > > - COSTS_N_INSNS > > - (ix86_cost->sse_unaligned_store[index]) / 2, > > - true); > > + return COSTS_N_INSNS (ix86_cost->sse_unaligned_store[index]) / 2; > > > > case vector_gather_load: > > return ix86_vec_cost (mode, > > diff --git a/gcc/config/i386/x86-tune-costs.h > > b/gcc/config/i386/x86-tune-costs.h > > index 9c8ae0a7841..59d0a8b17d0 100644 > > --- a/gcc/config/i386/x86-tune-costs.h > > +++ b/gcc/config/i386/x86-tune-costs.h > > @@ -795,12 +795,12 @@ struct processor_costs athlon_cost = { > > {4, 4}, /* cost of storing MMX registers > > in SImode and DImode */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register > > */ > > - {4, 4, 6, 12, 24}, /* cost of loading SSE registers > > + {4, 4, 12, 12, 24}, /* cost of loading SSE registers > > in 32,64,128,256 and 512-bit */ > > - {4, 4, 6, 12, 24}, /* cost of unaligned loads. */ > > - {4, 4, 5, 10, 20}, /* cost of storing SSE registers > > + {4, 4, 12, 12, 24}, /* cost of unaligned loads. */ > > + {4, 4, 10, 10, 20}, /* cost of storing SSE registers > > in 32,64,128,256 and 512-bit */ > > - {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ > > + {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ > > 5, 5, /* SSE->integer and > > integer->SSE moves */ > > 4, 4, /* Gather load static, per_elt. > > */ > > 4, 4, /* Gather store static, > > per_elt. */ > > @@ -891,12 +891,12 @@ struct processor_costs k8_cost = { > > {4, 4}, /* cost of storing MMX registers > > in SImode and DImode */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register > > */ > > - {4, 3, 6, 12, 24}, /* cost of loading SSE registers > > + {4, 3, 12, 12, 24}, /* cost of loading SSE registers > > in 32,64,128,256 and 512-bit */ > > - {4, 3, 6, 12, 24}, /* cost of unaligned loads. */ > > - {4, 4, 5, 10, 20}, /* cost of storing SSE registers > > + {4, 3, 12, 12, 24}, /* cost of unaligned loads. */ > > + {4, 4, 10, 10, 20}, /* cost of storing SSE registers > > in 32,64,128,256 and 512-bit */ > > - {4, 4, 5, 10, 20}, /* cost of unaligned stores. */ > > + {4, 4, 10, 10, 20}, /* cost of unaligned stores. */ > > 5, 5, /* SSE->integer and > > integer->SSE moves */ > > 4, 4, /* Gather load static, per_elt. > > */ > > 4, 4, /* Gather store static, > > per_elt. */ > > @@ -1100,12 +1100,12 @@ const struct processor_costs bdver1_cost = { > > {10, 10}, /* cost of storing MMX registers > > in SImode and DImode */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register > > */ > > - {12, 12, 10, 20, 30}, /* cost of loading SSE registers > > + {12, 12, 10, 40, 60}, /* cost of loading SSE registers > > in 32,64,128,256 and 512-bit */ > > - {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ > > - {10, 10, 10, 20, 30}, /* cost of storing SSE registers > > + {12, 12, 10, 40, 60}, /* cost of unaligned loads. */ > > + {10, 10, 10, 40, 60}, /* cost of storing SSE registers > > in 32,64,128,256 and 512-bit */ > > - {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ > > + {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ > > 16, 20, /* SSE->integer and integer->SSE moves > > */ > > 12, 12, /* Gather load static, per_elt. */ > > 10, 10, /* Gather store static, per_elt. */ > > @@ -1203,12 +1203,12 @@ const struct processor_costs bdver2_cost = { > > {10, 10}, /* cost of storing MMX registers > > in SImode and DImode */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register > > */ > > - {12, 12, 10, 20, 30}, /* cost of loading SSE registers > > + {12, 12, 10, 40, 60}, /* cost of loading SSE registers > > in 32,64,128,256 and 512-bit */ > > - {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ > > - {10, 10, 10, 20, 30}, /* cost of storing SSE registers > > + {12, 12, 10, 40, 60}, /* cost of unaligned loads. */ > > + {10, 10, 10, 40, 60}, /* cost of storing SSE registers > > in 32,64,128,256 and 512-bit */ > > - {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ > > + {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ > > 16, 20, /* SSE->integer and integer->SSE moves > > */ > > 12, 12, /* Gather load static, per_elt. */ > > 10, 10, /* Gather store static, per_elt. */ > > @@ -1305,12 +1305,12 @@ struct processor_costs bdver3_cost = { > > {10, 10}, /* cost of storing MMX registers > > in SImode and DImode */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register > > */ > > - {12, 12, 10, 20, 30}, /* cost of loading SSE registers > > + {12, 12, 10, 40, 60}, /* cost of loading SSE registers > > in 32,64,128,256 and 512-bit */ > > - {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ > > - {10, 10, 10, 20, 30}, /* cost of storing SSE registers > > + {12, 12, 10, 40, 60}, /* cost of unaligned loads. */ > > + {10, 10, 10, 40, 60}, /* cost of storing SSE registers > > in 32,64,128,256 and 512-bit */ > > - {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ > > + {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ > > 16, 20, /* SSE->integer and integer->SSE moves > > */ > > 12, 12, /* Gather load static, per_elt. */ > > 10, 10, /* Gather store static, per_elt. */ > > @@ -1406,12 +1406,12 @@ struct processor_costs bdver4_cost = { > > {10, 10}, /* cost of storing MMX registers > > in SImode and DImode */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register > > */ > > - {12, 12, 10, 20, 30}, /* cost of loading SSE registers > > + {12, 12, 10, 40, 60}, /* cost of loading SSE registers > > in 32,64,128,256 and 512-bit */ > > - {12, 12, 10, 20, 30}, /* cost of unaligned loads. */ > > - {10, 10, 10, 20, 30}, /* cost of storing SSE registers > > + {12, 12, 10, 40, 60}, /* cost of unaligned loads. */ > > + {10, 10, 10, 40, 60}, /* cost of storing SSE registers > > in 32,64,128,256 and 512-bit */ > > - {10, 10, 10, 20, 30}, /* cost of unaligned stores. */ > > + {10, 10, 10, 40, 60}, /* cost of unaligned stores. */ > > 16, 20, /* SSE->integer and integer->SSE moves > > */ > > 12, 12, /* Gather load static, per_elt. */ > > 10, 10, /* Gather store static, per_elt. */ > > @@ -1518,12 +1518,12 @@ struct processor_costs znver1_cost = { > > {8, 8}, /* cost of storing MMX registers > > in SImode and DImode. */ > > 2, 3, 6, /* cost of moving XMM,YMM,ZMM register. > > */ > > - {6, 6, 6, 6, 12}, /* cost of loading SSE registers > > + {6, 6, 6, 12, 24}, /* cost of loading SSE registers > > in 32,64,128,256 and 512-bit. */ > > - {6, 6, 6, 6, 12}, /* cost of unaligned loads. */ > > - {8, 8, 8, 8, 16}, /* cost of storing SSE registers > > + {6, 6, 6, 12, 24}, /* cost of unaligned loads. */ > > + {8, 8, 8, 16, 32}, /* cost of storing SSE registers > > in 32,64,128,256 and 512-bit. */ > > - {8, 8, 8, 8, 16}, /* cost of unaligned stores. */ > > + {8, 8, 8, 16, 32}, /* cost of unaligned stores. */ > > 6, 6, /* SSE->integer and > > integer->SSE moves. */ > > /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops, > > throughput 12. Approx 9 uops do not depend on vector size and every > > load > > @@ -1726,12 +1726,12 @@ const struct processor_costs btver1_cost = { > > {12, 12}, /* cost of storing MMX registers > > in SImode and DImode */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register > > */ > > - {10, 10, 12, 24, 48}, /* cost of loading SSE registers > > + {10, 10, 12, 48, 96}, /* cost of loading SSE registers > > in 32,64,128,256 and 512-bit */ > > - {10, 10, 12, 24, 48}, /* cost of unaligned loads. */ > > - {10, 10, 12, 24, 48}, /* cost of storing SSE registers > > + {10, 10, 12, 48, 96}, /* cost of unaligned loads. */ > > + {10, 10, 12, 48, 96}, /* cost of storing SSE registers > > in 32,64,128,256 and 512-bit */ > > - {10, 10, 12, 24, 48}, /* cost of unaligned stores. */ > > + {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ > > 14, 14, /* SSE->integer and integer->SSE moves > > */ > > 10, 10, /* Gather load static, per_elt. */ > > 10, 10, /* Gather store static, per_elt. */ > > @@ -1817,12 +1817,12 @@ const struct processor_costs btver2_cost = { > > {12, 12}, /* cost of storing MMX registers > > in SImode and DImode */ > > 2, 4, 8, /* cost of moving XMM,YMM,ZMM register > > */ > > - {10, 10, 12, 24, 48}, /* cost of loading SSE registers > > + {10, 10, 12, 48, 96}, /* cost of loading SSE registers > > in 32,64,128,256 and 512-bit */ > > - {10, 10, 12, 24, 48}, /* cost of unaligned loads. */ > > - {10, 10, 12, 24, 48}, /* cost of storing SSE registers > > + {10, 10, 12, 48, 96}, /* cost of unaligned loads. */ > > + {10, 10, 12, 48, 96}, /* cost of storing SSE registers > > in 32,64,128,256 and 512-bit */ > > - {10, 10, 12, 24, 48}, /* cost of unaligned stores. */ > > + {10, 10, 12, 48, 96}, /* cost of unaligned stores. */ > > 14, 14, /* SSE->integer and integer->SSE moves > > */ > > 10, 10, /* Gather load static, per_elt. */ > > 10, 10, /* Gather store static, per_elt. */ > > diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def > > index a46450ad99d..ad8661075aa 100644 > > --- a/gcc/config/i386/x86-tune.def > > +++ b/gcc/config/i386/x86-tune.def > > @@ -441,7 +441,7 @@ DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, > > "256_unaligned_store_optimal" > > > > /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for > > the auto-vectorizer. */ > > -DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2 > > +DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER > > | m_ZNVER1) > > > > /* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of > > 512-bit AVX > > > > -- Richard Biener <rguent...@suse.de> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)