On Thu, 18 Oct 2018, Jan Hubicka wrote:

> > 
> > So like the following which removes the use of ix86_vec_cost
> > for SSE loads and stores since we have per-mode costs already.
> > I've applied the relevant factor to the individual cost tables
> > (noting that for X86_TUNE_SSE_SPLIT_REGS we only apply the
> > multiplication for size == 128, not size >= 128 ...)
> > 
> > There's a ??? hunk in inline_memory_move_cost where we
> > failed to apply the scaling thus in that place we'd now have
> > a behavior change.  Alternatively I could leave the cost
> > tables unaltered if that costing part is more critical than
> > the vectorizer one.
> 
> Changing the behaviour (applying the scale there) seems like the
> right way to go to me...
> > 
> > I've also spotted, when reviewing ix86_vec_cost uses, a bug
> > in ix86_rtx_cost which keys on SFmode which doesn't work
> > for SSE modes, thus use GET_MODE_INNER.
> > 
> > Also I've changed X86_TUNE_AVX128_OPTIMAL to also apply
> > to BTVER1 - everywhere else we glob BTVER1 and BTVER2 so
> > this must surely be an omission.
> 
> BTVER1 did not have AVX :)

Yeah - noticed that as well ;)

> > 
> > Honza - is a patch like this OK?
> 
> Looks OK to me.  Splitting up individual changes is up to you.
> I think it is not that dramatic change so hopefully regressions
> won't be that hard to analyze.

I'll commit the ix86_rtx_costs change separately from the load/store
cost changes just to make bisection easier.

> > 
> > Should I split out individual fixes to make bisection possible?
> > 
> > Should I update the cost tables or instead change the vectorizer
> > costing when considering the inline_memory_move_cost "issue"?
> 
> Looks like memory move cost should do the right thing now after your patch?
> Having larger loads/stores more expensive seems correct to me.

Yes.

> Patch is OK, without the ??? comment ;)

Thanks,
Richard.

> Honza
> > 
> > Thanks,
> > Richard.
> > 
> > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > index 0cf4152acb2..f5392232f61 100644
> > --- a/gcc/config/i386/i386.c
> > +++ b/gcc/config/i386/i386.c
> > @@ -39432,6 +39432,7 @@ inline_memory_move_cost (machine_mode mode, enum 
> > reg_class regclass,
> >        int index = sse_store_index (mode);
> >        if (index == -1)
> >     return 100;
> > +      /* ??? */
> >        if (in == 2)
> >          return MAX (ix86_cost->sse_load [index], ix86_cost->sse_store 
> > [index]);
> >        return in ? ix86_cost->sse_load [index] : ix86_cost->sse_store 
> > [index];
> > @@ -40183,7 +40181,8 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
> > outer_code_i, int opno,
> >          gcc_assert (TARGET_FMA || TARGET_FMA4 || TARGET_AVX512F);
> >  
> >          *total = ix86_vec_cost (mode,
> > -                           mode == SFmode ? cost->fmass : cost->fmasd,
> > +                           GET_MODE_INNER (mode) == SFmode
> > +                           ? cost->fmass : cost->fmasd,
> >                             true);
> >     *total += rtx_cost (XEXP (x, 1), mode, FMA, 1, speed);
> >  
> > @@ -45122,18 +45121,14 @@ ix86_builtin_vectorization_cost (enum 
> > vect_cost_for_stmt type_of_cost,
> >     /* See PR82713 - we may end up being called on non-vector type.  */
> >     if (index < 0)
> >       index = 2;
> > -        return ix86_vec_cost (mode,
> > -                         COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2,
> > -                         true);
> > +        return COSTS_N_INSNS (ix86_cost->sse_load[index]) / 2;
> >  
> >        case vector_store:
> >     index = sse_store_index (mode);
> >     /* See PR82713 - we may end up being called on non-vector type.  */
> >     if (index < 0)
> >       index = 2;
> > -        return ix86_vec_cost (mode,
> > -                         COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2,
> > -                         true);
> > +        return COSTS_N_INSNS (ix86_cost->sse_store[index]) / 2;
> >  
> >        case vec_to_scalar:
> >        case scalar_to_vec:
> > @@ -45146,20 +45141,14 @@ ix86_builtin_vectorization_cost (enum 
> > vect_cost_for_stmt type_of_cost,
> >     /* See PR82713 - we may end up being called on non-vector type.  */
> >     if (index < 0)
> >       index = 2;
> > -        return ix86_vec_cost (mode,
> > -                         COSTS_N_INSNS
> > -                            (ix86_cost->sse_unaligned_load[index]) / 2,
> > -                         true);
> > +        return COSTS_N_INSNS (ix86_cost->sse_unaligned_load[index]) / 2;
> >  
> >        case unaligned_store:
> >     index = sse_store_index (mode);
> >     /* See PR82713 - we may end up being called on non-vector type.  */
> >     if (index < 0)
> >       index = 2;
> > -        return ix86_vec_cost (mode,
> > -                         COSTS_N_INSNS
> > -                            (ix86_cost->sse_unaligned_store[index]) / 2,
> > -                         true);
> > +        return COSTS_N_INSNS (ix86_cost->sse_unaligned_store[index]) / 2;
> >  
> >        case vector_gather_load:
> >          return ix86_vec_cost (mode,
> > diff --git a/gcc/config/i386/x86-tune-costs.h 
> > b/gcc/config/i386/x86-tune-costs.h
> > index 9c8ae0a7841..59d0a8b17d0 100644
> > --- a/gcc/config/i386/x86-tune-costs.h
> > +++ b/gcc/config/i386/x86-tune-costs.h
> > @@ -795,12 +795,12 @@ struct processor_costs athlon_cost = {
> >    {4, 4},                          /* cost of storing MMX registers
> >                                        in SImode and DImode */
> >    2, 4, 8,                         /* cost of moving XMM,YMM,ZMM register 
> > */
> > -  {4, 4, 6, 12, 24},                       /* cost of loading SSE registers
> > +  {4, 4, 12, 12, 24},                      /* cost of loading SSE registers
> >                                        in 32,64,128,256 and 512-bit */
> > -  {4, 4, 6, 12, 24},                       /* cost of unaligned loads.  */
> > -  {4, 4, 5, 10, 20},                       /* cost of storing SSE registers
> > +  {4, 4, 12, 12, 24},                      /* cost of unaligned loads.  */
> > +  {4, 4, 10, 10, 20},                      /* cost of storing SSE registers
> >                                        in 32,64,128,256 and 512-bit */
> > -  {4, 4, 5, 10, 20},                       /* cost of unaligned stores.  */
> > +  {4, 4, 10, 10, 20},                      /* cost of unaligned stores.  */
> >    5, 5,                                    /* SSE->integer and 
> > integer->SSE moves */
> >    4, 4,                                    /* Gather load static, per_elt. 
> >  */
> >    4, 4,                                    /* Gather store static, 
> > per_elt.  */
> > @@ -891,12 +891,12 @@ struct processor_costs k8_cost = {
> >    {4, 4},                          /* cost of storing MMX registers
> >                                        in SImode and DImode */
> >    2, 4, 8,                         /* cost of moving XMM,YMM,ZMM register 
> > */
> > -  {4, 3, 6, 12, 24},                       /* cost of loading SSE registers
> > +  {4, 3, 12, 12, 24},                      /* cost of loading SSE registers
> >                                        in 32,64,128,256 and 512-bit */
> > -  {4, 3, 6, 12, 24},                       /* cost of unaligned loads.  */
> > -  {4, 4, 5, 10, 20},                       /* cost of storing SSE registers
> > +  {4, 3, 12, 12, 24},                      /* cost of unaligned loads.  */
> > +  {4, 4, 10, 10, 20},                      /* cost of storing SSE registers
> >                                        in 32,64,128,256 and 512-bit */
> > -  {4, 4, 5, 10, 20},                       /* cost of unaligned stores.  */
> > +  {4, 4, 10, 10, 20},                      /* cost of unaligned stores.  */
> >    5, 5,                                    /* SSE->integer and 
> > integer->SSE moves */
> >    4, 4,                                    /* Gather load static, per_elt. 
> >  */
> >    4, 4,                                    /* Gather store static, 
> > per_elt.  */
> > @@ -1100,12 +1100,12 @@ const struct processor_costs bdver1_cost = {
> >    {10, 10},                                /* cost of storing MMX registers
> >                                        in SImode and DImode */
> >    2, 4, 8,                         /* cost of moving XMM,YMM,ZMM register 
> > */
> > -  {12, 12, 10, 20, 30},                    /* cost of loading SSE registers
> > +  {12, 12, 10, 40, 60},                    /* cost of loading SSE registers
> >                                        in 32,64,128,256 and 512-bit */
> > -  {12, 12, 10, 20, 30},                    /* cost of unaligned loads.  */
> > -  {10, 10, 10, 20, 30},                    /* cost of storing SSE registers
> > +  {12, 12, 10, 40, 60},                    /* cost of unaligned loads.  */
> > +  {10, 10, 10, 40, 60},                    /* cost of storing SSE registers
> >                                        in 32,64,128,256 and 512-bit */
> > -  {10, 10, 10, 20, 30},                    /* cost of unaligned stores.  */
> > +  {10, 10, 10, 40, 60},                    /* cost of unaligned stores.  */
> >    16, 20,                          /* SSE->integer and integer->SSE moves 
> > */
> >    12, 12,                          /* Gather load static, per_elt.  */
> >    10, 10,                          /* Gather store static, per_elt.  */
> > @@ -1203,12 +1203,12 @@ const struct processor_costs bdver2_cost = {
> >    {10, 10},                                /* cost of storing MMX registers
> >                                        in SImode and DImode */
> >    2, 4, 8,                         /* cost of moving XMM,YMM,ZMM register 
> > */
> > -  {12, 12, 10, 20, 30},                    /* cost of loading SSE registers
> > +  {12, 12, 10, 40, 60},                    /* cost of loading SSE registers
> >                                        in 32,64,128,256 and 512-bit */
> > -  {12, 12, 10, 20, 30},                    /* cost of unaligned loads.  */
> > -  {10, 10, 10, 20, 30},                    /* cost of storing SSE registers
> > +  {12, 12, 10, 40, 60},                    /* cost of unaligned loads.  */
> > +  {10, 10, 10, 40, 60},                    /* cost of storing SSE registers
> >                                        in 32,64,128,256 and 512-bit */
> > -  {10, 10, 10, 20, 30},                    /* cost of unaligned stores.  */
> > +  {10, 10, 10, 40, 60},                    /* cost of unaligned stores.  */
> >    16, 20,                          /* SSE->integer and integer->SSE moves 
> > */
> >    12, 12,                          /* Gather load static, per_elt.  */
> >    10, 10,                          /* Gather store static, per_elt.  */
> > @@ -1305,12 +1305,12 @@ struct processor_costs bdver3_cost = {
> >    {10, 10},                                /* cost of storing MMX registers
> >                                        in SImode and DImode */
> >    2, 4, 8,                         /* cost of moving XMM,YMM,ZMM register 
> > */
> > -  {12, 12, 10, 20, 30},                    /* cost of loading SSE registers
> > +  {12, 12, 10, 40, 60},                    /* cost of loading SSE registers
> >                                        in 32,64,128,256 and 512-bit */
> > -  {12, 12, 10, 20, 30},                    /* cost of unaligned loads.  */
> > -  {10, 10, 10, 20, 30},                    /* cost of storing SSE registers
> > +  {12, 12, 10, 40, 60},                    /* cost of unaligned loads.  */
> > +  {10, 10, 10, 40, 60},                    /* cost of storing SSE registers
> >                                        in 32,64,128,256 and 512-bit */
> > -  {10, 10, 10, 20, 30},                    /* cost of unaligned stores.  */
> > +  {10, 10, 10, 40, 60},                    /* cost of unaligned stores.  */
> >    16, 20,                          /* SSE->integer and integer->SSE moves 
> > */
> >    12, 12,                          /* Gather load static, per_elt.  */
> >    10, 10,                          /* Gather store static, per_elt.  */
> > @@ -1406,12 +1406,12 @@ struct processor_costs bdver4_cost = {
> >    {10, 10},                                /* cost of storing MMX registers
> >                                        in SImode and DImode */
> >    2, 4, 8,                         /* cost of moving XMM,YMM,ZMM register 
> > */
> > -  {12, 12, 10, 20, 30},                    /* cost of loading SSE registers
> > +  {12, 12, 10, 40, 60},                    /* cost of loading SSE registers
> >                                        in 32,64,128,256 and 512-bit */
> > -  {12, 12, 10, 20, 30},                    /* cost of unaligned loads.  */
> > -  {10, 10, 10, 20, 30},                    /* cost of storing SSE registers
> > +  {12, 12, 10, 40, 60},                    /* cost of unaligned loads.  */
> > +  {10, 10, 10, 40, 60},                    /* cost of storing SSE registers
> >                                        in 32,64,128,256 and 512-bit */
> > -  {10, 10, 10, 20, 30},                    /* cost of unaligned stores.  */
> > +  {10, 10, 10, 40, 60},                    /* cost of unaligned stores.  */
> >    16, 20,                          /* SSE->integer and integer->SSE moves 
> > */
> >    12, 12,                          /* Gather load static, per_elt.  */
> >    10, 10,                          /* Gather store static, per_elt.  */
> > @@ -1518,12 +1518,12 @@ struct processor_costs znver1_cost = {
> >    {8, 8},                          /* cost of storing MMX registers
> >                                        in SImode and DImode.  */
> >    2, 3, 6,                         /* cost of moving XMM,YMM,ZMM register. 
> >  */
> > -  {6, 6, 6, 6, 12},                        /* cost of loading SSE registers
> > +  {6, 6, 6, 12, 24},                       /* cost of loading SSE registers
> >                                        in 32,64,128,256 and 512-bit.  */
> > -  {6, 6, 6, 6, 12},                        /* cost of unaligned loads.  */
> > -  {8, 8, 8, 8, 16},                        /* cost of storing SSE registers
> > +  {6, 6, 6, 12, 24},                       /* cost of unaligned loads.  */
> > +  {8, 8, 8, 16, 32},                       /* cost of storing SSE registers
> >                                        in 32,64,128,256 and 512-bit.  */
> > -  {8, 8, 8, 8, 16},                        /* cost of unaligned stores.  */
> > +  {8, 8, 8, 16, 32},                       /* cost of unaligned stores.  */
> >    6, 6,                                    /* SSE->integer and 
> > integer->SSE moves.  */
> >    /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> >       throughput 12.  Approx 9 uops do not depend on vector size and every 
> > load
> > @@ -1726,12 +1726,12 @@ const struct processor_costs btver1_cost = {
> >    {12, 12},                                /* cost of storing MMX registers
> >                                        in SImode and DImode */
> >    2, 4, 8,                         /* cost of moving XMM,YMM,ZMM register 
> > */
> > -  {10, 10, 12, 24, 48},                    /* cost of loading SSE registers
> > +  {10, 10, 12, 48, 96},                    /* cost of loading SSE registers
> >                                        in 32,64,128,256 and 512-bit */
> > -  {10, 10, 12, 24, 48},                    /* cost of unaligned loads.  */
> > -  {10, 10, 12, 24, 48},                    /* cost of storing SSE registers
> > +  {10, 10, 12, 48, 96},                    /* cost of unaligned loads.  */
> > +  {10, 10, 12, 48, 96},                    /* cost of storing SSE registers
> >                                        in 32,64,128,256 and 512-bit */
> > -  {10, 10, 12, 24, 48},                    /* cost of unaligned stores.  */
> > +  {10, 10, 12, 48, 96},                    /* cost of unaligned stores.  */
> >    14, 14,                          /* SSE->integer and integer->SSE moves 
> > */
> >    10, 10,                          /* Gather load static, per_elt.  */
> >    10, 10,                          /* Gather store static, per_elt.  */
> > @@ -1817,12 +1817,12 @@ const struct processor_costs btver2_cost = {
> >    {12, 12},                                /* cost of storing MMX registers
> >                                        in SImode and DImode */
> >    2, 4, 8,                         /* cost of moving XMM,YMM,ZMM register 
> > */
> > -  {10, 10, 12, 24, 48},                    /* cost of loading SSE registers
> > +  {10, 10, 12, 48, 96},                    /* cost of loading SSE registers
> >                                        in 32,64,128,256 and 512-bit */
> > -  {10, 10, 12, 24, 48},                    /* cost of unaligned loads.  */
> > -  {10, 10, 12, 24, 48},                    /* cost of storing SSE registers
> > +  {10, 10, 12, 48, 96},                    /* cost of unaligned loads.  */
> > +  {10, 10, 12, 48, 96},                    /* cost of storing SSE registers
> >                                        in 32,64,128,256 and 512-bit */
> > -  {10, 10, 12, 24, 48},                    /* cost of unaligned stores.  */
> > +  {10, 10, 12, 48, 96},                    /* cost of unaligned stores.  */
> >    14, 14,                          /* SSE->integer and integer->SSE moves 
> > */
> >    10, 10,                          /* Gather load static, per_elt.  */
> >    10, 10,                          /* Gather store static, per_elt.  */
> > diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> > index a46450ad99d..ad8661075aa 100644
> > --- a/gcc/config/i386/x86-tune.def
> > +++ b/gcc/config/i386/x86-tune.def
> > @@ -441,7 +441,7 @@ DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, 
> > "256_unaligned_store_optimal"
> >  
> >  /* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for
> >     the auto-vectorizer.  */
> > -DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2
> > +DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER
> >       | m_ZNVER1)
> >  
> >  /* X86_TUNE_AVX256_OPTIMAL: Use 256-bit AVX instructions instead of 
> > 512-bit AVX
> > 
> 
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 
21284 (AG Nuernberg)

Reply via email to