On Wed, Mar 17, 2021 at 10:46 PM Jan Hubicka <hubi...@ucw.cz> wrote:
>
> Hi,
> this patch enables gather on zen3 hardware.  For TSVC it gets used by 6
> benchmarks, with the following runtime improvements:
>
> s4114: 1.424 -> 1.209  (84.9017%)
> s4115: 2.021 -> 1.065  (52.6967%)
> s4116: 1.549 -> 0.854  (55.1323%)
> s4117: 1.386 -> 1.193  (86.075%)
> vag: 2.741 -> 1.940  (70.7771%)
>
> and one regression:
>
> s4112: 1.115 -> 1.184  (106.188%)
>
> In s4112 the internal loop is:
>
>         for (int i = 0; i < LEN_1D; i++) {
>             a[i] += b[ip[i]] * s;
>         }
>
> (so a standard multiply-and-accumulate with indirect addressing)
>
>   40a400:       c5 fe 6f 24 03          vmovdqu (%rbx,%rax,1),%ymm4
>   40a405:       c5 fc 28 da             vmovaps %ymm2,%ymm3
>   40a409:       48 83 c0 20             add    $0x20,%rax
>   40a40d:       c4 e2 65 92 04 a5 00    vgatherdps %ymm3,0x594100(,%ymm4,4),%ymm0
>   40a414:       41 59 00
>   40a417:       c4 e2 75 a8 80 e0 34    vfmadd213ps 0x5b34e0(%rax),%ymm1,%ymm0
>   40a41e:       5b 00
>   40a420:       c5 fc 29 80 e0 34 5b    vmovaps %ymm0,0x5b34e0(%rax)
>   40a427:       00
>   40a428:       48 3d 00 f4 01 00       cmp    $0x1f400,%rax
>   40a42e:       75 d0                   jne    40a400 <s4112+0x60>
>
> compared to:
>
>   40a280:       49 63 14 04             movslq (%r12,%rax,1),%rdx
>   40a284:       48 83 c0 04             add    $0x4,%rax
>   40a288:       c5 fa 10 04 95 00 41    vmovss 0x594100(,%rdx,4),%xmm0
>   40a28f:       59 00
>   40a291:       c4 e2 71 a9 80 fc 34    vfmadd213ss 0x5b34fc(%rax),%xmm1,%xmm0
>   40a298:       5b 00
>   40a29a:       c5 fa 11 80 fc 34 5b    vmovss %xmm0,0x5b34fc(%rax)
>   40a2a1:       00
>   40a2a2:       48 3d 00 f4 01 00       cmp    $0x1f400,%rax
>   40a2a8:       75 d6                   jne    40a280 <s4112+0x40>
>
> Looking at instruction latencies:
>
>  - fmadd is 4 cycles
>  - vgatherdps is 39
>
> So the vgather itself takes about 4.8 cycles per iteration, and the CPU is
> probably able to execute the rest out of order, getting close to 4 cycles per
> iteration (it can do 2 loads in parallel, one store, and the rest fits easily
> into its execution resources).  That would explain the 20% slowdown.
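>
> To make that explicit (back-of-the-envelope arithmetic only, assuming the
> gather dominates the vector body and everything else overlaps with it):
>
>   /* Rough per-scalar-iteration estimates for the two loops above.  */
>   double vec_cycles  = 39.0 / 8;  /* one 39 cycle vgatherdps per 8 lanes,
>                                      i.e. ~4.8 cycles per element.  */
>   double scal_cycles = 4.0;       /* fmadd latency; independent iterations
>                                      overlap in the out-of-order core.  */
>   /* vec_cycles / scal_cycles is ~1.2, matching the ~20% slowdown.  */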
>
> gimple internal loop is:
>   _2 = a[i_38];
>   _3 = (long unsigned int) i_38;
>   _4 = _3 * 4;
>   _5 = ip_18 + _4;
>   _6 = *_5;
>   _7 = b[_6];
>   _8 = _7 * s_19;
>   _9 = _2 + _8;
>   a[i_38] = _9;
>   i_28 = i_38 + 1;
>   ivtmp_52 = ivtmp_53 - 1;
>   if (ivtmp_52 != 0)
>     goto <bb 8>; [98.99%]
>   else
>     goto <bb 4>; [1.01%]
>
> 0x25bac30 a[i_38] 1 times scalar_load costs 12 in body
> 0x25bac30 *_5 1 times scalar_load costs 12 in body
> 0x25bac30 b[_6] 1 times scalar_load costs 12 in body
> 0x25bac30 _7 * s_19 1 times scalar_stmt costs 12 in body
> 0x25bac30 _2 + _8 1 times scalar_stmt costs 12 in body
> 0x25bac30 _9 1 times scalar_store costs 16 in body
>
> so a 19 cycle estimate for the scalar loop body
>
> 0x2668630 a[i_38] 1 times vector_load costs 12 in body
> 0x2668630 *_5 1 times unaligned_load (misalign -1) costs 12 in body
> 0x2668630 b[_6] 8 times scalar_load costs 96 in body
> 0x2668630 _7 * s_19 1 times scalar_to_vec costs 4 in prologue
> 0x2668630 _7 * s_19 1 times vector_stmt costs 12 in body
> 0x2668630 _2 + _8 1 times vector_stmt costs 12 in body
> 0x2668630 _9 1 times vector_store costs 16 in body
>
> so a 40 cycle estimate per 8x vectorized body
>
> tsvc.c:3450:27: note:  operating only on full vectors.
> tsvc.c:3450:27: note:  Cost model analysis:
>   Vector inside of loop cost: 160
>   Vector prologue cost: 4
>   Vector epilogue cost: 0
>   Scalar iteration cost: 76
>   Scalar outside cost: 0
>   Vector outside cost: 4
>   prologue iterations: 0
>   epilogue iterations: 0
>   Calculated minimum iters for profitability: 1
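>
> Summing the per-statement dump lines above reproduces these totals (a sketch
> of the arithmetic only; the division by 4 assumes 4 cost units per cycle,
> which matches the cycle estimates above):
>
>   int scalar_body = 12 + 12 + 12 + 12 + 12 + 16;  /* = 76,  ~19 cycles/iter */
>   int vector_body = 12 + 12 + 96 + 12 + 12 + 16;  /* = 160, ~40 cycles per
>                                                      8 iters, ~5 cycles/iter */
>   /* 19 vs. 5 cycles per iteration: the model predicts a ~3.8x speedup,
>      hence "minimum iters for profitability: 1", while the measurement
>      shows a 6% slowdown.  */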
>
> I think this generally suffers from the GIGO (garbage in, garbage out)
> principle.  One problem seems to be that we do not know about fmadd yet and
> cost it as two instructions (6 cycles instead of 4).  A more important
> problem is that we do not account for the parallelism at all.  I do not see
> how to disable the vectorization here without bumping gather costs noticeably
> off reality, so we can probably experiment with this if more similar problems
> are found.
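>
> Concretely, in the dump above the multiply and the add are costed as two
> separate vector_stmts (12 + 12 = 24 units, i.e. 6 cycles at 4 units per
> cycle), while the hardware fuses them into a single 4 cycle vfmadd213ps:
>
>   int separate_mul_add = (12 + 12) / 4;  /* costed as 6 cycles */
>   int fused_fmadd      = 4;              /* actual fmadd latency */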

Yep.  Vectorizer costing is really hard without modeling the CPU pipeline more
accurately, especially for the scalar side of the code, where modern CPUs can
often effectively do two-lane "vectorization" by executing two lanes in
parallel.  At the moment we simply assume a single-issue pipeline.  Doing
better requires tracking dependences, but the current vectorizer costing API
does not expose them to the target, so even rough estimates (like assuming an
issue width of two) are hard to come by.  My current plan is not to revisit
this as long as we have both SLP and non-SLP data structures.
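
To illustrate the kind of rough estimate I mean, here is a purely hypothetical
sketch (not the actual costing API): scale the naive single-issue cost sum by
an assumed issue width, pretending all statements are independent.

  /* Toy model: divide the summed statement cost by an assumed issue
     width.  Without dependence information this over-corrects for
     chains like the fmadd accumulation above, which is why the current
     API cannot support even this rough estimate.  */
  static int
  estimate_body_cycles (const int *stmt_cost, int n_stmts, int issue_width)
  {
    int sum = 0;
    for (int i = 0; i < n_stmts; i++)
      sum += stmt_cost[i];
    return (sum + issue_width - 1) / issue_width;
  }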

> Icc also uses gather in s1115 and s128.
> For s1115 the vectorization does not seem to help, and s128 gets slower.
>
> Neither Clang nor AOCC uses gathers.
>
> Honza
>
>         * x86-tune-costs.h (znver3_cost): Update costs of gather to match reality.
>         * x86-tune.def (X86_TUNE_USE_GATHER): Enable for znver3.
>
> diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
> index e655e668c7a..db03738313e 100644
> --- a/gcc/config/i386/x86-tune-costs.h
> +++ b/gcc/config/i386/x86-tune-costs.h
> @@ -1767,11 +1767,11 @@ struct processor_costs znver3_cost = {
>    2, 2, 3,                             /* cost of moving XMM,YMM,ZMM
>                                            register.  */
>   6,                                   /* cost of moving SSE register to integer.  */
> -  /* VGATHERDPD is 23 uops and throughput is 9, VGATHERDPD is 35 uops,
> -     throughput 12.  Approx 9 uops do not depend on vector size and every load
> -     is 7 uops.  */
> -  18, 8,                               /* Gather load static, per_elt.  */
> -  18, 10,                              /* Gather store static, per_elt.  */
> +  /* VGATHERDPD is 15 uops and throughput is 4, VGATHERDPS is 23 uops,
> +     throughput 9.  Approx 7 uops do not depend on vector size and every load
> +     is 4 uops.  */
> +  14, 8,                               /* Gather load static, per_elt.  */
> +  14, 10,                              /* Gather store static, per_elt.  */
>    32,                                  /* size of l1 cache.  */
>    512,                                 /* size of l2 cache.  */
>    64,                                  /* size of prefetch block.  */
> diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> index 140ccb3d921..caebf76736e 100644
> --- a/gcc/config/i386/x86-tune.def
> +++ b/gcc/config/i386/x86-tune.def
> @@ -436,7 +436,7 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes",
>
>  /* X86_TUNE_USE_GATHER: Use gather instructions.  */
>  DEF_TUNE (X86_TUNE_USE_GATHER, "use_gather",
> -         ~(m_ZNVER | m_GENERIC))
> +         ~(m_ZNVER1 | m_ZNVER2 | m_GENERIC))
>
>  /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
>     smaller FMA chain.  */
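
For reference, my understanding is that these two tuned fields end up combined
by the i386 cost hooks roughly as static + per_elt * lanes (a sketch of the
scaling under that assumption, not a quote of the backend):

  /* Hypothetical illustration of how the new znver3 gather load numbers
     (static 14, per_elt 8) would scale with vector width.  */
  int v8sf_gather = 14 + 8 * 8;  /* 8-lane vgatherdps: 78 cost units */
  int v4df_gather = 14 + 8 * 4;  /* 4-lane vgatherdpd: 46 cost units */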
