> The size of a pointer on 32-bit is only 4 rather than 8 bytes, so
> copying 32 pointers only requires half the number of AVX-512 load/store
> operations.
>
> Fixes: 5171b4ee6b6b ("net/i40e: optimize Tx by using AVX512")
> Cc: sta...@dpdk.org
>
> Signed-off-by: Bruce Richardson <bruce.richard...@intel.com>
> ---
>  drivers/net/i40e/i40e_rxtx_vec_avx512.c | 7 +++++++
>  1 file changed, 7 insertions(+)
>
> diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> index 0238b03f8a..3b2750221b 100644
> --- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> +++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
> @@ -799,6 +799,7 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
>  		uint32_t copied = 0;
>  		/* n is multiple of 32 */
>  		while (copied < n) {
> +#ifdef RTE_ARCH_64
>  			const __m512i a = _mm512_load_si512(&txep[copied]);
>  			const __m512i b = _mm512_load_si512(&txep[copied + 8]);
>  			const __m512i c = _mm512_load_si512(&txep[copied + 16]);
> @@ -808,6 +809,12 @@ i40e_tx_free_bufs_avx512(struct i40e_tx_queue *txq)
>  			_mm512_storeu_si512(&cache_objs[copied + 8], b);
>  			_mm512_storeu_si512(&cache_objs[copied + 16], c);
>  			_mm512_storeu_si512(&cache_objs[copied + 24], d);
> +#else
> +			const __m512i a = _mm512_load_si512(&txep[copied]);
> +			const __m512i b = _mm512_load_si512(&txep[copied + 16]);
> +			_mm512_storeu_si512(&cache_objs[copied], a);
> +			_mm512_storeu_si512(&cache_objs[copied + 16], b);
> +#endif
>  			copied += 32;
>  		}
>  		cache->len += n;
> --
> 2.43.0
Looks good to me, ACKED.

Thanks,
Ian