[PATCH v2 2/2] common/idpf: enable AVX2 for single queue Tx

2023-12-06 Thread Wenzhuo Lu
Some CPUs don't support AVX512. Enable AVX2 on them to
get better per-core performance.

Signed-off-by: Wenzhuo Lu 
---
 doc/guides/rel_notes/release_24_03.rst  |   3 +
 drivers/common/idpf/idpf_common_device.h|   1 +
 drivers/common/idpf/idpf_common_rxtx.h  |   4 +
 drivers/common/idpf/idpf_common_rxtx_avx2.c | 225 
 drivers/common/idpf/version.map |   1 +
 drivers/net/idpf/idpf_rxtx.c|  14 ++
 6 files changed, 248 insertions(+)

diff --git a/doc/guides/rel_notes/release_24_03.rst b/doc/guides/rel_notes/release_24_03.rst
index e9c9717706..08c8ee07c3 100644
--- a/doc/guides/rel_notes/release_24_03.rst
+++ b/doc/guides/rel_notes/release_24_03.rst
@@ -55,6 +55,9 @@ New Features
  Also, make sure to start the actual text at the margin.
  ===
 
+   * **Added support of vector instructions on IDPF.**
+
+ Added support of AVX2 instructions in IDPF single queue RX and TX path.
 
 Removed Items
 -
diff --git a/drivers/common/idpf/idpf_common_device.h b/drivers/common/idpf/idpf_common_device.h
index afe3d48798..60f8cab53a 100644
--- a/drivers/common/idpf/idpf_common_device.h
+++ b/drivers/common/idpf/idpf_common_device.h
@@ -115,6 +115,7 @@ struct idpf_vport {
bool rx_vec_allowed;
bool tx_vec_allowed;
bool rx_use_avx2;
+   bool tx_use_avx2;
bool rx_use_avx512;
bool tx_use_avx512;
 
diff --git a/drivers/common/idpf/idpf_common_rxtx.h b/drivers/common/idpf/idpf_common_rxtx.h
index 4d64063718..a92d328313 100644
--- a/drivers/common/idpf/idpf_common_rxtx.h
+++ b/drivers/common/idpf/idpf_common_rxtx.h
@@ -306,5 +306,9 @@ __rte_internal
 uint16_t idpf_dp_singleq_recv_pkts_avx2(void *rx_queue,
struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
+__rte_internal
+uint16_t idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue,
+   struct rte_mbuf **tx_pkts,
+   uint16_t nb_pkts);
 
 #endif /* _IDPF_COMMON_RXTX_H_ */
diff --git a/drivers/common/idpf/idpf_common_rxtx_avx2.c b/drivers/common/idpf/idpf_common_rxtx_avx2.c
index 02ce0534c4..9560999c5e 100644
--- a/drivers/common/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/common/idpf/idpf_common_rxtx_avx2.c
@@ -588,3 +588,228 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 {
	return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);
 }
+
+static __rte_always_inline void
+idpf_tx_backlog_entry(struct idpf_tx_entry *txep,
+struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+   int i;
+
+   for (i = 0; i < (int)nb_pkts; ++i)
+   txep[i].mbuf = tx_pkts[i];
+}
+
+static __rte_always_inline int
+idpf_singleq_tx_free_bufs_vec(struct idpf_tx_queue *txq)
+{
+   struct idpf_tx_entry *txep;
+   uint32_t n;
+   uint32_t i;
+   int nb_free = 0;
+   struct rte_mbuf *m, *free[txq->rs_thresh];
+
+   /* check DD bits on threshold descriptor */
+   if ((txq->tx_ring[txq->next_dd].qw1 &
+   rte_cpu_to_le_64(IDPF_TXD_QW1_DTYPE_M)) !=
+   rte_cpu_to_le_64(IDPF_TX_DESC_DTYPE_DESC_DONE))
+   return 0;
+
+   n = txq->rs_thresh;
+
+	/* first buffer to free from S/W ring is at index
+	 * next_dd - (rs_thresh-1)
+	 */
+   txep = &txq->sw_ring[txq->next_dd - (n - 1)];
+   m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
+   if (likely(m)) {
+   free[0] = m;
+   nb_free = 1;
+   for (i = 1; i < n; i++) {
+   m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+   if (likely(m)) {
+   if (likely(m->pool == free[0]->pool)) {
+   free[nb_free++] = m;
+   } else {
+   rte_mempool_put_bulk(free[0]->pool,
+(void *)free,
+nb_free);
+   free[0] = m;
+   nb_free = 1;
+   }
+   }
+   }
+   rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
+   } else {
+   for (i = 1; i < n; i++) {
+   m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+   if (m)
+   rte_mempool_put(m->pool, m);
+   }
+   }
+
+   /* buffers were freed, update counters */
+   txq->nb_free = (uint16_t)(txq->nb_free + txq->rs_thresh);
+   txq->next_dd = (uint16_t)(txq->next_dd + txq->rs_thresh);

[PATCH v2 1/2] common/idpf: enable AVX2 for single queue Rx

2023-12-06 Thread Wenzhuo Lu
Some CPUs don't support AVX512. Enable AVX2 on them to
get better per-core performance.

Signed-off-by: Wenzhuo Lu 
---
 drivers/common/idpf/idpf_common_device.h|   1 +
 drivers/common/idpf/idpf_common_rxtx.h  |   4 +
 drivers/common/idpf/idpf_common_rxtx_avx2.c | 590 
 drivers/common/idpf/meson.build |  16 +
 drivers/common/idpf/version.map |   1 +
 drivers/net/idpf/idpf_rxtx.c|  12 +
 6 files changed, 624 insertions(+)
 create mode 100644 drivers/common/idpf/idpf_common_rxtx_avx2.c

diff --git a/drivers/common/idpf/idpf_common_device.h b/drivers/common/idpf/idpf_common_device.h
index f767ea7cec..afe3d48798 100644
--- a/drivers/common/idpf/idpf_common_device.h
+++ b/drivers/common/idpf/idpf_common_device.h
@@ -114,6 +114,7 @@ struct idpf_vport {
 
bool rx_vec_allowed;
bool tx_vec_allowed;
+   bool rx_use_avx2;
bool rx_use_avx512;
bool tx_use_avx512;
 
diff --git a/drivers/common/idpf/idpf_common_rxtx.h b/drivers/common/idpf/idpf_common_rxtx.h
index b49b1ed737..4d64063718 100644
--- a/drivers/common/idpf/idpf_common_rxtx.h
+++ b/drivers/common/idpf/idpf_common_rxtx.h
@@ -302,5 +302,9 @@ uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pk
 __rte_internal
 uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
  uint16_t nb_pkts);
+__rte_internal
+uint16_t idpf_dp_singleq_recv_pkts_avx2(void *rx_queue,
+   struct rte_mbuf **rx_pkts,
+   uint16_t nb_pkts);
 
 #endif /* _IDPF_COMMON_RXTX_H_ */
diff --git a/drivers/common/idpf/idpf_common_rxtx_avx2.c b/drivers/common/idpf/idpf_common_rxtx_avx2.c
new file mode 100644
index 00..02ce0534c4
--- /dev/null
+++ b/drivers/common/idpf/idpf_common_rxtx_avx2.c
@@ -0,0 +1,590 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include 
+
+#include "idpf_common_rxtx.h"
+#include "idpf_common_device.h"
+
+#ifndef __INTEL_COMPILER
+#pragma GCC diagnostic ignored "-Wcast-qual"
+#endif
+
+static __rte_always_inline void
+idpf_singleq_rx_rearm(struct idpf_rx_queue *rxq)
+{
+   int i;
+   uint16_t rx_id;
+   volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
+   struct rte_mbuf **rxep = &rxq->sw_ring[rxq->rxrearm_start];
+
+   rxdp += rxq->rxrearm_start;
+
+   /* Pull 'n' more MBUFs into the software ring */
+   if (rte_mempool_get_bulk(rxq->mp,
+(void *)rxep,
+IDPF_RXQ_REARM_THRESH) < 0) {
+   if (rxq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
+   rxq->nb_rx_desc) {
+   __m128i dma_addr0;
+
+   dma_addr0 = _mm_setzero_si128();
+   for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+   rxep[i] = &rxq->fake_mbuf;
+   _mm_store_si128((__m128i *)&rxdp[i].read,
+   dma_addr0);
+   }
+   }
+   __atomic_fetch_add(&rxq->rx_stats.mbuf_alloc_failed,
+  IDPF_RXQ_REARM_THRESH, __ATOMIC_RELAXED);
+
+   return;
+   }
+
+   struct rte_mbuf *mb0, *mb1;
+   __m128i dma_addr0, dma_addr1;
+   __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
+   RTE_PKTMBUF_HEADROOM);
+   /* Initialize the mbufs in vector, process 2 mbufs in one loop */
+   for (i = 0; i < IDPF_RXQ_REARM_THRESH; i += 2, rxep += 2) {
+   __m128i vaddr0, vaddr1;
+
+   mb0 = rxep[0];
+   mb1 = rxep[1];
+
+   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+   offsetof(struct rte_mbuf, buf_addr) + 8);
+   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+   /* convert pa to dma_addr hdr/data */
+   dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+   dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
+
+   /* add headroom to pa values */
+   dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+   dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
+
+   /* flush desc with pa dma_addr */
+   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
+   }
+
+   rxq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+   if (rxq->rxrearm_start >= rxq->nb_rx_desc)
+

[PATCH v2 0/2] enable AVX2 for IDPF single queue

2023-12-06 Thread Wenzhuo Lu
Some CPUs don't support AVX512. Enable AVX2 on them to
get better per-core performance.

---
v2:
 - Removed unused code.

Wenzhuo Lu (2):
  common/idpf: enable AVX2 for single queue Rx
  common/idpf: enable AVX2 for single queue Tx

 doc/guides/rel_notes/release_24_03.rst  |   3 +
 drivers/common/idpf/idpf_common_device.h|   2 +
 drivers/common/idpf/idpf_common_rxtx.h  |   8 +
 drivers/common/idpf/idpf_common_rxtx_avx2.c | 815 
 drivers/common/idpf/meson.build |  16 +
 drivers/common/idpf/version.map |   2 +
 drivers/net/idpf/idpf_rxtx.c|  26 +
 7 files changed, 872 insertions(+)
 create mode 100644 drivers/common/idpf/idpf_common_rxtx_avx2.c
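
For context, the new AVX2 paths are selected at runtime the same way as
the existing AVX512 ones. Below is a minimal sketch of the selection
pattern, with a hypothetical helper name, the AVX512 branch omitted, and
the usual idpf/ethdev headers assumed (the real logic lives in
drivers/net/idpf/idpf_rxtx.c):

#include <rte_cpuflags.h>
#include <rte_vect.h>

/* Hypothetical condensation of the Tx path selection in this series. */
static void
idpf_pick_singleq_tx_path(struct idpf_vport *vport, struct rte_eth_dev *dev)
{
	vport->tx_use_avx2 = false;

	/* AVX2 must be present on the CPU *and* allowed by EAL
	 * (--force-max-simd-bitwidth >= 256).
	 */
	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 &&
	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
		vport->tx_use_avx2 = true;

	if (vport->tx_vec_allowed && vport->tx_use_avx2)
		dev->tx_pkt_burst = idpf_dp_singleq_xmit_pkts_avx2;
}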

-- 
2.25.1



[PATCH 2/2] common/idpf: enable AVX2 for single queue Tx

2023-12-06 Thread Wenzhuo Lu
Some CPUs don't support AVX512. Enable AVX2 on them to
get better per-core performance.

Signed-off-by: Wenzhuo Lu 
---
 doc/guides/rel_notes/release_24_03.rst  |   3 +
 drivers/common/idpf/idpf_common_device.h|   1 +
 drivers/common/idpf/idpf_common_rxtx.h  |   4 +
 drivers/common/idpf/idpf_common_rxtx_avx2.c | 225 
 drivers/common/idpf/version.map |   1 +
 drivers/net/idpf/idpf_rxtx.c|  14 ++
 6 files changed, 248 insertions(+)

diff --git a/doc/guides/rel_notes/release_24_03.rst b/doc/guides/rel_notes/release_24_03.rst
index e9c9717706..08c8ee07c3 100644
--- a/doc/guides/rel_notes/release_24_03.rst
+++ b/doc/guides/rel_notes/release_24_03.rst
@@ -55,6 +55,9 @@ New Features
  Also, make sure to start the actual text at the margin.
  ===
 
+   * **Added support of vector instructions on IDPF.**
+
+ Added support of AVX2 instructions in IDPF single queue RX and TX path.
 
 Removed Items
 -
diff --git a/drivers/common/idpf/idpf_common_device.h b/drivers/common/idpf/idpf_common_device.h
index afe3d48798..60f8cab53a 100644
--- a/drivers/common/idpf/idpf_common_device.h
+++ b/drivers/common/idpf/idpf_common_device.h
@@ -115,6 +115,7 @@ struct idpf_vport {
bool rx_vec_allowed;
bool tx_vec_allowed;
bool rx_use_avx2;
+   bool tx_use_avx2;
bool rx_use_avx512;
bool tx_use_avx512;
 
diff --git a/drivers/common/idpf/idpf_common_rxtx.h b/drivers/common/idpf/idpf_common_rxtx.h
index 4d64063718..a92d328313 100644
--- a/drivers/common/idpf/idpf_common_rxtx.h
+++ b/drivers/common/idpf/idpf_common_rxtx.h
@@ -306,5 +306,9 @@ __rte_internal
 uint16_t idpf_dp_singleq_recv_pkts_avx2(void *rx_queue,
struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
+__rte_internal
+uint16_t idpf_dp_singleq_xmit_pkts_avx2(void *tx_queue,
+   struct rte_mbuf **tx_pkts,
+   uint16_t nb_pkts);
 
 #endif /* _IDPF_COMMON_RXTX_H_ */
diff --git a/drivers/common/idpf/idpf_common_rxtx_avx2.c b/drivers/common/idpf/idpf_common_rxtx_avx2.c
index 0403cf118f..77e651b201 100644
--- a/drivers/common/idpf/idpf_common_rxtx_avx2.c
+++ b/drivers/common/idpf/idpf_common_rxtx_avx2.c
@@ -607,3 +607,228 @@ idpf_dp_singleq_recv_pkts_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
 {
	return _idpf_singleq_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, NULL);
 }
+
+static __rte_always_inline void
+idpf_tx_backlog_entry(struct idpf_tx_entry *txep,
+struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+   int i;
+
+   for (i = 0; i < (int)nb_pkts; ++i)
+   txep[i].mbuf = tx_pkts[i];
+}
+
+static __rte_always_inline int
+idpf_singleq_tx_free_bufs_vec(struct idpf_tx_queue *txq)
+{
+   struct idpf_tx_entry *txep;
+   uint32_t n;
+   uint32_t i;
+   int nb_free = 0;
+   struct rte_mbuf *m, *free[txq->rs_thresh];
+
+   /* check DD bits on threshold descriptor */
+   if ((txq->tx_ring[txq->next_dd].qw1 &
+   rte_cpu_to_le_64(IDPF_TXD_QW1_DTYPE_M)) !=
+   rte_cpu_to_le_64(IDPF_TX_DESC_DTYPE_DESC_DONE))
+   return 0;
+
+   n = txq->rs_thresh;
+
+	/* first buffer to free from S/W ring is at index
+	 * next_dd - (rs_thresh-1)
+	 */
+   txep = &txq->sw_ring[txq->next_dd - (n - 1)];
+   m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
+   if (likely(m)) {
+   free[0] = m;
+   nb_free = 1;
+   for (i = 1; i < n; i++) {
+   m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+   if (likely(m)) {
+   if (likely(m->pool == free[0]->pool)) {
+   free[nb_free++] = m;
+   } else {
+   rte_mempool_put_bulk(free[0]->pool,
+(void *)free,
+nb_free);
+   free[0] = m;
+   nb_free = 1;
+   }
+   }
+   }
+   rte_mempool_put_bulk(free[0]->pool, (void **)free, nb_free);
+   } else {
+   for (i = 1; i < n; i++) {
+   m = rte_pktmbuf_prefree_seg(txep[i].mbuf);
+   if (m)
+   rte_mempool_put(m->pool, m);
+   }
+   }
+
+   /* buffers were freed, update counters */
+   txq->nb_free = (uint16_t)(txq->nb_free + txq->rs_thresh);
+   txq->next_dd = (uint16_t)(txq->next_dd + txq->rs_thresh);

[PATCH 1/2] common/idpf: enable AVX2 for single queue Rx

2023-12-06 Thread Wenzhuo Lu
Some CPUs don't support AVX512. Enable AVX2 on them to
get better per-core performance.

Signed-off-by: Wenzhuo Lu 
---
 drivers/common/idpf/idpf_common_device.h|   1 +
 drivers/common/idpf/idpf_common_rxtx.h  |   4 +
 drivers/common/idpf/idpf_common_rxtx_avx2.c | 609 
 drivers/common/idpf/meson.build |  16 +
 drivers/common/idpf/version.map |   1 +
 drivers/net/idpf/idpf_rxtx.c|  12 +
 6 files changed, 643 insertions(+)
 create mode 100644 drivers/common/idpf/idpf_common_rxtx_avx2.c

diff --git a/drivers/common/idpf/idpf_common_device.h b/drivers/common/idpf/idpf_common_device.h
index f767ea7cec..afe3d48798 100644
--- a/drivers/common/idpf/idpf_common_device.h
+++ b/drivers/common/idpf/idpf_common_device.h
@@ -114,6 +114,7 @@ struct idpf_vport {
 
bool rx_vec_allowed;
bool tx_vec_allowed;
+   bool rx_use_avx2;
bool rx_use_avx512;
bool tx_use_avx512;
 
diff --git a/drivers/common/idpf/idpf_common_rxtx.h b/drivers/common/idpf/idpf_common_rxtx.h
index b49b1ed737..4d64063718 100644
--- a/drivers/common/idpf/idpf_common_rxtx.h
+++ b/drivers/common/idpf/idpf_common_rxtx.h
@@ -302,5 +302,9 @@ uint16_t idpf_dp_splitq_xmit_pkts_avx512(void *tx_queue, struct rte_mbuf **tx_pk
 __rte_internal
 uint16_t idpf_dp_singleq_recv_scatter_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
  uint16_t nb_pkts);
+__rte_internal
+uint16_t idpf_dp_singleq_recv_pkts_avx2(void *rx_queue,
+   struct rte_mbuf **rx_pkts,
+   uint16_t nb_pkts);
 
 #endif /* _IDPF_COMMON_RXTX_H_ */
diff --git a/drivers/common/idpf/idpf_common_rxtx_avx2.c b/drivers/common/idpf/idpf_common_rxtx_avx2.c
new file mode 100644
index 00..0403cf118f
--- /dev/null
+++ b/drivers/common/idpf/idpf_common_rxtx_avx2.c
@@ -0,0 +1,609 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2023 Intel Corporation
+ */
+
+#include 
+
+#include "idpf_common_rxtx.h"
+#include "idpf_common_device.h"
+
+#ifndef __INTEL_COMPILER
+#pragma GCC diagnostic ignored "-Wcast-qual"
+#endif
+
+static __rte_always_inline void
+idpf_singleq_rx_rearm(struct idpf_rx_queue *rxq)
+{
+   int i;
+   uint16_t rx_id;
+   volatile union virtchnl2_rx_desc *rxdp = rxq->rx_ring;
+   struct rte_mbuf **rxep = &rxq->sw_ring[rxq->rxrearm_start];
+
+   rxdp += rxq->rxrearm_start;
+
+   /* Pull 'n' more MBUFs into the software ring */
+   if (rte_mempool_get_bulk(rxq->mp,
+(void *)rxep,
+IDPF_RXQ_REARM_THRESH) < 0) {
+   if (rxq->rxrearm_nb + IDPF_RXQ_REARM_THRESH >=
+   rxq->nb_rx_desc) {
+   __m128i dma_addr0;
+
+   dma_addr0 = _mm_setzero_si128();
+   for (i = 0; i < IDPF_VPMD_DESCS_PER_LOOP; i++) {
+   rxep[i] = &rxq->fake_mbuf;
+   _mm_store_si128((__m128i *)&rxdp[i].read,
+   dma_addr0);
+   }
+   }
+   __atomic_fetch_add(&rxq->rx_stats.mbuf_alloc_failed,
+  IDPF_RXQ_REARM_THRESH, __ATOMIC_RELAXED);
+
+   return;
+   }
+
+   struct rte_mbuf *mb0, *mb1;
+   __m128i dma_addr0, dma_addr1;
+   __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
+   RTE_PKTMBUF_HEADROOM);
+   /* Initialize the mbufs in vector, process 2 mbufs in one loop */
+   for (i = 0; i < IDPF_RXQ_REARM_THRESH; i += 2, rxep += 2) {
+   __m128i vaddr0, vaddr1;
+
+   mb0 = rxep[0];
+   mb1 = rxep[1];
+
+   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
+   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+   offsetof(struct rte_mbuf, buf_addr) + 8);
+   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+   /* convert pa to dma_addr hdr/data */
+   dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+   dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
+
+   /* add headroom to pa values */
+   dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+   dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
+
+   /* flush desc with pa dma_addr */
+   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
+   }
+
+   rxq->rxrearm_start += IDPF_RXQ_REARM_THRESH;
+   if (rxq->rxrearm_start >= rxq->nb_rx_desc)
+

[PATCH 0/2] enable AVX2 for IDPF single queue

2023-12-06 Thread Wenzhuo Lu
Some CPUs don't support AVX512. Enable AVX2 on them to
get better per-core performance.

Wenzhuo Lu (2):
  common/idpf: enable AVX2 for single queue Rx
  common/idpf: enable AVX2 for single queue Tx

 doc/guides/rel_notes/release_24_03.rst  |   3 +
 drivers/common/idpf/idpf_common_device.h|   2 +
 drivers/common/idpf/idpf_common_rxtx.h  |   8 +
 drivers/common/idpf/idpf_common_rxtx_avx2.c | 834 
 drivers/common/idpf/meson.build |  16 +
 drivers/common/idpf/version.map |   2 +
 drivers/net/idpf/idpf_rxtx.c|  26 +
 7 files changed, 891 insertions(+)
 create mode 100644 drivers/common/idpf/idpf_common_rxtx_avx2.c

-- 
2.25.1



[PATCH v2] net/iavf: fix VLAN insertion in vector path

2023-07-02 Thread Wenzhuo Lu
VLAN insertion is only partially supported in the vector path,
so the behavior differs between the scalar and vector paths.
For a packet that already carries a VLAN tag, the scalar path
inserts the new VLAN tag after the original tag, while the
vector path inserts it before the original tag.
To avoid this inconsistency, disable VLAN insertion in the
vector path.
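
For reference, the reason moving the flags works: the driver rejects
the vector Tx path for any queue whose offloads intersect
IAVF_TX_NO_VECTOR_FLAGS, falling back to scalar Tx whose tag ordering
is correct for already-tagged packets. A simplified sketch of that
check (the helper name is illustrative, not the driver's exact
function):

static inline int
tx_queue_vec_ok(uint64_t offloads)
{
	/* after this patch, VLAN/QinQ insertion is in the NO_VECTOR
	 * set, so requesting either forces the scalar Tx path */
	if (offloads & IAVF_TX_NO_VECTOR_FLAGS)
		return -1;
	return 0;
}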

Fixes: 059f18ae2aec ("net/iavf: add offload path for Tx AVX512")
Cc: sta...@dpdk.org

Signed-off-by: Wenzhuo Lu 
---

v2:
 - Updated iavf.ini.

 doc/guides/nics/features/iavf.ini | 2 +-
 drivers/net/iavf/iavf_rxtx.h  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/guides/nics/features/iavf.ini b/doc/guides/nics/features/iavf.ini
index fbb5b7d..55a0216 100644
--- a/doc/guides/nics/features/iavf.ini
+++ b/doc/guides/nics/features/iavf.ini
@@ -24,7 +24,7 @@ RSS key update   = Y
 RSS reta update  = Y
 VLAN filter  = Y
 CRC offload  = Y
-VLAN offload = Y
+VLAN offload = P
 L3 checksum offload  = Y
 L4 checksum offload  = Y
 Timestamp offload= P
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 2bf2e32..8d4a772 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -27,13 +27,13 @@
 #define IAVF_VPMD_TX_MAX_FREE_BUF 64
 
 #define IAVF_TX_NO_VECTOR_FLAGS (   \
+   RTE_ETH_TX_OFFLOAD_VLAN_INSERT | \
+   RTE_ETH_TX_OFFLOAD_QINQ_INSERT | \
RTE_ETH_TX_OFFLOAD_MULTI_SEGS |  \
RTE_ETH_TX_OFFLOAD_TCP_TSO | \
RTE_ETH_TX_OFFLOAD_SECURITY)
 
 #define IAVF_TX_VECTOR_OFFLOAD (\
-   RTE_ETH_TX_OFFLOAD_VLAN_INSERT | \
-   RTE_ETH_TX_OFFLOAD_QINQ_INSERT | \
RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |  \
RTE_ETH_TX_OFFLOAD_SCTP_CKSUM |  \
RTE_ETH_TX_OFFLOAD_UDP_CKSUM |   \
-- 
1.8.3.1



[PATCH] doc: update release note for iavf AVX2 feature

2023-06-28 Thread Wenzhuo Lu
Add the missing release note for the iavf AVX2 feature in 23.07.

Fixes: 5712bf9d6e14 ("net/iavf: add Tx AVX2 offload path")

Signed-off-by: Wenzhuo Lu 
---
 doc/guides/rel_notes/release_23_07.rst | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/doc/guides/rel_notes/release_23_07.rst b/doc/guides/rel_notes/release_23_07.rst
index 4459144..92c8a1d 100644
--- a/doc/guides/rel_notes/release_23_07.rst
+++ b/doc/guides/rel_notes/release_23_07.rst
@@ -200,6 +200,12 @@ New Features
 
   Enhanced the GRO library to support TCP packets over IPv6 network.
 
+* **Updated Intel iavf driver.**
+
+  * Added new RX and TX paths in the AVX2 code to use HW offload
+    features. When the HW offload features are configured to be used, the
+    offload paths are chosen automatically. In parallel the support for HW
+    offload features was removed from the legacy AVX2 paths.
 
 Removed Items
 -
-- 
1.8.3.1



[PATCH] net/iavf: fix VLAN insertion in vector path

2023-06-20 Thread Wenzhuo Lu
VLAN insertion is only partially supported in the vector path,
so the behavior differs between the scalar and vector paths.
For a packet that already carries a VLAN tag, the scalar path
inserts the new VLAN tag after the original tag, while the
vector path inserts it before the original tag.
To avoid this inconsistency, disable VLAN insertion in the
vector path.

Fixes: 059f18ae2aec ("net/iavf: add offload path for Tx AVX512")
Cc: sta...@dpdk.org

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 547b68f..2459c15 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -27,13 +27,13 @@
 #define IAVF_VPMD_TX_MAX_FREE_BUF 64
 
 #define IAVF_TX_NO_VECTOR_FLAGS (   \
+   RTE_ETH_TX_OFFLOAD_VLAN_INSERT | \
+   RTE_ETH_TX_OFFLOAD_QINQ_INSERT | \
RTE_ETH_TX_OFFLOAD_MULTI_SEGS |  \
RTE_ETH_TX_OFFLOAD_TCP_TSO | \
RTE_ETH_TX_OFFLOAD_SECURITY)
 
 #define IAVF_TX_VECTOR_OFFLOAD (\
-   RTE_ETH_TX_OFFLOAD_VLAN_INSERT | \
-   RTE_ETH_TX_OFFLOAD_QINQ_INSERT | \
RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |  \
RTE_ETH_TX_OFFLOAD_SCTP_CKSUM |  \
RTE_ETH_TX_OFFLOAD_UDP_CKSUM |   \
-- 
1.8.3.1



[PATCH] net/iavf: fix SCTP tunnel packet forwarding issue

2023-06-20 Thread Wenzhuo Lu
SCTP tunnel packets cannot be forwarded in AVX2 mode.

As the two features were developed in parallel, commit 5712bf9d6e14
("net/iavf: add Tx AVX2 offload path") doesn't consider the impact
of commit 4f8259df563a ("net/iavf: enable Tx outer checksum offload
on AVX512"), so the wrong Tx path is selected.

Fixes: 5712bf9d6e14 ("net/iavf: add Tx AVX2 offload path")

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 4c59c1a..a22abb1 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -3919,7 +3919,7 @@ struct iavf_tx_context_desc_qws {
 
check_ret = iavf_tx_vec_dev_check(dev);
 
-   if (check_ret >= 0 &&
+   if ((check_ret == IAVF_VECTOR_PATH || check_ret == IAVF_VECTOR_OFFLOAD_PATH) &&
rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
/* SSE not support offload path yet. */
if (check_ret == IAVF_VECTOR_PATH) {
-- 
1.8.3.1



[PATCH] usertools: enhance CPU layout

2023-04-17 Thread Wenzhuo Lu
The cores in a single CPU may not all be the same.
The user tool is updated to show the differences
between the cores.

This patch adds the below information,
1, Group the cores based on the die.
2, A core is either a performance core or an
   efficiency core.
   A performance core is shown as 'Core-P'.
   An efficiency core is shown as 'Core-E'.
3, All the E-cores which share the same L2 cache
   are grouped into one module.

The known limitation,
1, Whether a core is a P-core or an E-core is inferred from
   whether it shares its L2 cache with other cores (see the
   C sketch below).
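
The heuristic in point 1 can be expressed as a standalone C sketch
(illustrative only; the Python change below is the actual patch,
reading the same two sysfs files):

#include <stdio.h>
#include <string.h>

static int
read_first_line(const char *path, char *buf, int len)
{
	FILE *f = fopen(path, "r");

	if (f == NULL)
		return -1;
	if (fgets(buf, len, f) == NULL) {
		fclose(f);
		return -1;
	}
	fclose(f);
	buf[strcspn(buf, "\n")] = '\0';
	return 0;
}

/* returns 1 for P-core, 0 for E-core, -1 on error */
int
cpu_is_pcore(int cpu)
{
	char path[128], siblings[64], l2_share[64];

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", cpu);
	if (read_first_line(path, siblings, sizeof(siblings)) < 0)
		return -1;
	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%d/cache/index2/shared_cpu_list", cpu);
	if (read_first_line(path, l2_share, sizeof(l2_share)) < 0)
		return -1;
	/* equal lists => the core shares L2 only with its own SMT
	 * siblings => treated as a P-core */
	return strcmp(siblings, l2_share) == 0;
}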

Signed-off-by: Wenzhuo Lu 
---
 usertools/cpu_layout.py | 77 +
 1 file changed, 63 insertions(+), 14 deletions(-)

diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py
index 891b9238fa..d78758cf2c 100755
--- a/usertools/cpu_layout.py
+++ b/usertools/cpu_layout.py
@@ -1,11 +1,17 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: BSD-3-Clause
-# Copyright(c) 2010-2014 Intel Corporation
+# Copyright(c) 2010-2023 Intel Corporation
 # Copyright(c) 2017 Cavium, Inc. All rights reserved.
 
 sockets = []
+dies = []
 cores = []
+module_id = []
 core_map = {}
+core_p_e = {}
+title_len = 47
+die_len = 8
+module_no = 0
 base_path = "/sys/devices/system/cpu"
 fd = open("{}/kernel_max".format(base_path))
 max_cpus = int(fd.read())
@@ -20,19 +26,54 @@
 fd = open("{}/cpu{}/topology/physical_package_id".format(base_path, cpu))
 socket = int(fd.read())
 fd.close()
+fd = open("{}/cpu{}/topology/die_id".format(base_path, cpu))
+die = int(fd.read())
+fd.close()
+fd = open("{}/cpu{}/topology/thread_siblings_list".format(base_path, cpu))
+threads_share = str(fd.read())
+fd.close()
+fd = open("{}/cpu{}/cache/index2/shared_cpu_list".format(base_path, cpu))
+l2_cache_share = str(fd.read())
+fd.close()
+if (threads_share == l2_cache_share):
+p_e = '-P'
+module_id.append(-1)
+else:
+module_tmp = []
+p_e = '-E'
+for i in l2_cache_share:
+if not i.isdigit():
+break
+module_tmp.append(i)
+if (cpu == int("".join(module_tmp))):
+module_id.append(module_no)
+module_no += 1
+else:
+module_id.append(-1)
 if core not in cores:
 cores.append(core)
+if die not in dies:
+dies.append(die)
 if socket not in sockets:
 sockets.append(socket)
-key = (socket, core)
+key = (socket, die, core)
+key_p_e = (die, core)
 if key not in core_map:
 core_map[key] = []
+if key_p_e not in core_p_e:
+core_p_e[key_p_e] = p_e
 core_map[key].append(cpu)
 
-print(format("=" * (47 + len(base_path
+print(format("=" * (title_len + len(base_path
 print("Core and Socket Information (as reported by '{}')".format(base_path))
-print("{}\n".format("=" * (47 + len(base_path
+print("{}\n".format("=" * (title_len + len(base_path
 print("cores = ", cores)
+meaningful_module = []
+for i in module_id:
+if (i != -1):
+meaningful_module.append(i)
+print("modules = ", meaningful_module)
+print("dies = ", dies)
 print("sockets = ", sockets)
 print("")
 
@@ -43,22 +84,30 @@
   + len('[]') + len('Socket ')
 max_core_id_len = len(str(max(cores)))
 
-output = " ".ljust(max_core_id_len + len('Core '))
+socket_space_len = max_core_id_len + len('Core ') + die_len + len('-P')
+output = " ".ljust(socket_space_len)
 for s in sockets:
 output += " Socket %s" % str(s).ljust(max_core_map_len - len('Socket '))
 print(output)
 
-output = " ".ljust(max_core_id_len + len('Core '))
+output = " ".ljust(socket_space_len)
 for s in sockets:
 output += " ".ljust(max_core_map_len)
 output += " "
 print(output)
 
-for c in cores:
-output = "Core %s" % str(c).ljust(max_core_id_len)
-for s in sockets:
-if (s, c) in core_map:
-output += " " + str(core_map[(s, c)]).ljust(max_core_map_len)
-else:
-output += " " * (max_core_map_len + 1)
-print(output)
+for d in dies:
+print("Die", die)
+for c in cores:
+if (module_id[core_map[(sockets[0], d, c)][0]] != -1):
+print("Module", module_id[core_map[(sockets[0], d, c)][0]])
+output = " ".ljust(die_len)
+output += "Core"
+output += core_p_e[(d, c)]
+output += " %s" % str(c).ljust(max_core_id_len)
+for s in sockets:
+if (s, d, c) in core_map:
+output += " " + str(core_map[(s, d, 
c)]).ljust(max_core_map_len)
+else:
+output += " " * (max_core_map_len + 1)
+print(output)
-- 
2.25.1



[PATCH 2/2] net/iavf: add Rx AVX2 offload path

2023-04-17 Thread Wenzhuo Lu
Add a specific Rx AVX2 path that supports the HW offload
features: checksum, VLAN stripping and RSS hash.
This path is chosen automatically according to the
configuration.

The shared helpers are marked 'inline', so the compiler
generates the duplicated, specialized code for each path.
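
To illustrate the 'inline' remark: the pattern is one always-inline
body taking a constant offload flag, so each thin exported wrapper
gets its own specialized copy and the non-offload path pays nothing
for the offload code. A minimal, self-contained sketch (names are
illustrative, not the driver's; DPDK spells the attribute
__rte_always_inline):

#include <stdbool.h>
#include <stdint.h>

struct rte_mbuf; /* opaque here; real definition comes from rte_mbuf.h */

static inline __attribute__((always_inline)) uint16_t
recv_pkts_vec_common(void *rxq, struct rte_mbuf **pkts, uint16_t nb,
		     bool do_offload)
{
	uint16_t nb_rx = 0;

	/* ... vector Rx descriptor loop ... */
	if (do_offload) {
		/* checksum / VLAN-strip / RSS-hash flag extraction;
		 * constant-folded away in the non-offload wrapper */
	}
	return nb_rx;
}

uint16_t
recv_pkts_vec(void *rxq, struct rte_mbuf **pkts, uint16_t nb)
{
	return recv_pkts_vec_common(rxq, pkts, nb, false);
}

uint16_t
recv_pkts_vec_offload(void *rxq, struct rte_mbuf **pkts, uint16_t nb)
{
	return recv_pkts_vec_common(rxq, pkts, nb, true);
}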

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx.c  | 112 +++--
 drivers/net/iavf/iavf_rxtx.h  |  11 +
 drivers/net/iavf/iavf_rxtx_vec_avx2.c | 580 +++---
 3 files changed, 416 insertions(+), 287 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 6cadecfad9..97ce828b59 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -3731,25 +3731,41 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
}
 
if (dev->data->scattered_rx) {
-   if (!use_avx512) {
+   if (!use_avx2 && !use_avx512) {
PMD_DRV_LOG(DEBUG,
-   "Using %sVector Scattered Rx (port 
%d).",
-   use_avx2 ? "avx2 " : "",
+   "Using Vector Scattered Rx (port 
%d).",
dev->data->port_id);
} else {
-   if (check_ret == IAVF_VECTOR_PATH)
-   PMD_DRV_LOG(DEBUG,
-   "Using AVX512 Vector 
Scattered Rx (port %d).",
-   dev->data->port_id);
-   else
-   PMD_DRV_LOG(DEBUG,
-   "Using AVX512 OFFLOAD 
Vector Scattered Rx (port %d).",
-   dev->data->port_id);
+   if (use_avx2) {
+   if (check_ret == IAVF_VECTOR_PATH)
+   PMD_DRV_LOG(DEBUG,
+   "Using AVX2 Vector 
Scattered Rx (port %d).",
+   dev->data->port_id);
+   else
+   PMD_DRV_LOG(DEBUG,
+   "Using AVX2 OFFLOAD 
Vector Scattered Rx (port %d).",
+   dev->data->port_id);
+   } else {
+   if (check_ret == IAVF_VECTOR_PATH)
+   PMD_DRV_LOG(DEBUG,
+   "Using AVX512 
Vector Scattered Rx (port %d).",
+   dev->data->port_id);
+   else
+   PMD_DRV_LOG(DEBUG,
+   "Using AVX512 
OFFLOAD Vector Scattered Rx (port %d).",
+   dev->data->port_id);
+   }
}
if (use_flex) {
-   dev->rx_pkt_burst = use_avx2 ?
-   iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
-   iavf_recv_scattered_pkts_vec_flex_rxd;
+   dev->rx_pkt_burst = iavf_recv_scattered_pkts_vec_flex_rxd;
+   if (use_avx2) {
+   if (check_ret == IAVF_VECTOR_PATH)
+   dev->rx_pkt_burst =
+   iavf_recv_scattered_pkts_vec_avx2_flex_rxd;
+   else
+   dev->rx_pkt_burst =
+   iavf_recv_scattered_pkts_vec_avx2_flex_rxd_offload;
+   }
 #ifdef CC_AVX512_SUPPORT
if (use_avx512) {
if (check_ret == IAVF_VECTOR_PATH)
@@ -3761,9 +3777,15 @@ iavf_set_rx_function(struct rte_eth_dev *dev)
}
 #endif
} else {
-   dev->rx_pkt_burst = use_avx2 ?
-   iavf_recv_scattered_pkts_vec_avx2 :
-   iavf_recv_scattered_pkts_vec;
+   dev->rx_pkt_burst = iavf_recv_scattered_pkts_vec;

[PATCH 0/2] add offload path on iavf AVX2

2023-04-17 Thread Wenzhuo Lu
Add specific Rx/Tx AVX2 paths that support the HW offload
features: checksum, VLAN stripping and RSS hash.
These offload features are removed from the original path
to make that path faster when no HW offload is needed.

Wenzhuo Lu (2):
  net/iavf: add Tx AVX2 offload path
  net/iavf: add Rx AVX2 offload path

 drivers/net/iavf/iavf_rxtx.c  | 145 --
 drivers/net/iavf/iavf_rxtx.h  |  13 +
 drivers/net/iavf/iavf_rxtx_vec_avx2.c | 634 +++---
 3 files changed, 478 insertions(+), 314 deletions(-)

-- 
2.25.1



[PATCH 1/2] net/iavf: add Tx AVX2 offload path

2023-04-17 Thread Wenzhuo Lu
Add a specific Tx AVX2 path that supports the HW offload
features: checksum insertion and VLAN insertion.
This path is chosen automatically according to the
configuration.

The shared helpers are marked 'inline', so the compiler
generates the duplicated, specialized code for each path.

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx.c  | 33 ++--
 drivers/net/iavf/iavf_rxtx.h  |  2 +
 drivers/net/iavf/iavf_rxtx_vec_avx2.c | 54 +++
 3 files changed, 62 insertions(+), 27 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index b1d0fbceb6..6cadecfad9 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -3876,14 +3876,14 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
 
if (check_ret >= 0 &&
rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-   /* SSE and AVX2 not support offload path yet. */
+   /* SSE not support offload path yet. */
if (check_ret == IAVF_VECTOR_PATH) {
use_sse = true;
-   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-   use_avx2 = true;
}
+   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+   use_avx2 = true;
 #ifdef CC_AVX512_SUPPORT
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -3894,15 +3894,24 @@ iavf_set_tx_function(struct rte_eth_dev *dev)
if (!use_sse && !use_avx2 && !use_avx512)
goto normal;
 
-   if (!use_avx512) {
-   PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
-   use_avx2 ? "avx2 " : "",
+   dev->tx_pkt_prepare = NULL;
+   if (use_sse) {
+   PMD_DRV_LOG(DEBUG, "Using Vector Tx (port %d).",
dev->data->port_id);
-   dev->tx_pkt_burst = use_avx2 ?
-   iavf_xmit_pkts_vec_avx2 :
-   iavf_xmit_pkts_vec;
+   dev->tx_pkt_burst = iavf_xmit_pkts_vec;
+   }
+   if (use_avx2) {
+   if (check_ret == IAVF_VECTOR_PATH) {
+   dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx2;
+   PMD_DRV_LOG(DEBUG, "Using AVX2 Vector Tx (port 
%d).",
+   dev->data->port_id);
+   } else {
+   dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx2_offload;
+   dev->tx_pkt_prepare = iavf_prep_pkts;
+   PMD_DRV_LOG(DEBUG, "Using AVX2 OFFLOAD Vector 
Tx (port %d).",
+   dev->data->port_id);
+   }
}
-   dev->tx_pkt_prepare = NULL;
 #ifdef CC_AVX512_SUPPORT
if (use_avx512) {
if (check_ret == IAVF_VECTOR_PATH) {
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 09e2127db0..85801160e1 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -693,6 +693,8 @@ uint16_t iavf_xmit_pkts_vec(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
 uint16_t iavf_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 uint16_t nb_pkts);
+uint16_t iavf_xmit_pkts_vec_avx2_offload(void *tx_queue, struct rte_mbuf **tx_pkts,
+uint16_t nb_pkts);
 int iavf_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc);
 int iavf_rx_vec_dev_check(struct rte_eth_dev *dev);
 int iavf_tx_vec_dev_check(struct rte_eth_dev *dev);
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx2.c b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
index b4ebac9d34..c17b96008b 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx2.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
@@ -1426,30 +1426,32 @@ iavf_recv_scattered_pkts_vec_avx2_flex_rxd(void *rx_queue,
rx_pkts + retval, nb_pkts);
 }
 
-static inline void
+static __rte_always_inline void
 iavf_vtx1(volatile struct iavf_tx_desc *txdp,
- struct rte_mbuf *pkt, uint64_t flags)

[RFC] usertools: enhance CPU layout

2023-02-10 Thread Wenzhuo Lu
CPUs are becoming more and more complex.
Some CPUs are made up of several dies, and the cores in
different dies may differ.
The user tool can be updated to show more about the CPU
components.

This patch adds the below information,
1, Group the cores based on the die.
2, A core is either a performance core or an
   efficiency core.
   A performance core is shown as 'Core-P'.
   An efficiency core is shown as 'Core-E'.

The known limitations/issues,
1, Whether a core is a P-core or an E-core is inferred from
   whether it shares its L2 cache with other cores.
   Not sure if there is any better criterion.
2, The OS shows there is only 1 die in a CPU although there
   are actually several.
   Hopefully the accurate information will be provided by
   upcoming OSes.

Other components, such as accelerators and even memory, are
also being embedded into CPUs. In the future the user tool
may be updated to show more.

Signed-off-by: Wenzhuo Lu 
---
 usertools/cpu_layout.py | 57 +++--
 1 file changed, 43 insertions(+), 14 deletions(-)

diff --git a/usertools/cpu_layout.py b/usertools/cpu_layout.py
index 891b9238fa..9812df0503 100755
--- a/usertools/cpu_layout.py
+++ b/usertools/cpu_layout.py
@@ -1,11 +1,15 @@
 #!/usr/bin/env python3
 # SPDX-License-Identifier: BSD-3-Clause
-# Copyright(c) 2010-2014 Intel Corporation
+# Copyright(c) 2010-2023 Intel Corporation
 # Copyright(c) 2017 Cavium, Inc. All rights reserved.
 
 sockets = []
+dies = []
 cores = []
 core_map = {}
+core_p_e = {}
+title_len = 47
+die_len = 6
 base_path = "/sys/devices/system/cpu"
 fd = open("{}/kernel_max".format(base_path))
 max_cpus = int(fd.read())
@@ -20,19 +24,38 @@
 fd = open("{}/cpu{}/topology/physical_package_id".format(base_path, cpu))
 socket = int(fd.read())
 fd.close()
+fd = open("{}/cpu{}/topology/die_id".format(base_path, cpu))
+die = int(fd.read())
+fd.close()
+fd = open("{}/cpu{}/topology/thread_siblings_list".format(base_path, cpu))
+threads_share = str(fd.read())
+fd.close()
+fd = open("{}/cpu{}/cache/index2/shared_cpu_list".format(base_path, cpu))
+l2_cache_share = str(fd.read())
+fd.close()
+if (threads_share == l2_cache_share):
+p_e = '-P'
+else:
+p_e = '-E'
 if core not in cores:
 cores.append(core)
+if die not in dies:
+dies.append(die)
 if socket not in sockets:
 sockets.append(socket)
-key = (socket, core)
+key = (socket, die, core)
+key_p_e = (die, core)
 if key not in core_map:
 core_map[key] = []
+if key_p_e not in core_p_e:
+core_p_e[key_p_e] = p_e
 core_map[key].append(cpu)
 
-print(format("=" * (47 + len(base_path
+print(format("=" * (title_len + len(base_path
 print("Core and Socket Information (as reported by '{}')".format(base_path))
-print("{}\n".format("=" * (47 + len(base_path
+print("{}\n".format("=" * (title_len + len(base_path
 print("cores = ", cores)
+print("dies = ", dies)
 print("sockets = ", sockets)
 print("")
 
@@ -43,22 +66,28 @@
   + len('[]') + len('Socket ')
 max_core_id_len = len(str(max(cores)))
 
-output = " ".ljust(max_core_id_len + len('Core '))
+socket_space_len = max_core_id_len + len('Core ') + die_len + len('-P')
+output = " ".ljust(socket_space_len)
 for s in sockets:
 output += " Socket %s" % str(s).ljust(max_core_map_len - len('Socket '))
 print(output)
 
-output = " ".ljust(max_core_id_len + len('Core '))
+output = " ".ljust(socket_space_len)
 for s in sockets:
 output += " ".ljust(max_core_map_len)
 output += " "
 print(output)
 
-for c in cores:
-output = "Core %s" % str(c).ljust(max_core_id_len)
-for s in sockets:
-if (s, c) in core_map:
-output += " " + str(core_map[(s, c)]).ljust(max_core_map_len)
-else:
-output += " " * (max_core_map_len + 1)
-print(output)
+for d in dies:
+print("Die", die)
+for c in cores:
+output = " ".ljust(die_len)
+output += "Core"
+output += core_p_e[(d, c)]
+output += " %s" % str(c).ljust(max_core_id_len)
+for s in sockets:
+if (s, d, c) in core_map:
+output += " " + str(core_map[(s, d, 
c)]).ljust(max_core_map_len)
+else:
+output += " " * (max_core_map_len + 1)
+print(output)
-- 
2.25.1



[PATCH v2] net/ice: remove avx512 specific Rx queue rearm code

2023-02-07 Thread Wenzhuo Lu
'ice_rxq_rearm' in the avx512 path was optimized to improve performance.
But after commit a2833ecc5ea4 ("mempool: fix get objects from mempool
with cache"), this avx512-specific optimization is no longer necessary.
This patch removes the unnecessary PMD-specific optimization to make the
code easier to maintain and to benefit from enhancements to the common
library.
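
To make the reasoning concrete: after that mempool fix, a plain bulk
get is already served from the per-lcore cache, so the generic rearm
path performs like the removed hand-rolled copy. A condensed,
illustrative sketch of the generic path (simplified types, not the
exact driver code):

#include <rte_mbuf.h>
#include <rte_mempool.h>

static inline void
rxq_rearm_sketch(struct rte_mempool *mp, struct rte_mbuf **sw_ring,
		 uint16_t rearm_start, uint16_t rearm_thresh)
{
	struct rte_mbuf **rxep = &sw_ring[rearm_start];

	/* one bulk dequeue: since commit a2833ecc5ea4 this hits the
	 * per-lcore cache first and only falls through to the shared
	 * ring when the cache runs dry */
	if (rte_mempool_get_bulk(mp, (void **)rxep, rearm_thresh) < 0)
		return; /* caller accounts the allocation failure */

	/* ... write buf_iova + headroom of each mbuf into the HW
	 * descriptor ring, as ice_rxq_rearm_common() does ... */
}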

Reported-by: Haijun Chu 
Signed-off-by: Wenzhuo Lu 
---
v2:
 - Rebased on dpdk-next-net-intel

 drivers/net/ice/ice_rxtx_vec_avx512.c | 120 +-
 1 file changed, 1 insertion(+), 119 deletions(-)

diff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c b/drivers/net/ice/ice_rxtx_vec_avx512.c
index 7e388b7569..c3b087c52e 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx512.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx512.c
@@ -16,125 +16,7 @@
 static __rte_always_inline void
 ice_rxq_rearm(struct ice_rx_queue *rxq)
 {
-   int i;
-   uint16_t rx_id;
-   volatile union ice_rx_flex_desc *rxdp;
-   struct ice_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
-   struct rte_mempool_cache *cache = rte_mempool_default_cache(rxq->mp,
-   rte_lcore_id());
-
-   rxdp = rxq->rx_ring + rxq->rxrearm_start;
-
-   if (unlikely(!cache))
-   return ice_rxq_rearm_common(rxq, true);
-
-   /* We need to pull 'n' more MBUFs into the software ring */
-   if (cache->len < ICE_RXQ_REARM_THRESH) {
-   uint32_t req = ICE_RXQ_REARM_THRESH + (cache->size -
-   cache->len);
-
-   int ret = rte_mempool_ops_dequeue_bulk(rxq->mp,
-   &cache->objs[cache->len], req);
-   if (ret == 0) {
-   cache->len += req;
-   } else {
-   if (rxq->rxrearm_nb + ICE_RXQ_REARM_THRESH >=
-   rxq->nb_rx_desc) {
-   __m128i dma_addr0;
-
-   dma_addr0 = _mm_setzero_si128();
-   for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
-   rxep[i].mbuf = &rxq->fake_mbuf;
-   _mm_store_si128
-   ((__m128i *)&rxdp[i].read,
-   dma_addr0);
-   }
-   }
-   rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
-   ICE_RXQ_REARM_THRESH;
-   return;
-   }
-   }
-
-#if RTE_IOVA_AS_PA
-   const __m512i iova_offsets =  _mm512_set1_epi64
-   (offsetof(struct rte_mbuf, buf_iova));
-#else
-   const __m512i iova_offsets =  _mm512_set1_epi64
-   (offsetof(struct rte_mbuf, buf_addr));
-#endif
-   const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
-
-#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
-   /* shuffle the iova into correct slots. Values 4-7 will contain
-* zeros, so use 7 for a zero-value.
-*/
-   const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);
-#else
-   const __m512i permute_idx = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
-#endif
-
-   /* fill up the rxd in vector, process 8 mbufs in one loop */
-   for (i = 0; i < ICE_RXQ_REARM_THRESH / 8; i++) {
-   const __m512i mbuf_ptrs = _mm512_loadu_si512
-   (&cache->objs[cache->len - 8]);
-   _mm512_store_si512(rxep, mbuf_ptrs);
-
-   /* gather iova of mbuf0-7 into one zmm reg */
-   const __m512i iova_base_addrs = _mm512_i64gather_epi64
-   (_mm512_add_epi64(mbuf_ptrs, iova_offsets),
-   0, /* base */
-   1  /* scale */);
-   const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
-   headroom);
-#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
-   const __m512i iovas0 = _mm512_castsi256_si512
-   (_mm512_extracti64x4_epi64(iova_addrs, 0));
-   const __m512i iovas1 = _mm512_castsi256_si512
-   (_mm512_extracti64x4_epi64(iova_addrs, 1));
-
-   /* permute leaves iova 2-3 in hdr_addr of desc 0-1
-* but these are ignored by driver since header split not
-* enabled. Similarly for desc 4 & 5.
-*/
-   const __m512i desc0_1 = _mm512_permutexvar_epi64
-   (permute_idx, iovas0);
-   const __m512i desc2_3 = _mm512_bsrli_epi128(desc0_1, 8);
-
-   const __m512i desc4_5 = _mm512_permutexvar_epi64
-   (permute_idx, iovas1);
-   const __m512i desc6_7 = _mm512_bsrli_epi128(desc4_5, 8);

[PATCH] net/i40e: remove avx512 specific Rx queue rearm code

2023-02-06 Thread Wenzhuo Lu
'i40e_rxq_rearm' in the avx512 path was optimized to improve performance.
But after commit a2833ecc5ea4 ("mempool: fix get objects from mempool
with cache"), this avx512-specific optimization is no longer necessary.
This patch removes the unnecessary PMD-specific optimization to make the
code easier to maintain and to benefit from enhancements to the common
library.

Reported-by: Haijun Chu 
Signed-off-by: Wenzhuo Lu 
---
 drivers/net/i40e/i40e_rxtx_vec_avx512.c | 125 +---
 1 file changed, 1 insertion(+), 124 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index 60c97d5331..d3c7bfd121 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -24,130 +24,7 @@
 static __rte_always_inline void
 i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 {
-   int i;
-   uint16_t rx_id;
-   volatile union i40e_rx_desc *rxdp;
-   struct i40e_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
-   struct rte_mempool_cache *cache = rte_mempool_default_cache(rxq->mp,
-   rte_lcore_id());
-
-   rxdp = rxq->rx_ring + rxq->rxrearm_start;
-
-   if (unlikely(!cache))
-   return i40e_rxq_rearm_common(rxq, true);
-
-   /* We need to pull 'n' more MBUFs into the software ring from mempool
-* We inline the mempool function here, so we can vectorize the copy
-* from the cache into the shadow ring.
-*/
-
-   if (cache->len < RTE_I40E_RXQ_REARM_THRESH) {
-   /* No. Backfill the cache first, and then fill from it */
-   uint32_t req = RTE_I40E_RXQ_REARM_THRESH + (cache->size -
-   cache->len);
-
-   /* How many do we require
-* i.e. number to fill the cache + the request
-*/
-   int ret = rte_mempool_ops_dequeue_bulk(rxq->mp,
-   &cache->objs[cache->len], req);
-   if (ret == 0) {
-   cache->len += req;
-   } else {
-   if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
-   rxq->nb_rx_desc) {
-   __m128i dma_addr0;
-
-   dma_addr0 = _mm_setzero_si128();
-   for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
-   rxep[i].mbuf = &rxq->fake_mbuf;
-   _mm_store_si128
-   ((__m128i *)&rxdp[i].read,
-   dma_addr0);
-   }
-   }
-   rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
-   RTE_I40E_RXQ_REARM_THRESH;
-   return;
-   }
-   }
-
-   const __m512i iova_offsets =  _mm512_set1_epi64
-   (offsetof(struct rte_mbuf, buf_iova));
-   const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
-
-#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
-   /* to shuffle the addresses to correct slots. Values 4-7 will contain
-* zeros, so use 7 for a zero-value.
-*/
-   const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);
-#else
-   const __m512i permute_idx = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
-#endif
-
-   /* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
-* from mempool cache and populating both shadow and HW rings
-*/
-   for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH / 8; i++) {
-   const __m512i mbuf_ptrs = _mm512_loadu_si512
-   (&cache->objs[cache->len - 8]);
-   _mm512_store_si512(rxep, mbuf_ptrs);
-
-   /* gather iova of mbuf0-7 into one zmm reg */
-   const __m512i iova_base_addrs = _mm512_i64gather_epi64
-   (_mm512_add_epi64(mbuf_ptrs, iova_offsets),
-   0, /* base */
-   1 /* scale */);
-   const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
-   headroom);
-#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
-   const __m512i iovas0 = _mm512_castsi256_si512
-   (_mm512_extracti64x4_epi64(iova_addrs, 0));
-   const __m512i iovas1 = _mm512_castsi256_si512
-   (_mm512_extracti64x4_epi64(iova_addrs, 1));
-
-   /* permute leaves desc 2-3 addresses in header address slots 0-1
-* but these are ignored by driver since header split not
-* enabled. Similarly for desc 4 & 5.
-*/
- 

[PATCH] net/ice: remove avx512 specific Rx queue rearm code

2023-02-06 Thread Wenzhuo Lu
'ice_rxq_rearm' in the avx512 path was optimized to improve performance.
But after commit a2833ecc5ea4 ("mempool: fix get objects from mempool
with cache"), this avx512-specific optimization is no longer necessary.
This patch removes the unnecessary PMD-specific optimization to make the
code easier to maintain and to benefit from enhancements to the common
library.

Reported-by: Haijun Chu 
Signed-off-by: Wenzhuo Lu 
---
 drivers/net/ice/ice_rxtx_vec_avx512.c | 115 +-
 1 file changed, 1 insertion(+), 114 deletions(-)

diff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c b/drivers/net/ice/ice_rxtx_vec_avx512.c
index 5bfd5152df..569d485c2c 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx512.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx512.c
@@ -16,120 +16,7 @@
 static __rte_always_inline void
 ice_rxq_rearm(struct ice_rx_queue *rxq)
 {
-   int i;
-   uint16_t rx_id;
-   volatile union ice_rx_flex_desc *rxdp;
-   struct ice_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
-   struct rte_mempool_cache *cache = rte_mempool_default_cache(rxq->mp,
-   rte_lcore_id());
-
-   rxdp = rxq->rx_ring + rxq->rxrearm_start;
-
-   if (unlikely(!cache))
-   return ice_rxq_rearm_common(rxq, true);
-
-   /* We need to pull 'n' more MBUFs into the software ring */
-   if (cache->len < ICE_RXQ_REARM_THRESH) {
-   uint32_t req = ICE_RXQ_REARM_THRESH + (cache->size -
-   cache->len);
-
-   int ret = rte_mempool_ops_dequeue_bulk(rxq->mp,
-   &cache->objs[cache->len], req);
-   if (ret == 0) {
-   cache->len += req;
-   } else {
-   if (rxq->rxrearm_nb + ICE_RXQ_REARM_THRESH >=
-   rxq->nb_rx_desc) {
-   __m128i dma_addr0;
-
-   dma_addr0 = _mm_setzero_si128();
-   for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
-   rxep[i].mbuf = &rxq->fake_mbuf;
-   _mm_store_si128
-   ((__m128i *)&rxdp[i].read,
-   dma_addr0);
-   }
-   }
-   rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
-   ICE_RXQ_REARM_THRESH;
-   return;
-   }
-   }
-
-   const __m512i iova_offsets =  _mm512_set1_epi64
-   (offsetof(struct rte_mbuf, buf_iova));
-   const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
-
-#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
-   /* shuffle the iova into correct slots. Values 4-7 will contain
-* zeros, so use 7 for a zero-value.
-*/
-   const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);
-#else
-   const __m512i permute_idx = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
-#endif
-
-   /* fill up the rxd in vector, process 8 mbufs in one loop */
-   for (i = 0; i < ICE_RXQ_REARM_THRESH / 8; i++) {
-   const __m512i mbuf_ptrs = _mm512_loadu_si512
-   (&cache->objs[cache->len - 8]);
-   _mm512_store_si512(rxep, mbuf_ptrs);
-
-   /* gather iova of mbuf0-7 into one zmm reg */
-   const __m512i iova_base_addrs = _mm512_i64gather_epi64
-   (_mm512_add_epi64(mbuf_ptrs, iova_offsets),
-   0, /* base */
-   1  /* scale */);
-   const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
-   headroom);
-#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
-   const __m512i iovas0 = _mm512_castsi256_si512
-   (_mm512_extracti64x4_epi64(iova_addrs, 0));
-   const __m512i iovas1 = _mm512_castsi256_si512
-   (_mm512_extracti64x4_epi64(iova_addrs, 1));
-
-   /* permute leaves iova 2-3 in hdr_addr of desc 0-1
-* but these are ignored by driver since header split not
-* enabled. Similarly for desc 4 & 5.
-*/
-   const __m512i desc0_1 = _mm512_permutexvar_epi64
-   (permute_idx, iovas0);
-   const __m512i desc2_3 = _mm512_bsrli_epi128(desc0_1, 8);
-
-   const __m512i desc4_5 = _mm512_permutexvar_epi64
-   (permute_idx, iovas1);
-   const __m512i desc6_7 = _mm512_bsrli_epi128(desc4_5, 8);
-
-   _mm512_store_si512((void *)rxdp, desc0_1);
-   _mm512_store_si512((void *)(rxdp + 2), desc2_3);
-   _mm512_store_si512((void *)(rxdp + 4), desc4_5);

[PATCH] net/i40e: remove avx512 specific Rx queue rearm code

2023-02-06 Thread Wenzhuo Lu
'i40e_rxq_rearm' in the avx512 path was optimized to improve performance.
But after commit a2833ecc5ea4 ("mempool: fix get objects from mempool
with cache"), this avx512-specific optimization is no longer necessary.
This patch removes the unnecessary PMD-specific optimization to make the
code easier to maintain and to benefit from enhancements to the common
library.

Reported-by: Haijun Chu 
Signed-off-by: Wenzhuo Lu 
---
 drivers/net/i40e/i40e_rxtx_vec_avx512.c | 125 +---
 1 file changed, 1 insertion(+), 124 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index 60c97d5331..d3c7bfd121 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -24,130 +24,7 @@
 static __rte_always_inline void
 i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 {
-   int i;
-   uint16_t rx_id;
-   volatile union i40e_rx_desc *rxdp;
-   struct i40e_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
-   struct rte_mempool_cache *cache = rte_mempool_default_cache(rxq->mp,
-   rte_lcore_id());
-
-   rxdp = rxq->rx_ring + rxq->rxrearm_start;
-
-   if (unlikely(!cache))
-   return i40e_rxq_rearm_common(rxq, true);
-
-   /* We need to pull 'n' more MBUFs into the software ring from mempool
-* We inline the mempool function here, so we can vectorize the copy
-* from the cache into the shadow ring.
-*/
-
-   if (cache->len < RTE_I40E_RXQ_REARM_THRESH) {
-   /* No. Backfill the cache first, and then fill from it */
-   uint32_t req = RTE_I40E_RXQ_REARM_THRESH + (cache->size -
-   cache->len);
-
-   /* How many do we require
-* i.e. number to fill the cache + the request
-*/
-   int ret = rte_mempool_ops_dequeue_bulk(rxq->mp,
-   &cache->objs[cache->len], req);
-   if (ret == 0) {
-   cache->len += req;
-   } else {
-   if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
-   rxq->nb_rx_desc) {
-   __m128i dma_addr0;
-
-   dma_addr0 = _mm_setzero_si128();
-   for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
-   rxep[i].mbuf = &rxq->fake_mbuf;
-   _mm_store_si128
-   ((__m128i *)&rxdp[i].read,
-   dma_addr0);
-   }
-   }
-   rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
-   RTE_I40E_RXQ_REARM_THRESH;
-   return;
-   }
-   }
-
-   const __m512i iova_offsets =  _mm512_set1_epi64
-   (offsetof(struct rte_mbuf, buf_iova));
-   const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
-
-#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
-   /* to shuffle the addresses to correct slots. Values 4-7 will contain
-* zeros, so use 7 for a zero-value.
-*/
-   const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);
-#else
-   const __m512i permute_idx = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
-#endif
-
-   /* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
-* from mempool cache and populating both shadow and HW rings
-*/
-   for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH / 8; i++) {
-   const __m512i mbuf_ptrs = _mm512_loadu_si512
-   (&cache->objs[cache->len - 8]);
-   _mm512_store_si512(rxep, mbuf_ptrs);
-
-   /* gather iova of mbuf0-7 into one zmm reg */
-   const __m512i iova_base_addrs = _mm512_i64gather_epi64
-   (_mm512_add_epi64(mbuf_ptrs, iova_offsets),
-   0, /* base */
-   1 /* scale */);
-   const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
-   headroom);
-#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
-   const __m512i iovas0 = _mm512_castsi256_si512
-   (_mm512_extracti64x4_epi64(iova_addrs, 0));
-   const __m512i iovas1 = _mm512_castsi256_si512
-   (_mm512_extracti64x4_epi64(iova_addrs, 1));
-
-   /* permute leaves desc 2-3 addresses in header address slots 0-1
-* but these are ignored by driver since header split not
-* enabled. Similarly for desc 4 & 5.
-*/
- 

[PATCH] net/iavf: remove avx512 specific Rx queue rearm code

2023-02-06 Thread Wenzhuo Lu
'iavf_rxq_rearm' in the avx512 path was optimized to improve performance.
But after commit a2833ecc5ea4 ("mempool: fix get objects from mempool
with cache"), this avx512-specific optimization is no longer necessary.
This patch removes the unnecessary PMD-specific optimization to make the
code easier to maintain and to benefit from enhancements to the common
library.

Reported-by: Haijun Chu 
Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 126 +---
 1 file changed, 1 insertion(+), 125 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index b416a716cf..0abedbb3bb 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -32,131 +32,7 @@
 static __rte_always_inline void
 iavf_rxq_rearm(struct iavf_rx_queue *rxq)
 {
-   int i;
-   uint16_t rx_id;
-   volatile union iavf_rx_desc *rxdp;
-   struct rte_mempool_cache *cache =
-   rte_mempool_default_cache(rxq->mp, rte_lcore_id());
-   struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start];
-
-   rxdp = rxq->rx_ring + rxq->rxrearm_start;
-
-   if (unlikely(!cache))
-   return iavf_rxq_rearm_common(rxq, true);
-
-   /* We need to pull 'n' more MBUFs into the software ring from mempool
-* We inline the mempool function here, so we can vectorize the copy
-* from the cache into the shadow ring.
-*/
-
-   /* Can this be satisfied from the cache? */
-   if (cache->len < IAVF_RXQ_REARM_THRESH) {
-   /* No. Backfill the cache first, and then fill from it */
-   uint32_t req = IAVF_RXQ_REARM_THRESH + (cache->size -
-   cache->len);
-
-   /* How many do we require i.e. number to fill the cache + the request */
-   int ret = rte_mempool_ops_dequeue_bulk
-   (rxq->mp, &cache->objs[cache->len], req);
-   if (ret == 0) {
-   cache->len += req;
-   } else {
-   if (rxq->rxrearm_nb + IAVF_RXQ_REARM_THRESH >=
-   rxq->nb_rx_desc) {
-   __m128i dma_addr0;
-
-   dma_addr0 = _mm_setzero_si128();
-   for (i = 0; i < IAVF_VPMD_DESCS_PER_LOOP; i++) {
-   rxp[i] = &rxq->fake_mbuf;
-   _mm_storeu_si128((__m128i *)&rxdp[i].read,
-dma_addr0);
-   }
-   }
-   rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
-   IAVF_RXQ_REARM_THRESH;
-   return;
-   }
-   }
-
-   const __m512i iova_offsets =  _mm512_set1_epi64(offsetof
-   (struct rte_mbuf, buf_iova));
-   const __m512i headroom = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
-
-#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
-   /* to shuffle the addresses to correct slots. Values 4-7 will contain
-* zeros, so use 7 for a zero-value.
-*/
-   const __m512i permute_idx = _mm512_set_epi64(7, 7, 3, 1, 7, 7, 2, 0);
-#else
-   const __m512i permute_idx = _mm512_set_epi64(7, 3, 6, 2, 5, 1, 4, 0);
-#endif
-
-   /* Initialize the mbufs in vector, process 8 mbufs in one loop, taking
-* from mempool cache and populating both shadow and HW rings
-*/
-   for (i = 0; i < IAVF_RXQ_REARM_THRESH / IAVF_DESCS_PER_LOOP_AVX; i++) {
-   const __m512i mbuf_ptrs = _mm512_loadu_si512
-   (&cache->objs[cache->len - IAVF_DESCS_PER_LOOP_AVX]);
-   _mm512_storeu_si512(rxp, mbuf_ptrs);
-
-   const __m512i iova_base_addrs = _mm512_i64gather_epi64
-   (_mm512_add_epi64(mbuf_ptrs, iova_offsets),
-0, /* base */
-1  /* scale */);
-   const __m512i iova_addrs = _mm512_add_epi64(iova_base_addrs,
-   headroom);
-#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
-   const __m512i iovas0 = _mm512_castsi256_si512
-   (_mm512_extracti64x4_epi64(iova_addrs, 0));
-   const __m512i iovas1 = _mm512_castsi256_si512
-   (_mm512_extracti64x4_epi64(iova_addrs, 1));
-
-   /* permute leaves desc 2-3 addresses in header address slots 0-1
-* but these are ignored by driver since header split not
-* enabled. Similarly for desc 6 & 7.
- 

[PATCH] net/iavf: fix vlan offload issue

2022-11-01 Thread Wenzhuo Lu
HW VLAN offload cannot be enabled because the VLAN-extend bit is
missing from the advertised Rx offload capability flags.

Fixes: eff56a7b9f97 ("net/iavf: add offload path for Rx AVX512")
Cc: sta...@dpdk.org

Signed-off-by: Wenzhuo Lu 
---
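Note: ethdev rejects Rx offloads that a port does not advertise, which
is why the missing capability bit blocked HW VLAN offload entirely. A
short, hypothetical application-side check (port_id and the configure
step are illustrative):

/* Hypothetical snippet: request VLAN extend only when advertised. */
struct rte_eth_dev_info dev_info;
struct rte_eth_conf port_conf = { 0 };
uint16_t port_id = 0; /* example port */

rte_eth_dev_info_get(port_id, &dev_info);
if (dev_info.rx_offload_capa & RTE_ETH_RX_OFFLOAD_VLAN_EXTEND)
	port_conf.rxmode.offloads |= RTE_ETH_RX_OFFLOAD_VLAN_EXTEND;
/* Before this fix the capability bit was never set, so the request
 * above could not be made through the documented interface. */
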
 drivers/net/iavf/iavf_ethdev.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c
index 8d9f3a6..3196210 100644
--- a/drivers/net/iavf/iavf_ethdev.c
+++ b/drivers/net/iavf/iavf_ethdev.c
@@ -1125,6 +1125,7 @@ static int iavf_config_rx_queues_irqs(struct rte_eth_dev 
*dev,
RTE_ETH_RX_OFFLOAD_OUTER_IPV4_CKSUM |
RTE_ETH_RX_OFFLOAD_SCATTER |
RTE_ETH_RX_OFFLOAD_VLAN_FILTER |
+   RTE_ETH_RX_OFFLOAD_VLAN_EXTEND |
RTE_ETH_RX_OFFLOAD_RSS_HASH;
 
dev_info->tx_offload_capa =
-- 
1.8.3.1



[dpdk-dev] [PATCH v3 2/2] net/ice: add Rx AVX2 offload path

2021-06-28 Thread Wenzhuo Lu
Add a specific path for Rx AVX2 which supports the HW offload
features: checksum, VLAN stripping and RSS hash.
This path is chosen automatically according to the
configuration.

'inline' is used, so the compiler generates a specialized
copy of the shared code for each path.

Signed-off-by: Wenzhuo Lu 
---
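Note: the 'inline' technique mentioned above is one always-inline
worker taking a constant offload flag, instantiated by thin exported
wrappers; the compiler then emits a specialized copy per path. A
schematic sketch (the worker body is a placeholder, not the real
descriptor handling):

/* Schematic of the inline-specialization pattern used by this series. */
static __rte_always_inline uint16_t
_ice_recv_raw_pkts_vec_avx2(struct ice_rx_queue *rxq,
			    struct rte_mbuf **rx_pkts,
			    uint16_t nb_pkts, bool do_offload)
{
	uint16_t nb_rx = 0;

	/* ...common AVX2 descriptor parsing would go here... */
	if (do_offload) {
		/* checksum flags, VLAN stripping and RSS hash extraction
		 * are compiled in only for the offload instantiation */
	}
	return nb_rx;
}

uint16_t
ice_recv_pkts_vec_avx2(void *rx_queue, struct rte_mbuf **rx_pkts,
		       uint16_t nb_pkts)
{
	return _ice_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, false);
}

uint16_t
ice_recv_pkts_vec_avx2_offload(void *rx_queue, struct rte_mbuf **rx_pkts,
			       uint16_t nb_pkts)
{
	return _ice_recv_raw_pkts_vec_avx2(rx_queue, rx_pkts, nb_pkts, true);
}
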
 doc/guides/rel_notes/release_21_08.rst |   6 +
 drivers/net/ice/ice_rxtx.c |  50 --
 drivers/net/ice/ice_rxtx.h |   5 +
 drivers/net/ice/ice_rxtx_vec_avx2.c| 296 +++--
 4 files changed, 217 insertions(+), 140 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_08.rst 
b/doc/guides/rel_notes/release_21_08.rst
index a6ecfdf..203b772 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -55,6 +55,12 @@ New Features
  Also, make sure to start the actual text at the margin.
  ===
 
+* **Updated Intel ice driver.**
+
+  * In AVX2 code, added the new RX and TX paths to use the HW offload
+features. When the HW offload features are configured to be used, the
+offload paths are chosen automatically. In parallel the support for HW
+offload features was removed from the legacy AVX2 paths.
 
 Removed Items
 -
diff --git a/drivers/net/ice/ice_rxtx.c b/drivers/net/ice/ice_rxtx.c
index 5419047..97c3d80 100644
--- a/drivers/net/ice/ice_rxtx.c
+++ b/drivers/net/ice/ice_rxtx.c
@@ -1995,7 +1995,9 @@
dev->rx_pkt_burst == ice_recv_scattered_pkts_vec_avx512_offload ||
 #endif
dev->rx_pkt_burst == ice_recv_pkts_vec_avx2 ||
-   dev->rx_pkt_burst == ice_recv_scattered_pkts_vec_avx2)
+   dev->rx_pkt_burst == ice_recv_pkts_vec_avx2_offload ||
+   dev->rx_pkt_burst == ice_recv_scattered_pkts_vec_avx2 ||
+   dev->rx_pkt_burst == ice_recv_scattered_pkts_vec_avx2_offload)
return ptypes;
 #endif
 
@@ -3052,7 +3054,7 @@
 #ifdef RTE_ARCH_X86
struct ice_rx_queue *rxq;
int i;
-   int rx_check_ret = 0;
+   int rx_check_ret = -1;
 
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
ad->rx_use_avx512 = false;
@@ -3107,14 +3109,25 @@

ice_recv_scattered_pkts_vec_avx512;
}
 #endif
+   } else if (ad->rx_use_avx2) {
+   if (rx_check_ret == ICE_VECTOR_OFFLOAD_PATH) {
+   PMD_DRV_LOG(NOTICE,
+   "Using AVX2 OFFLOAD Vector 
Scattered Rx (port %d).",
+   dev->data->port_id);
+   dev->rx_pkt_burst =
+   
ice_recv_scattered_pkts_vec_avx2_offload;
+   } else {
+   PMD_DRV_LOG(NOTICE,
+   "Using AVX2 Vector 
Scattered Rx (port %d).",
+   dev->data->port_id);
+   dev->rx_pkt_burst =
+   
ice_recv_scattered_pkts_vec_avx2;
+   }
} else {
PMD_DRV_LOG(DEBUG,
-   "Using %sVector Scattered Rx (port 
%d).",
-   ad->rx_use_avx2 ? "avx2 " : "",
+   "Using Vector Scattered Rx (port %d).",
dev->data->port_id);
-   dev->rx_pkt_burst = ad->rx_use_avx2 ?
-   ice_recv_scattered_pkts_vec_avx2 :
-   ice_recv_scattered_pkts_vec;
+   dev->rx_pkt_burst = ice_recv_scattered_pkts_vec;
}
} else {
if (ad->rx_use_avx512) {
@@ -3133,14 +3146,25 @@
ice_recv_pkts_vec_avx512;
}
 #endif
+   } else if (ad->rx_use_avx2) {
+   if (rx_check_ret == ICE_VECTOR_OFFLOAD_PATH) {
+   PMD_DRV_LOG(NOTICE,
+   "Using AVX2 OFFLOAD Vector 
Rx (port %d).",
+   dev->data->port_id);
+   dev->rx_pkt_burst =
+   ice_recv_pkts_vec_avx2_offload;
+   } else {
+   PMD_DRV_LOG(NO

[dpdk-dev] [PATCH v3 1/2] net/ice: add Tx AVX2 offload path

2021-06-28 Thread Wenzhuo Lu
Add a specific path for Tx AVX2 which supports the HW offload
features: checksum insertion and VLAN insertion.
This path is chosen automatically according to the
configuration.

'inline' is used, so the compiler generates a specialized
copy of the shared code for each path.

Signed-off-by: Wenzhuo Lu 
---
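Note: the offload path pairs the burst function with a tx_pkt_prepare
callback, while the plain vector path keeps it NULL. A condensed sketch
of the pairing this patch installs (simplified from the hunk below):

/* Condensed sketch of the burst/prepare pairing. */
if (tx_check_ret == ICE_VECTOR_OFFLOAD_PATH) {
	dev->tx_pkt_burst   = ice_xmit_pkts_vec_avx2_offload;
	dev->tx_pkt_prepare = ice_prep_pkts; /* needed for cksum offload */
} else {
	dev->tx_pkt_burst   = ad->tx_use_avx2 ? ice_xmit_pkts_vec_avx2 :
						ice_xmit_pkts_vec;
	dev->tx_pkt_prepare = NULL; /* nothing to fix up per packet */
}
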
 drivers/net/ice/ice_rxtx.c  | 37 -
 drivers/net/ice/ice_rxtx.h  |  2 ++
 drivers/net/ice/ice_rxtx_vec_avx2.c | 54 ++---
 3 files changed, 65 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ice/ice_rxtx.c b/drivers/net/ice/ice_rxtx.c
index fc9bb5a..5419047 100644
--- a/drivers/net/ice/ice_rxtx.c
+++ b/drivers/net/ice/ice_rxtx.c
@@ -3288,7 +3288,7 @@
 #ifdef RTE_ARCH_X86
struct ice_tx_queue *txq;
int i;
-   int tx_check_ret = 0;
+   int tx_check_ret = -1;
 
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
ad->tx_use_avx2 = false;
@@ -3307,13 +3307,14 @@
PMD_DRV_LOG(NOTICE,
"AVX512 is not supported in build env");
 #endif
-   if (!ad->tx_use_avx512 && tx_check_ret == ICE_VECTOR_PATH &&
-   (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-   rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+   if (!ad->tx_use_avx512 &&
+   (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+   rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
ad->tx_use_avx2 = true;
 
-   if (!ad->tx_use_avx512 && tx_check_ret == ICE_VECTOR_OFFLOAD_PATH)
+   if (!ad->tx_use_avx2 && !ad->tx_use_avx512 &&
+   tx_check_ret == ICE_VECTOR_OFFLOAD_PATH)
ad->tx_vec_allowed = false;
 
if (ad->tx_vec_allowed) {
@@ -3331,6 +3332,7 @@
}
 
if (ad->tx_vec_allowed) {
+   dev->tx_pkt_prepare = NULL;
if (ad->tx_use_avx512) {
 #ifdef CC_AVX512_SUPPORT
if (tx_check_ret == ICE_VECTOR_OFFLOAD_PATH) {
@@ -3339,6 +3341,7 @@
dev->data->port_id);
dev->tx_pkt_burst =
ice_xmit_pkts_vec_avx512_offload;
+   dev->tx_pkt_prepare = ice_prep_pkts;
} else {
PMD_DRV_LOG(NOTICE,
"Using AVX512 Vector Tx (port %d).",
@@ -3347,14 +3350,22 @@
}
 #endif
} else {
-   PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
-   ad->tx_use_avx2 ? "avx2 " : "",
-   dev->data->port_id);
-   dev->tx_pkt_burst = ad->tx_use_avx2 ?
-   ice_xmit_pkts_vec_avx2 :
-   ice_xmit_pkts_vec;
+   if (tx_check_ret == ICE_VECTOR_OFFLOAD_PATH) {
+   PMD_DRV_LOG(NOTICE,
+   "Using AVX2 OFFLOAD Vector Tx (port 
%d).",
+   dev->data->port_id);
+   dev->tx_pkt_burst =
+   ice_xmit_pkts_vec_avx2_offload;
+   dev->tx_pkt_prepare = ice_prep_pkts;
+   } else {
+   PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port 
%d).",
+   ad->tx_use_avx2 ? "avx2 " : "",
+   dev->data->port_id);
+   dev->tx_pkt_burst = ad->tx_use_avx2 ?
+   ice_xmit_pkts_vec_avx2 :
+   ice_xmit_pkts_vec;
+   }
}
-   dev->tx_pkt_prepare = NULL;
 
return;
}
diff --git a/drivers/net/ice/ice_rxtx.h b/drivers/net/ice/ice_rxtx.h
index 86b6f3d..f0536f7 100644
--- a/drivers/net/ice/ice_rxtx.h
+++ b/drivers/net/ice/ice_rxtx.h
@@ -255,6 +255,8 @@ uint16_t ice_recv_scattered_pkts_vec_avx2(void *rx_queue,
  uint16_t nb_pkts);
 uint16_t ice_xmit_pkts_vec_avx2(void *tx_

[dpdk-dev] [PATCH v3 0/2] add Rx/Tx offload paths for ICE AVX2

2021-06-28 Thread Wenzhuo Lu
Add specific Rx/Tx AVX2 paths, called offload paths.
These paths support the HW offload features (checksum, VLAN and RSS
offload) and are chosen automatically according to the configuration.

v2:
 - fdir should be supported by offload and normal path.

v3:
 - rebased on the newest code.

Wenzhuo Lu (2):
  net/ice: add Tx AVX2 offload path
  net/ice: add Rx AVX2 offload path

 doc/guides/rel_notes/release_21_08.rst |   6 +
 drivers/net/ice/ice_rxtx.c |  87 +---
 drivers/net/ice/ice_rxtx.h |   7 +
 drivers/net/ice/ice_rxtx_vec_avx2.c| 350 +++--
 4 files changed, 282 insertions(+), 168 deletions(-)

-- 
1.8.3.1



[dpdk-dev] [PATCH v2 2/2] net/ice: add Rx AVX2 offload path

2021-06-28 Thread Wenzhuo Lu
Add a specific path for Rx AVX2 which supports the HW offload
features: checksum, VLAN stripping and RSS hash.
This path is chosen automatically according to the
configuration.

'inline' is used, so the compiler generates a specialized
copy of the shared code for each path.

Signed-off-by: Wenzhuo Lu 
---
 doc/guides/rel_notes/release_21_08.rst |   6 +
 drivers/net/ice/ice_rxtx.c |  50 --
 drivers/net/ice/ice_rxtx.h |   5 +
 drivers/net/ice/ice_rxtx_vec_avx2.c| 296 +++--
 4 files changed, 217 insertions(+), 140 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_08.rst 
b/doc/guides/rel_notes/release_21_08.rst
index a6ecfdf..203b772 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -55,6 +55,12 @@ New Features
  Also, make sure to start the actual text at the margin.
  ===
 
+* **Updated Intel ice driver.**
+
+  * In AVX2 code, added the new RX and TX paths to use the HW offload
+features. When the HW offload features are configured to be used, the
+offload paths are chosen automatically. In parallel the support for HW
+offload features was removed from the legacy AVX2 paths.
 
 Removed Items
 -
diff --git a/drivers/net/ice/ice_rxtx.c b/drivers/net/ice/ice_rxtx.c
index 5d7ca60..27fd248 100644
--- a/drivers/net/ice/ice_rxtx.c
+++ b/drivers/net/ice/ice_rxtx.c
@@ -1999,7 +1999,9 @@
dev->rx_pkt_burst == ice_recv_scattered_pkts_vec_avx512_offload ||
 #endif
dev->rx_pkt_burst == ice_recv_pkts_vec_avx2 ||
-   dev->rx_pkt_burst == ice_recv_scattered_pkts_vec_avx2)
+   dev->rx_pkt_burst == ice_recv_pkts_vec_avx2_offload ||
+   dev->rx_pkt_burst == ice_recv_scattered_pkts_vec_avx2 ||
+   dev->rx_pkt_burst == ice_recv_scattered_pkts_vec_avx2_offload)
return ptypes;
 #endif
 
@@ -3058,7 +3060,7 @@
 #ifdef RTE_ARCH_X86
struct ice_rx_queue *rxq;
int i;
-   int rx_check_ret;
+   int rx_check_ret = -1;
bool use_avx512 = false;
bool use_avx2 = false;
 
@@ -3113,14 +3115,25 @@

ice_recv_scattered_pkts_vec_avx512;
}
 #endif
+   } else if (use_avx2) {
+   if (rx_check_ret == ICE_VECTOR_OFFLOAD_PATH) {
+   PMD_DRV_LOG(NOTICE,
+   "Using AVX2 OFFLOAD Vector 
Scattered Rx (port %d).",
+   dev->data->port_id);
+   dev->rx_pkt_burst =
+   
ice_recv_scattered_pkts_vec_avx2_offload;
+   } else {
+   PMD_DRV_LOG(NOTICE,
+   "Using AVX2 Vector 
Scattered Rx (port %d).",
+   dev->data->port_id);
+   dev->rx_pkt_burst =
+   
ice_recv_scattered_pkts_vec_avx2;
+   }
} else {
PMD_DRV_LOG(DEBUG,
-   "Using %sVector Scattered Rx (port 
%d).",
-   use_avx2 ? "avx2 " : "",
+   "Using Vector Scattered Rx (port %d).",
dev->data->port_id);
-   dev->rx_pkt_burst = use_avx2 ?
-   ice_recv_scattered_pkts_vec_avx2 :
-   ice_recv_scattered_pkts_vec;
+   dev->rx_pkt_burst = ice_recv_scattered_pkts_vec;
}
} else {
if (use_avx512) {
@@ -3139,14 +3152,25 @@
ice_recv_pkts_vec_avx512;
}
 #endif
+   } else if (use_avx2) {
+   if (rx_check_ret == ICE_VECTOR_OFFLOAD_PATH) {
+   PMD_DRV_LOG(NOTICE,
+   "Using AVX2 OFFLOAD Vector 
Rx (port %d).",
+   dev->data->port_id);
+   dev->rx_pkt_burst =
+   ice_recv_pkts_vec_avx2_offload;
+   } else {
+   PMD_DRV_LOG(NOTICE,
+   "Using AVX2 Vector Rx (p

[dpdk-dev] [PATCH v2 1/2] net/ice: add Tx AVX2 offload path

2021-06-28 Thread Wenzhuo Lu
Add a specific path for Tx AVX2 which supports the HW offload
features: checksum insertion and VLAN insertion.
This path is chosen automatically according to the
configuration.

'inline' is used, so the compiler generates a specialized
copy of the shared code for each path.

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/ice/ice_rxtx.c  | 46 ++-
 drivers/net/ice/ice_rxtx.h  |  2 ++
 drivers/net/ice/ice_rxtx_vec_avx2.c | 54 ++---
 3 files changed, 69 insertions(+), 33 deletions(-)

diff --git a/drivers/net/ice/ice_rxtx.c b/drivers/net/ice/ice_rxtx.c
index 49abcb2..5d7ca60 100644
--- a/drivers/net/ice/ice_rxtx.c
+++ b/drivers/net/ice/ice_rxtx.c
@@ -3294,9 +3294,9 @@
 #ifdef RTE_ARCH_X86
struct ice_tx_queue *txq;
int i;
-   int tx_check_ret;
-   bool use_avx512 = false;
-   bool use_avx2 = false;
+   int tx_check_ret = -1;
+   bool cap_avx512 = false;
+   bool cap_avx2 = false;
 
if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
tx_check_ret = ice_tx_vec_dev_check(dev);
@@ -3308,18 +3308,18 @@
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1)
 #ifdef CC_AVX512_SUPPORT
-   use_avx512 = true;
+   cap_avx512 = true;
 #else
PMD_DRV_LOG(NOTICE,
"AVX512 is not supported in build env");
 #endif
-   if (!use_avx512 && tx_check_ret == ICE_VECTOR_PATH &&
-   (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-   rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-   use_avx2 = true;
+   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+   cap_avx2 = true;
 
-   if (!use_avx512 && tx_check_ret == ICE_VECTOR_OFFLOAD_PATH)
+   if (!cap_avx2 && !cap_avx512 &&
+   tx_check_ret == ICE_VECTOR_OFFLOAD_PATH)
ad->tx_vec_allowed = false;
 
if (ad->tx_vec_allowed) {
@@ -3337,7 +3337,8 @@
}
 
if (ad->tx_vec_allowed) {
-   if (use_avx512) {
+   dev->tx_pkt_prepare = NULL;
+   if (cap_avx512) {
 #ifdef CC_AVX512_SUPPORT
if (tx_check_ret == ICE_VECTOR_OFFLOAD_PATH) {
PMD_DRV_LOG(NOTICE,
@@ -3345,6 +3346,7 @@
dev->data->port_id);
dev->tx_pkt_burst =
ice_xmit_pkts_vec_avx512_offload;
+   dev->tx_pkt_prepare = ice_prep_pkts;
} else {
PMD_DRV_LOG(NOTICE,
"Using AVX512 Vector Tx (port %d).",
@@ -3353,14 +3355,22 @@
}
 #endif
} else {
-   PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
-   use_avx2 ? "avx2 " : "",
-   dev->data->port_id);
-   dev->tx_pkt_burst = use_avx2 ?
-   ice_xmit_pkts_vec_avx2 :
-   ice_xmit_pkts_vec;
+   if (tx_check_ret == ICE_VECTOR_OFFLOAD_PATH) {
+   PMD_DRV_LOG(NOTICE,
+   "Using AVX2 OFFLOAD Vector Tx (port 
%d).",
+   dev->data->port_id);
+   dev->tx_pkt_burst =
+   ice_xmit_pkts_vec_avx2_offload;
+   dev->tx_pkt_prepare = ice_prep_pkts;
+   } else {
+   PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port 
%d).",
+   cap_avx2 ? "avx2 " : "",
+   dev->data->port_id);
+   dev->tx_pkt_burst = cap_avx2 ?
+   ice_xmit_pkts_vec_avx2 :
+   ice_xmit_pkts_vec;
+   }
}
-   dev->tx_pkt

[dpdk-dev] [PATCH v2 0/2] add Rx/Tx offload paths for ICE AVX2

2021-06-28 Thread Wenzhuo Lu
Add specific Rx/Tx AVX2 paths, called offload paths.
These paths support the HW offload features (checksum, VLAN and RSS
offload) and are chosen automatically according to the configuration.

v2:
 - fdir should be supported by offload and normal path.

Wenzhuo Lu (2):
  net/ice: add Tx AVX2 offload path
  net/ice: add Rx AVX2 offload path

 doc/guides/rel_notes/release_21_08.rst |   6 +
 drivers/net/ice/ice_rxtx.c |  96 ++---
 drivers/net/ice/ice_rxtx.h |   7 +
 drivers/net/ice/ice_rxtx_vec_avx2.c| 350 +++--
 4 files changed, 286 insertions(+), 173 deletions(-)

-- 
1.8.3.1



[dpdk-dev] [PATCH 2/2] net/ice: add Rx AVX2 offload path

2021-06-01 Thread Wenzhuo Lu
Add a specific path for Rx AVX2 which supports the HW offload
features: checksum, VLAN stripping and RSS hash.
This path is chosen automatically according to the
configuration.

'inline' is used, so the compiler generates a specialized
copy of the shared code for each path.

Signed-off-by: Wenzhuo Lu 
---
 doc/guides/rel_notes/release_21_08.rst |   6 +
 drivers/net/ice/ice_rxtx.c |  50 +++--
 drivers/net/ice/ice_rxtx.h |   5 +
 drivers/net/ice/ice_rxtx_vec_avx2.c| 348 ++---
 4 files changed, 243 insertions(+), 166 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_08.rst 
b/doc/guides/rel_notes/release_21_08.rst
index a6ecfdf..203b772 100644
--- a/doc/guides/rel_notes/release_21_08.rst
+++ b/doc/guides/rel_notes/release_21_08.rst
@@ -55,6 +55,12 @@ New Features
  Also, make sure to start the actual text at the margin.
  ===
 
+* **Updated Intel ice driver.**
+
+  * In AVX2 code, added the new RX and TX paths to use the HW offload
+features. When the HW offload features are configured to be used, the
+offload paths are chosen automatically. In parallel the support for HW
+offload features was removed from the legacy AVX2 paths.
 
 Removed Items
 -
diff --git a/drivers/net/ice/ice_rxtx.c b/drivers/net/ice/ice_rxtx.c
index 7c9474e..4e51fd6 100644
--- a/drivers/net/ice/ice_rxtx.c
+++ b/drivers/net/ice/ice_rxtx.c
@@ -1999,7 +1999,9 @@
dev->rx_pkt_burst == ice_recv_scattered_pkts_vec_avx512_offload ||
 #endif
dev->rx_pkt_burst == ice_recv_pkts_vec_avx2 ||
-   dev->rx_pkt_burst == ice_recv_scattered_pkts_vec_avx2)
+   dev->rx_pkt_burst == ice_recv_pkts_vec_avx2_offload ||
+   dev->rx_pkt_burst == ice_recv_scattered_pkts_vec_avx2 ||
+   dev->rx_pkt_burst == ice_recv_scattered_pkts_vec_avx2_offload)
return ptypes;
 #endif
 
@@ -3058,7 +3060,7 @@
 #ifdef RTE_ARCH_X86
struct ice_rx_queue *rxq;
int i;
-   int rx_check_ret;
+   int rx_check_ret = -1;
bool use_avx512 = false;
bool use_avx2 = false;
 
@@ -3113,14 +3115,25 @@

ice_recv_scattered_pkts_vec_avx512;
}
 #endif
+   } else if (use_avx2) {
+   if (rx_check_ret == ICE_VECTOR_OFFLOAD_PATH) {
+   PMD_DRV_LOG(NOTICE,
+   "Using AVX2 OFFLOAD Vector 
Scattered Rx (port %d).",
+   dev->data->port_id);
+   dev->rx_pkt_burst =
+   
ice_recv_scattered_pkts_vec_avx2_offload;
+   } else {
+   PMD_DRV_LOG(NOTICE,
+   "Using AVX2 Vector 
Scattered Rx (port %d).",
+   dev->data->port_id);
+   dev->rx_pkt_burst =
+   
ice_recv_scattered_pkts_vec_avx2;
+   }
} else {
PMD_DRV_LOG(DEBUG,
-   "Using %sVector Scattered Rx (port 
%d).",
-   use_avx2 ? "avx2 " : "",
+   "Using Vector Scattered Rx (port %d).",
dev->data->port_id);
-   dev->rx_pkt_burst = use_avx2 ?
-   ice_recv_scattered_pkts_vec_avx2 :
-   ice_recv_scattered_pkts_vec;
+   dev->rx_pkt_burst = ice_recv_scattered_pkts_vec;
}
} else {
if (use_avx512) {
@@ -3139,14 +3152,25 @@
ice_recv_pkts_vec_avx512;
}
 #endif
+   } else if (use_avx2) {
+   if (rx_check_ret == ICE_VECTOR_OFFLOAD_PATH) {
+   PMD_DRV_LOG(NOTICE,
+   "Using AVX2 OFFLOAD Vector 
Rx (port %d).",
+   dev->data->port_id);
+   dev->rx_pkt_burst =
+   ice_recv_pkts_vec_avx2_offload;
+   } else {
+   PMD_DRV_LOG(NOTICE,
+   "Using AVX2 Vector Rx (p

[dpdk-dev] [PATCH 0/2] add Rx/Tx offload paths for ICE AVX2

2021-06-01 Thread Wenzhuo Lu
Add specific Rx/Tx AVX2 paths, called offload paths.
These paths support the HW offload features (checksum, VLAN and RSS
offload) and are chosen automatically according to the configuration.

Wenzhuo Lu (2):
  net/ice: add Tx AVX2 offload path
  net/ice: add Rx AVX2 offload path

 doc/guides/rel_notes/release_21_08.rst |   6 +
 drivers/net/ice/ice_rxtx.c |  86 +--
 drivers/net/ice/ice_rxtx.h |   7 +
 drivers/net/ice/ice_rxtx_vec_avx2.c| 402 +++--
 4 files changed, 307 insertions(+), 194 deletions(-)

-- 
1.9.3



[dpdk-dev] [PATCH 1/2] net/ice: add Tx AVX2 offload path

2021-06-01 Thread Wenzhuo Lu
Add a specific path for Tx AVX2 which supports the HW offload
features: checksum insertion and VLAN insertion.
This path is chosen automatically according to the
configuration.

'inline' is used, so the compiler generates a specialized
copy of the shared code for each path.

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/ice/ice_rxtx.c  | 36 -
 drivers/net/ice/ice_rxtx.h  |  2 ++
 drivers/net/ice/ice_rxtx_vec_avx2.c | 54 ++---
 3 files changed, 64 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ice/ice_rxtx.c b/drivers/net/ice/ice_rxtx.c
index 49abcb2..7c9474e 100644
--- a/drivers/net/ice/ice_rxtx.c
+++ b/drivers/net/ice/ice_rxtx.c
@@ -3294,7 +3294,7 @@
 #ifdef RTE_ARCH_X86
struct ice_tx_queue *txq;
int i;
-   int tx_check_ret;
+   int tx_check_ret = -1;
bool use_avx512 = false;
bool use_avx2 = false;
 
@@ -3313,13 +3313,13 @@
PMD_DRV_LOG(NOTICE,
"AVX512 is not supported in build env");
 #endif
-   if (!use_avx512 && tx_check_ret == ICE_VECTOR_PATH &&
-   (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-   rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
use_avx2 = true;
 
-   if (!use_avx512 && tx_check_ret == ICE_VECTOR_OFFLOAD_PATH)
+   if (!use_avx2 && !use_avx512 &&
+   tx_check_ret == ICE_VECTOR_OFFLOAD_PATH)
ad->tx_vec_allowed = false;
 
if (ad->tx_vec_allowed) {
@@ -3337,6 +3337,7 @@
}
 
if (ad->tx_vec_allowed) {
+   dev->tx_pkt_prepare = NULL;
if (use_avx512) {
 #ifdef CC_AVX512_SUPPORT
if (tx_check_ret == ICE_VECTOR_OFFLOAD_PATH) {
@@ -3345,6 +3346,7 @@
dev->data->port_id);
dev->tx_pkt_burst =
ice_xmit_pkts_vec_avx512_offload;
+   dev->tx_pkt_prepare = ice_prep_pkts;
} else {
PMD_DRV_LOG(NOTICE,
"Using AVX512 Vector Tx (port %d).",
@@ -3353,14 +3355,22 @@
}
 #endif
} else {
-   PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
-   use_avx2 ? "avx2 " : "",
-   dev->data->port_id);
-   dev->tx_pkt_burst = use_avx2 ?
-   ice_xmit_pkts_vec_avx2 :
-   ice_xmit_pkts_vec;
+   if (tx_check_ret == ICE_VECTOR_OFFLOAD_PATH) {
+   PMD_DRV_LOG(NOTICE,
+   "Using AVX2 OFFLOAD Vector Tx (port 
%d).",
+   dev->data->port_id);
+   dev->tx_pkt_burst =
+   ice_xmit_pkts_vec_avx2_offload;
+   dev->tx_pkt_prepare = ice_prep_pkts;
+   } else {
+   PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port 
%d).",
+   use_avx2 ? "avx2 " : "",
+   dev->data->port_id);
+   dev->tx_pkt_burst = use_avx2 ?
+   ice_xmit_pkts_vec_avx2 :
+   ice_xmit_pkts_vec;
+   }
}
-   dev->tx_pkt_prepare = NULL;
 
return;
}
diff --git a/drivers/net/ice/ice_rxtx.h b/drivers/net/ice/ice_rxtx.h
index b29387c..595dc66 100644
--- a/drivers/net/ice/ice_rxtx.h
+++ b/drivers/net/ice/ice_rxtx.h
@@ -255,6 +255,8 @@ uint16_t ice_recv_scattered_pkts_vec_avx2(void *rx_queue,
  uint16_t nb_pkts);
 uint16_t ice_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
uint16_t nb_pkts);
+uint16_t ice_xmit_pkts_vec_avx2_offload(void *tx_queue, struct rte_mbuf 
**tx_pkts,
+   uint16_t n

[dpdk-dev] [PATCH] net/iavf: fix coverity issue

2021-05-09 Thread Wenzhuo Lu
The coverity issue:
"CID 370606:  Control flow issues  (DEADCODE)
Execution cannot reach the expression "use_avx2"
inside this statement: "if (!use_sse && !use_avx2 &..."."

After commit bb3ef9aaa478, this check is dead code, so remove it.

Fixes: bb3ef9aaa478 ("net/iavf: fix Rx function selection")

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 87f7eeb..74b5ab5 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2393,7 +2393,6 @@
struct iavf_rx_queue *rxq;
int i;
int check_ret;
-   bool use_sse = false;
bool use_avx2 = false;
bool use_avx512 = false;
bool use_flex = false;
@@ -2401,7 +2400,6 @@
check_ret = iavf_rx_vec_dev_check(dev);
if (check_ret >= 0 &&
rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-   use_sse = true;
if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
 rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
@@ -2414,9 +2412,6 @@
use_avx512 = true;
 #endif
 
-   if (!use_sse && !use_avx2 && !use_avx512)
-   goto normal;
-
if (vf->vf_res->vf_cap_flags &
VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
use_flex = true;
@@ -2520,7 +2515,6 @@
return;
}
 
-normal:
 #endif
if (dev->data->scattered_rx) {
PMD_DRV_LOG(DEBUG, "Using a Scattered Rx callback (port=%d).",
-- 
1.9.3



[dpdk-dev] [PATCH] net/iavf: fix performance drop

2021-04-28 Thread Wenzhuo Lu
The performance drop is caused by the scalar Rx path being
selected when AVX512 is disabled and some HW offload
is enabled.
Actually, the HW offload is supported by AVX2 and SSE,
so in this scenario the AVX2 path should be chosen.

This patch removes the offload-related check for SSE and AVX2,
as SSE and AVX2 do support the offload features.
There is no functional change to the data paths themselves.

Fixes: eff56a7b9f97 ("net/iavf: add offload path for Rx AVX512")

Signed-off-by: Wenzhuo Lu 
---
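Note: the scenario is reproducible by capping the permitted SIMD width,
e.g. with the EAL option --force-max-simd-bitwidth=256, which rules out
AVX512 while leaving AVX2 eligible. The programmatic equivalent (a
sketch; it must run before the Rx/Tx paths are selected):

#include <rte_vect.h>

/* Cap SIMD width at 256 bits: AVX512 paths are skipped and, with this
 * fix, the AVX2 path is chosen even when HW offloads are enabled. */
if (rte_vect_set_max_simd_bitwidth(RTE_VECT_SIMD_256) != 0)
	rte_exit(EXIT_FAILURE, "cannot cap SIMD bitwidth\n");
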
 drivers/net/iavf/iavf_rxtx.c | 12 +---
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 3f3cf63..0ba19dbf 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2401,13 +2401,11 @@
check_ret = iavf_rx_vec_dev_check(dev);
if (check_ret >= 0 &&
rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-   if (check_ret == IAVF_VECTOR_PATH) {
-   use_sse = true;
-   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-   use_avx2 = true;
-   }
+   use_sse = true;
+   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+   use_avx2 = true;
 
 #ifdef CC_AVX512_SUPPORT
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
-- 
1.9.3



[dpdk-dev] [PATCH] net/iavf: fix l4 checksum error

2021-04-26 Thread Wenzhuo Lu
To match the behavior of the scalar path, packet preparation
is necessary for the checksum offload.

Fixes: 059f18ae2aec ("net/iavf: add offload path for Tx AVX512")

Signed-off-by: Wenzhuo Lu 
---
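Note: from the application side, the contract restored here is the
usual prepare-then-burst sequence when Tx checksum offload is enabled
(sketch; port_id/queue_id/pkts are illustrative and handle_bad_pkt()
is a hypothetical helper):

/* Run the prepare stage so the PMD can patch pseudo-header checksums
 * before the packets reach the hardware. */
uint16_t nb_prep = rte_eth_tx_prepare(port_id, queue_id, pkts, nb_pkts);

if (nb_prep < nb_pkts)
	handle_bad_pkt(pkts[nb_prep]); /* rte_errno says why it failed */

uint16_t nb_tx = rte_eth_tx_burst(port_id, queue_id, pkts, nb_prep);
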
 drivers/net/iavf/iavf_rxtx.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 0ba19dbf..87f7eeb 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2585,6 +2585,7 @@
iavf_xmit_pkts_vec_avx2 :
iavf_xmit_pkts_vec;
}
+   dev->tx_pkt_prepare = NULL;
 #ifdef CC_AVX512_SUPPORT
if (use_avx512) {
if (check_ret == IAVF_VECTOR_PATH) {
@@ -2593,12 +2594,12 @@
dev->data->port_id);
} else {
dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512_offload;
+   dev->tx_pkt_prepare = iavf_prep_pkts;
PMD_DRV_LOG(DEBUG, "Using AVX512 OFFLOAD Vector Tx (port %d).",
dev->data->port_id);
}
}
 #endif
-   dev->tx_pkt_prepare = NULL;
 
for (i = 0; i < dev->data->nb_tx_queues; i++) {
txq = dev->data->tx_queues[i];
-- 
1.9.3



[dpdk-dev] [PATCH] net/iavf: fix performance drop

2021-04-25 Thread Wenzhuo Lu
AVX2 and SSE don't have a separate offload path, so no
offload check is necessary there; otherwise the scalar
path would be chosen.

Fixes: eff56a7b9f97 ("net/iavf: add offload path for Rx AVX512")

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx.c | 12 +---
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 3f3cf63..0ba19dbf 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2401,13 +2401,11 @@
check_ret = iavf_rx_vec_dev_check(dev);
if (check_ret >= 0 &&
rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-   if (check_ret == IAVF_VECTOR_PATH) {
-   use_sse = true;
-   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-   use_avx2 = true;
-   }
+   use_sse = true;
+   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+   use_avx2 = true;
 
 #ifdef CC_AVX512_SUPPORT
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
-- 
1.9.3



[dpdk-dev] [PATCH] net/i40e: fix potential Tx hang

2021-04-19 Thread Wenzhuo Lu
A Tx hang may happen if there is no memory barrier between
the descriptor writes and the tail register update.

Fixes: b4669bb95038 ("i40e: add vector Tx")
Fixes: aed68d5b0e81 ("net/i40e: add AVX2 Tx function")
Fixes: e6a6a138919f ("net/i40e: add AVX512 vector path")
Cc: sta...@dpdk.org

Reported-by: Tao Yang 
Signed-off-by: Wenzhuo Lu 
---
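Note: descriptor writes and the tail-register doorbell are both stores;
without a barrier the doorbell can become visible to the device first,
and the NIC may fetch stale descriptors. The ordering the fix enforces,
sketched (qw1 and the single store stand in for the vector stores):

/* 1. Write the Tx descriptors (vector stores in the real code). */
txq->tx_ring[tx_id].cmd_type_offset_bsz = qw1; /* illustrative store */

/* 2. Make the descriptor stores visible before the doorbell. */
rte_io_wmb();

/* 3. Ring the doorbell; the write-combining write is now safe. */
I40E_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
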
 drivers/net/i40e/i40e_rxtx_vec_avx2.c   | 1 +
 drivers/net/i40e/i40e_rxtx_vec_avx512.c | 1 +
 drivers/net/i40e/i40e_rxtx_vec_sse.c| 1 +
 3 files changed, 3 insertions(+)

diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c 
b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 3b9eef9..a06813c 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -816,6 +816,7 @@
 
txq->tx_tail = tx_id;
 
+   rte_io_wmb();
I40E_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
 
return nb_pkts;
diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c 
b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index bd21d64..2f0915e 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -,6 +,7 @@
 
txq->tx_tail = tx_id;
 
+   rte_io_wmb();
I40E_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
 
return nb_pkts;
diff --git a/drivers/net/i40e/i40e_rxtx_vec_sse.c 
b/drivers/net/i40e/i40e_rxtx_vec_sse.c
index bfa5aff..b37bc71 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_sse.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_sse.c
@@ -759,6 +759,7 @@
 
txq->tx_tail = tx_id;
 
+   rte_io_wmb();
I40E_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);
 
return nb_pkts;
-- 
1.9.3



[dpdk-dev] [PATCH v5 4/4] net/iavf: add offload path for Rx AVX512 flex desc

2021-04-14 Thread Wenzhuo Lu
Add a specific path for Rx AVX512 (flexible descriptor) which
supports the HW offload features: checksum, VLAN stripping and
RSS hash.
This path is chosen automatically according to the
configuration.

'inline' is used, so the compiler generates a specialized
copy of the shared code for each path.

Signed-off-by: Wenzhuo Lu 
---
 doc/guides/rel_notes/release_21_05.rst  |   5 +
 drivers/net/iavf/iavf_rxtx.c|  27 +-
 drivers/net/iavf/iavf_rxtx.h|   6 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 436 ++--
 4 files changed, 281 insertions(+), 193 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_05.rst 
b/doc/guides/rel_notes/release_21_05.rst
index 9a666b6..a6aa5c9 100644
--- a/doc/guides/rel_notes/release_21_05.rst
+++ b/doc/guides/rel_notes/release_21_05.rst
@@ -109,6 +109,11 @@ New Features
 
   * Added flow filter to support GTPU inner L3/L4 fields matching.
 
+  * In AVX512 code, added the new RX and TX paths to use the HW offload
+features. When the HW offload features are configured to be used, the
+offload paths are chosen automatically. In parallel the support of HW
+offload features is removed from the legacy AVX512 paths.
+
 * **Updated Intel ice driver.**
 
   * Added Intel ice support on Windows.
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index ca01ed9..3f3cf63 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2420,11 +2420,8 @@
goto normal;
 
if (vf->vf_res->vf_cap_flags &
-   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
use_flex = true;
-   if (use_avx512 && check_ret == IAVF_VECTOR_OFFLOAD_PATH)
-   use_flex = false;
-   }
 
for (i = 0; i < dev->data->nb_rx_queues; i++) {
rxq = dev->data->rx_queues[i];
@@ -2452,9 +2449,14 @@
iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
iavf_recv_scattered_pkts_vec_flex_rxd;
 #ifdef CC_AVX512_SUPPORT
-   if (use_avx512)
-   dev->rx_pkt_burst =
-   iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+   if (use_avx512) {
+   if (check_ret == IAVF_VECTOR_PATH)
+   dev->rx_pkt_burst =
+   iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+   else
+   dev->rx_pkt_burst =
+   iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload;
+   }
 #endif
} else {
dev->rx_pkt_burst = use_avx2 ?
@@ -2491,9 +2493,14 @@
iavf_recv_pkts_vec_avx2_flex_rxd :
iavf_recv_pkts_vec_flex_rxd;
 #ifdef CC_AVX512_SUPPORT
-   if (use_avx512)
-   dev->rx_pkt_burst =
-   iavf_recv_pkts_vec_avx512_flex_rxd;
+   if (use_avx512) {
+   if (check_ret == IAVF_VECTOR_PATH)
+   dev->rx_pkt_burst =
+   iavf_recv_pkts_vec_avx512_flex_rxd;
+   else
+   dev->rx_pkt_burst =
+   iavf_recv_pkts_vec_avx512_flex_rxd_offload;
+   }
 #endif
} else {
dev->rx_pkt_burst = use_avx2 ?
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index a8e5664..19b6028 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -496,6 +496,9 @@ uint16_t iavf_recv_pkts_vec_avx512_offload(void *rx_queue,
 uint16_t iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue,
struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+   struct rte_mbuf **rx_pkts,
+   uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 struct rte_mbuf **rx_pkts,
 

[dpdk-dev] [PATCH v5 3/4] net/iavf: add offload path for Rx AVX512

2021-04-14 Thread Wenzhuo Lu
Add a specific path for Rx AVX512 (traditional descriptor) which
supports the HW offload features: checksum, VLAN stripping and
RSS hash.
This path is chosen automatically according to the
configuration.

'inline' is used, so the compiler generates a specialized
copy of the shared code for each path.

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx.c| 105 +++---
 drivers/net/iavf/iavf_rxtx.h|  12 ++
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 353 
 drivers/net/iavf/iavf_rxtx_vec_common.h |  17 +-
 4 files changed, 324 insertions(+), 163 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 099ede7..ca01ed9 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2392,22 +2392,23 @@
 #ifdef RTE_ARCH_X86
struct iavf_rx_queue *rxq;
int i;
+   int check_ret;
+   bool use_sse = false;
bool use_avx2 = false;
-#ifdef CC_AVX512_SUPPORT
bool use_avx512 = false;
-#endif
+   bool use_flex = false;
 
-   if (!iavf_rx_vec_dev_check(dev) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-   for (i = 0; i < dev->data->nb_rx_queues; i++) {
-   rxq = dev->data->rx_queues[i];
-   (void)iavf_rxq_vec_setup(rxq);
+   check_ret = iavf_rx_vec_dev_check(dev);
+   if (check_ret >= 0 &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+   if (check_ret == IAVF_VECTOR_PATH) {
+   use_sse = true;
+   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+   use_avx2 = true;
}
 
-   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-   use_avx2 = true;
 #ifdef CC_AVX512_SUPPORT
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2415,13 +2416,38 @@
use_avx512 = true;
 #endif
 
+   if (!use_sse && !use_avx2 && !use_avx512)
+   goto normal;
+
+   if (vf->vf_res->vf_cap_flags &
+   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+   use_flex = true;
+   if (use_avx512 && check_ret == IAVF_VECTOR_OFFLOAD_PATH)
+   use_flex = false;
+   }
+
+   for (i = 0; i < dev->data->nb_rx_queues; i++) {
+   rxq = dev->data->rx_queues[i];
+   (void)iavf_rxq_vec_setup(rxq);
+   }
+
if (dev->data->scattered_rx) {
-   PMD_DRV_LOG(DEBUG,
-   "Using %sVector Scattered Rx (port %d).",
-   use_avx2 ? "avx2 " : "",
-   dev->data->port_id);
-   if (vf->vf_res->vf_cap_flags &
-   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+   if (!use_avx512) {
+   PMD_DRV_LOG(DEBUG,
+   "Using %sVector Scattered Rx (port 
%d).",
+   use_avx2 ? "avx2 " : "",
+   dev->data->port_id);
+   } else {
+   if (check_ret == IAVF_VECTOR_PATH)
+   PMD_DRV_LOG(DEBUG,
+   "Using AVX512 Vector 
Scattered Rx (port %d).",
+   dev->data->port_id);
+   else
+   PMD_DRV_LOG(DEBUG,
+   "Using AVX512 OFFLOAD 
Vector Scattered Rx (port %d).",
+   dev->data->port_id);
+   }
+   if (use_flex) {
dev->rx_pkt_burst = use_avx2 ?

iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
iavf_recv_scattered_pkts_vec_flex_rxd;
@@ -2435,17 +2461,32 @@
 

[dpdk-dev] [PATCH v5 2/4] net/iavf: add offload path for Tx AVX512

2021-04-14 Thread Wenzhuo Lu
Add a specific path for Tx AVX512 which supports the HW offload
features: checksum insertion and VLAN insertion.
This path is chosen automatically according to the
configuration.

'inline' is used, so the compiler generates a specialized
copy of the shared code for each path.

Signed-off-by: Wenzhuo Lu 
---
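Note: the dev-check now classifies queues instead of merely passing or
failing: IAVF_VECTOR_PATH when no offloads are set,
IAVF_VECTOR_OFFLOAD_PATH when only vector-supported offloads are set,
and a negative value otherwise. A condensed sketch of that per-queue
classification (IAVF_TX_VECTOR_OFFLOAD is an assumed mask name for the
vector-supported offload bits):

/* Condensed sketch of the per-queue Tx path classification. */
static inline int
tx_vec_queue_check_sketch(struct iavf_tx_queue *txq)
{
	if (txq->offloads & IAVF_TX_NO_VECTOR_FLAGS)
		return -1;                       /* scalar path only */
	if (txq->offloads & IAVF_TX_VECTOR_OFFLOAD) /* assumed mask name */
		return IAVF_VECTOR_OFFLOAD_PATH; /* AVX512 offload path */
	return IAVF_VECTOR_PATH;                 /* plain vector path */
}
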
 drivers/net/iavf/iavf_rxtx.c|  57 +++--
 drivers/net/iavf/iavf_rxtx.h|  14 +++-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 110 +++-
 drivers/net/iavf/iavf_rxtx_vec_common.h |  98 ++--
 4 files changed, 210 insertions(+), 69 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index bd0b7ee..099ede7 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -160,7 +160,7 @@
 static inline bool
 check_tx_vec_allow(struct iavf_tx_queue *txq)
 {
-   if (!(txq->offloads & IAVF_NO_VECTOR_FLAGS) &&
+   if (!(txq->offloads & IAVF_TX_NO_VECTOR_FLAGS) &&
txq->rs_thresh >= IAVF_VPMD_TX_MAX_BURST &&
txq->rs_thresh <= IAVF_VPMD_TX_MAX_FREE_BUF) {
PMD_INIT_LOG(DEBUG, "Vector tx can be enabled on this txq.");
@@ -2498,17 +2498,23 @@
 #ifdef RTE_ARCH_X86
struct iavf_tx_queue *txq;
int i;
+   int check_ret;
+   bool use_sse = false;
bool use_avx2 = false;
-#ifdef CC_AVX512_SUPPORT
bool use_avx512 = false;
-#endif
 
-   if (!iavf_tx_vec_dev_check(dev) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-   use_avx2 = true;
+   check_ret = iavf_tx_vec_dev_check(dev);
+
+   if (check_ret >= 0 &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+   /* SSE and AVX2 not support offload path yet. */
+   if (check_ret == IAVF_VECTOR_PATH) {
+   use_sse = true;
+   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+   use_avx2 = true;
+   }
 #ifdef CC_AVX512_SUPPORT
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2516,15 +2522,29 @@
use_avx512 = true;
 #endif
 
-   PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
-   use_avx2 ? "avx2 " : "",
-   dev->data->port_id);
-   dev->tx_pkt_burst = use_avx2 ?
-   iavf_xmit_pkts_vec_avx2 :
-   iavf_xmit_pkts_vec;
+   if (!use_sse && !use_avx2 && !use_avx512)
+   goto normal;
+
+   if (!use_avx512) {
+   PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
+   use_avx2 ? "avx2 " : "",
+   dev->data->port_id);
+   dev->tx_pkt_burst = use_avx2 ?
+   iavf_xmit_pkts_vec_avx2 :
+   iavf_xmit_pkts_vec;
+   }
 #ifdef CC_AVX512_SUPPORT
-   if (use_avx512)
-   dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+   if (use_avx512) {
+   if (check_ret == IAVF_VECTOR_PATH) {
+   dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+   PMD_DRV_LOG(DEBUG, "Using AVX512 Vector Tx 
(port %d).",
+   dev->data->port_id);
+   } else {
+   dev->tx_pkt_burst = 
iavf_xmit_pkts_vec_avx512_offload;
+   PMD_DRV_LOG(DEBUG, "Using AVX512 OFFLOAD Vector 
Tx (port %d).",
+   dev->data->port_id);
+   }
+   }
 #endif
dev->tx_pkt_prepare = NULL;
 
@@ -2544,8 +2564,9 @@
 
return;
}
-#endif
 
+normal:
+#endif
PMD_DRV_LOG(DEBUG, "Using Basic Tx callback (port=%d).",
dev->data->port_id);
dev->tx_pkt_burst = iavf_xmit_pkts;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/dri

[dpdk-dev] [PATCH v5 1/4] net/iavf: store offload flag of Rx queue

2021-04-14 Thread Wenzhuo Lu
Store the offload flags in the Rx queue structure so that the
driver knows which offload features are enabled per queue.

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx.c | 4 
 drivers/net/iavf/iavf_rxtx.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 541b444..bd0b7ee 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -523,9 +523,12 @@
uint8_t proto_xtr;
uint16_t len;
uint16_t rx_free_thresh;
+   uint64_t offloads;
 
PMD_INIT_FUNC_TRACE();
 
+   offloads = rx_conf->offloads | dev->data->dev_conf.rxmode.offloads;
+
if (nb_desc % IAVF_ALIGN_RING_DESC != 0 ||
nb_desc > IAVF_MAX_RING_DESC ||
nb_desc < IAVF_MIN_RING_DESC) {
@@ -596,6 +599,7 @@
rxq->rx_deferred_start = rx_conf->rx_deferred_start;
rxq->rx_hdr_len = 0;
rxq->vsi = vsi;
+   rxq->offloads = offloads;
 
if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)
rxq->crc_len = RTE_ETHER_CRC_LEN;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 4fbd847..f56dd74 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -198,6 +198,7 @@ struct iavf_rx_queue {
/* flexible descriptor metadata extraction offload flag */
iavf_rxd_to_pkt_fields_t rxd_to_pkt_fields;
/* handle flexible descriptor by RXDID */
+   uint64_t offloads;
 };
 
 struct iavf_tx_entry {
-- 
1.9.3



[dpdk-dev] [PATCH v5 0/4] add Rx/Tx offload paths for IAVF AVX512

2021-04-14 Thread Wenzhuo Lu
Add specific Rx/Tx AVX512 paths, called offload paths.
These paths support the HW offload features (checksum, VLAN and RSS
offload) and are chosen automatically according to the configuration.

v2:
 - Fixed compile error.

v3:
 - Used 'inline' to drop the duplicate code.
 - some minor change.

v4:
 - Rebased on next-net-intel.

v5:
 - Minor change of release note.

Wenzhuo Lu (4):
  net/iavf: store offload flag of Rx queue
  net/iavf: add offload path for Tx AVX512
  net/iavf: add offload path for Rx AVX512
  net/iavf: add offload path for Rx AVX512 flex desc

 doc/guides/rel_notes/release_21_05.rst  |   5 +
 drivers/net/iavf/iavf_rxtx.c| 187 +--
 drivers/net/iavf/iavf_rxtx.h|  33 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 899 +++-
 drivers/net/iavf/iavf_rxtx_vec_common.h | 115 +++-
 5 files changed, 817 insertions(+), 422 deletions(-)

-- 
1.9.3



[dpdk-dev] [PATCH v5 3/3] net/i40e: fix segment fault in AVX512

2021-04-14 Thread Wenzhuo Lu
Fix the segmentation fault that happens when getting memory from
the pool fails. If there's no memory in the default cache, fall
back to the previous process.

The previous AVX2 rearm function is extended with some AVX512
instructions and becomes a common callee of both the AVX2 and
AVX512 rearm functions.

Fixes: e6a6a138919f ("net/i40e: add AVX512 vector path")
Cc: sta...@dpdk.org

Reported-by: David Coyle 
Signed-off-by: Wenzhuo Lu 
---
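Note: the crash came from rearming straight out of the per-lcore
mempool cache without checking it. The guard pattern this series
introduces, sketched (i40e_rxq_rearm_common() is the generic fallback
added by the patch; the refill branch is condensed):

/* Sketch of the guarded cache-direct rearm. */
struct rte_mempool_cache *cache =
	rte_mempool_default_cache(rxq->mp, rte_lcore_id());

/* No per-lcore cache (e.g. a non-EAL thread): use the generic
 * rte_mempool_get_bulk() based rearm instead. */
if (unlikely(cache == NULL))
	return i40e_rxq_rearm_common(rxq, true);

/* Cache under-filled: top it up from the backend ring first, and only
 * give up (flagging rx_mbuf_alloc_failed) if that fails too. */
if (cache->len < RTE_I40E_RXQ_REARM_THRESH) {
	uint32_t req = RTE_I40E_RXQ_REARM_THRESH +
		       (cache->size - cache->len);

	if (rte_mempool_ops_dequeue_bulk(rxq->mp,
			&cache->objs[cache->len], req) == 0)
		cache->len += req;
	else
		return; /* real code also marks the alloc failure */
}
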
 drivers/net/i40e/i40e_rxtx_vec_avx2.c   | 117 +--
 drivers/net/i40e/i40e_rxtx_vec_avx512.c |   5 +-
 drivers/net/i40e/i40e_rxtx_vec_common.h | 201 
 3 files changed, 207 insertions(+), 116 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c 
b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 15abd9d..3b9eef9 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -18,123 +18,10 @@
 #pragma GCC diagnostic ignored "-Wcast-qual"
 #endif
 
-static inline void
+static __rte_always_inline void
 i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 {
-   int i;
-   uint16_t rx_id;
-   volatile union i40e_rx_desc *rxdp;
-   struct i40e_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
-
-   rxdp = rxq->rx_ring + rxq->rxrearm_start;
-
-   /* Pull 'n' more MBUFs into the software ring */
-   if (rte_mempool_get_bulk(rxq->mp,
-(void *)rxep,
-RTE_I40E_RXQ_REARM_THRESH) < 0) {
-   if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
-   rxq->nb_rx_desc) {
-   __m128i dma_addr0;
-   dma_addr0 = _mm_setzero_si128();
-   for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
-   rxep[i].mbuf = &rxq->fake_mbuf;
-   _mm_store_si128((__m128i *)&rxdp[i].read,
-   dma_addr0);
-   }
-   }
-   rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
-   RTE_I40E_RXQ_REARM_THRESH;
-   return;
-   }
-
-#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
-   struct rte_mbuf *mb0, *mb1;
-   __m128i dma_addr0, dma_addr1;
-   __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
-   RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 2 mbufs in one loop */
-   for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
-   __m128i vaddr0, vaddr1;
-
-   mb0 = rxep[0].mbuf;
-   mb1 = rxep[1].mbuf;
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-
-   /* convert pa to dma_addr hdr/data */
-   dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
-   dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
-
-   /* add headroom to pa values */
-   dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
-   dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
-
-   /* flush desc with pa dma_addr */
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
-   }
-#else
-   struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
-   __m256i dma_addr0_1, dma_addr2_3;
-   __m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 4 mbufs in one loop */
-   for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH;
-   i += 4, rxep += 4, rxdp += 4) {
-   __m128i vaddr0, vaddr1, vaddr2, vaddr3;
-   __m256i vaddr0_1, vaddr2_3;
-
-   mb0 = rxep[0].mbuf;
-   mb1 = rxep[1].mbuf;
-   mb2 = rxep[2].mbuf;
-   mb3 = rxep[3].mbuf;
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-   vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
-   vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
-
-   /*
-* merge 0 & 1, by casting 0 to 256-bit and inserting 1
-* into the high lanes. Similarly for 2 & 3
-   

[dpdk-dev] [PATCH v5 2/3] net/ice: fix segment fault in AVX512

2021-04-14 Thread Wenzhuo Lu
Fix the segmentation fault that happens when getting memory from
the pool fails. If there's no memory in the default cache, fall
back to the previous process.

The previous AVX2 rearm function is extended with some AVX512
instructions and becomes a common callee of both the AVX2 and
AVX512 rearm functions.

Fixes: 7f85d5ebcfe1 ("net/ice: add AVX512 vector path")
Cc: sta...@dpdk.org

Reported-by: David Coyle 
Signed-off-by: Wenzhuo Lu 
---
 drivers/net/ice/ice_rxtx_vec_avx2.c   | 120 +---
 drivers/net/ice/ice_rxtx_vec_avx512.c |   5 +-
 drivers/net/ice/ice_rxtx_vec_common.h | 203 ++
 drivers/net/ice/meson.build   |   2 +
 4 files changed, 211 insertions(+), 119 deletions(-)

diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c 
b/drivers/net/ice/ice_rxtx_vec_avx2.c
index 25efd30..83dcdf1 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx2.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
@@ -10,126 +10,10 @@
 #pragma GCC diagnostic ignored "-Wcast-qual"
 #endif
 
-static inline void
+static __rte_always_inline void
 ice_rxq_rearm(struct ice_rx_queue *rxq)
 {
-   int i;
-   uint16_t rx_id;
-   volatile union ice_rx_flex_desc *rxdp;
-   struct ice_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
-
-   rxdp = rxq->rx_ring + rxq->rxrearm_start;
-
-   /* Pull 'n' more MBUFs into the software ring */
-   if (rte_mempool_get_bulk(rxq->mp,
-(void *)rxep,
-ICE_RXQ_REARM_THRESH) < 0) {
-   if (rxq->rxrearm_nb + ICE_RXQ_REARM_THRESH >=
-   rxq->nb_rx_desc) {
-   __m128i dma_addr0;
-
-   dma_addr0 = _mm_setzero_si128();
-   for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
-   rxep[i].mbuf = &rxq->fake_mbuf;
-   _mm_store_si128((__m128i *)&rxdp[i].read,
-   dma_addr0);
-   }
-   }
-   rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
-   ICE_RXQ_REARM_THRESH;
-   return;
-   }
-
-#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
-   struct rte_mbuf *mb0, *mb1;
-   __m128i dma_addr0, dma_addr1;
-   __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
-   RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 2 mbufs in one loop */
-   for (i = 0; i < ICE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
-   __m128i vaddr0, vaddr1;
-
-   mb0 = rxep[0].mbuf;
-   mb1 = rxep[1].mbuf;
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-
-   /* convert pa to dma_addr hdr/data */
-   dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
-   dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
-
-   /* add headroom to pa values */
-   dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
-   dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
-
-   /* flush desc with pa dma_addr */
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
-   }
-#else
-   struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
-   __m256i dma_addr0_1, dma_addr2_3;
-   __m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 4 mbufs in one loop */
-   for (i = 0; i < ICE_RXQ_REARM_THRESH;
-   i += 4, rxep += 4, rxdp += 4) {
-   __m128i vaddr0, vaddr1, vaddr2, vaddr3;
-   __m256i vaddr0_1, vaddr2_3;
-
-   mb0 = rxep[0].mbuf;
-   mb1 = rxep[1].mbuf;
-   mb2 = rxep[2].mbuf;
-   mb3 = rxep[3].mbuf;
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-   vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
-   vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
-
-   /**
-* merge 0 & 1, by casting 0 to 256-bit and inserting 1
-* into the high lanes. Similarly for 2 & 3
-*/
-   

[dpdk-dev] [PATCH v5 1/3] net/iavf: fix segment fault in AVX512

2021-04-14 Thread Wenzhuo Lu
Fix the segmentation fault that happens when getting memory from
the pool fails. If there's no memory in the default cache, fall
back to the previous process.

The previous AVX2 rearm function is extended with some AVX512
instructions and becomes a common callee of both the AVX2 and
AVX512 rearm functions.

Fixes: 31737f2b66fb ("net/iavf: enable AVX512 for legacy Rx")
Cc: sta...@dpdk.org

Reported-by: David Coyle 
Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx_vec_avx2.c   | 120 +--
 drivers/net/iavf/iavf_rxtx_vec_avx512.c |   5 +-
 drivers/net/iavf/iavf_rxtx_vec_common.h | 203 
 3 files changed, 209 insertions(+), 119 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx2.c 
b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
index cdb5139..f5646d6 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx2.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
@@ -10,126 +10,10 @@
 #pragma GCC diagnostic ignored "-Wcast-qual"
 #endif
 
-static inline void
+static __rte_always_inline void
 iavf_rxq_rearm(struct iavf_rx_queue *rxq)
 {
-   int i;
-   uint16_t rx_id;
-   volatile union iavf_rx_desc *rxdp;
-   struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start];
-
-   rxdp = rxq->rx_ring + rxq->rxrearm_start;
-
-   /* Pull 'n' more MBUFs into the software ring */
-   if (rte_mempool_get_bulk(rxq->mp,
-(void *)rxp,
-IAVF_RXQ_REARM_THRESH) < 0) {
-   if (rxq->rxrearm_nb + IAVF_RXQ_REARM_THRESH >=
-   rxq->nb_rx_desc) {
-   __m128i dma_addr0;
-
-   dma_addr0 = _mm_setzero_si128();
-   for (i = 0; i < IAVF_VPMD_DESCS_PER_LOOP; i++) {
-   rxp[i] = &rxq->fake_mbuf;
-   _mm_store_si128((__m128i *)&rxdp[i].read,
-   dma_addr0);
-   }
-   }
-   rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
-   IAVF_RXQ_REARM_THRESH;
-   return;
-   }
-
-#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
-   struct rte_mbuf *mb0, *mb1;
-   __m128i dma_addr0, dma_addr1;
-   __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
-   RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 2 mbufs in one loop */
-   for (i = 0; i < IAVF_RXQ_REARM_THRESH; i += 2, rxp += 2) {
-   __m128i vaddr0, vaddr1;
-
-   mb0 = rxp[0];
-   mb1 = rxp[1];
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-
-   /* convert pa to dma_addr hdr/data */
-   dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
-   dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
-
-   /* add headroom to pa values */
-   dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
-   dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
-
-   /* flush desc with pa dma_addr */
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
-   }
-#else
-   struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
-   __m256i dma_addr0_1, dma_addr2_3;
-   __m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 4 mbufs in one loop */
-   for (i = 0; i < IAVF_RXQ_REARM_THRESH;
-   i += 4, rxp += 4, rxdp += 4) {
-   __m128i vaddr0, vaddr1, vaddr2, vaddr3;
-   __m256i vaddr0_1, vaddr2_3;
-
-   mb0 = rxp[0];
-   mb1 = rxp[1];
-   mb2 = rxp[2];
-   mb3 = rxp[3];
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-   vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
-   vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
-
-   /**
-* merge 0 & 1, by casting 0 to 256-bit and inserting 1
-* into the high lanes. Similarly for 2 & 3
-*/
-   vaddr0_1 =
-   

[dpdk-dev] [PATCH v5 0/3] fix segment fault in avx512 code

2021-04-14 Thread Wenzhuo Lu
Fix the out-of-memory segmentation fault in the iavf, ice and i40e drivers.

v2:
 - Drop the duplicate code.

v3:
 - Fix compile error on no-x86 platform.

v4:
 - Minor performance optimization.

v5:
 - Fix a potential compile issue.
 - Some minor change.

Wenzhuo Lu (3):
  net/iavf: fix segment fault in AVX512
  net/ice: fix segment fault in AVX512
  net/i40e: fix segment fault in AVX512

 drivers/net/i40e/i40e_rxtx_vec_avx2.c   | 117 +-
 drivers/net/i40e/i40e_rxtx_vec_avx512.c |   5 +-
 drivers/net/i40e/i40e_rxtx_vec_common.h | 201 +++
 drivers/net/iavf/iavf_rxtx_vec_avx2.c   | 120 +--
 drivers/net/iavf/iavf_rxtx_vec_avx512.c |   5 +-
 drivers/net/iavf/iavf_rxtx_vec_common.h | 203 
 drivers/net/ice/ice_rxtx_vec_avx2.c | 120 +--
 drivers/net/ice/ice_rxtx_vec_avx512.c   |   5 +-
 drivers/net/ice/ice_rxtx_vec_common.h   | 203 
 drivers/net/ice/meson.build |   2 +
 10 files changed, 627 insertions(+), 354 deletions(-)

-- 
1.9.3



[dpdk-dev] [PATCH v4 4/4] net/iavf: add offload path for Rx AVX512 flex desc

2021-04-08 Thread Wenzhuo Lu
Add a specific path for Rx AVX512 (flexible descriptor) which
supports the HW offload features: checksum, VLAN stripping and
RSS hash.
This path is chosen automatically according to the
configuration.

'inline' is used, so the compiler generates a specialized
copy of the shared code for each path.

Signed-off-by: Wenzhuo Lu 
---
 doc/guides/rel_notes/release_21_05.rst  |   7 +
 drivers/net/iavf/iavf_rxtx.c|  27 +-
 drivers/net/iavf/iavf_rxtx.h|   6 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 436 ++--
 4 files changed, 283 insertions(+), 193 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_05.rst 
b/doc/guides/rel_notes/release_21_05.rst
index 95713f2..253917a 100644
--- a/doc/guides/rel_notes/release_21_05.rst
+++ b/doc/guides/rel_notes/release_21_05.rst
@@ -137,6 +137,13 @@ New Features
   * Added command to display Rx queue used descriptor count.
 ``show port (port_id) rxq (queue_id) desc used count``
 
+* **Added the offload paths for IAVF AVX512.**
+
+  * Added the new RX and TX paths to use the HW offload features. When the HW
+offload features are configured to be used, the offload paths are chosen
+automatically.
+  * The code of HW offload features is removed from the legacy paths.
+
 
 Removed Items
 -
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index ca01ed9..3f3cf63 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2420,11 +2420,8 @@
goto normal;
 
if (vf->vf_res->vf_cap_flags &
-   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
use_flex = true;
-   if (use_avx512 && check_ret == IAVF_VECTOR_OFFLOAD_PATH)
-   use_flex = false;
-   }
 
for (i = 0; i < dev->data->nb_rx_queues; i++) {
rxq = dev->data->rx_queues[i];
@@ -2452,9 +2449,14 @@
iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
iavf_recv_scattered_pkts_vec_flex_rxd;
 #ifdef CC_AVX512_SUPPORT
-   if (use_avx512)
-   dev->rx_pkt_burst =
-   iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+   if (use_avx512) {
+   if (check_ret == IAVF_VECTOR_PATH)
+   dev->rx_pkt_burst =
+   iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+   else
+   dev->rx_pkt_burst =
+   iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload;
+   }
 #endif
} else {
dev->rx_pkt_burst = use_avx2 ?
@@ -2491,9 +2493,14 @@
iavf_recv_pkts_vec_avx2_flex_rxd :
iavf_recv_pkts_vec_flex_rxd;
 #ifdef CC_AVX512_SUPPORT
-   if (use_avx512)
-   dev->rx_pkt_burst =
-   iavf_recv_pkts_vec_avx512_flex_rxd;
+   if (use_avx512) {
+   if (check_ret == IAVF_VECTOR_PATH)
+   dev->rx_pkt_burst =
+   iavf_recv_pkts_vec_avx512_flex_rxd;
+   else
+   dev->rx_pkt_burst =
+   iavf_recv_pkts_vec_avx512_flex_rxd_offload;
+   }
 #endif
} else {
dev->rx_pkt_burst = use_avx2 ?
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index a8e5664..19b6028 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -496,6 +496,9 @@ uint16_t iavf_recv_pkts_vec_avx512_offload(void *rx_queue,
 uint16_t iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue,
struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+   struct rte_mbuf **rx_pkts,
+   uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 struct rte_mbuf **rx_pkts,
 
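As background for the 'inline' remark above: the pattern is one shared burst
body whose offload work is guarded by a compile-time constant flag, plus thin
wrappers that the compiler specializes. A minimal standalone sketch;
rx_burst_common and the wrapper names are illustrative, not the actual iavf
symbols:

#include <stdbool.h>
#include <stdint.h>

/* One shared body; 'do_offload' is constant at every call site, so the
 * compiler drops the dead branch in each generated copy. */
static inline __attribute__((always_inline)) uint16_t
rx_burst_common(void *rxq, void **pkts, uint16_t nb_pkts, bool do_offload)
{
    uint16_t nb_rx = 0;

    (void)rxq;
    (void)pkts;
    (void)nb_pkts;
    /* ... descriptor parsing shared by both paths ... */
    if (do_offload) {
        /* checksum / VLAN-strip / RSS-hash extraction exists only in
         * the copy where do_offload is constant true */
    }
    return nb_rx;
}

/* Two thin wrappers; the compiler emits two specialized functions, so the
 * duplicated code is generated rather than maintained by hand. */
uint16_t
rx_burst_plain(void *rxq, void **pkts, uint16_t nb_pkts)
{
    return rx_burst_common(rxq, pkts, nb_pkts, false);
}

uint16_t
rx_burst_offload(void *rxq, void **pkts, uint16_t nb_pkts)
{
    return rx_burst_common(rxq, pkts, nb_pkts, true);
}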

[dpdk-dev] [PATCH v4 3/4] net/iavf: add offload path for Rx AVX512

2021-04-08 Thread Wenzhuo Lu
Add a specific Rx AVX512 path for the traditional descriptor format.
This path supports the HW offload features checksum, VLAN stripping
and RSS hash, and is chosen automatically according to the
configuration.

'inline' is used so that the compiler generates the duplicated
path bodies instead of them being maintained by hand.

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx.c| 105 +++---
 drivers/net/iavf/iavf_rxtx.h|  12 ++
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 353 
 drivers/net/iavf/iavf_rxtx_vec_common.h |  17 +-
 4 files changed, 324 insertions(+), 163 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 099ede7..ca01ed9 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2392,22 +2392,23 @@
 #ifdef RTE_ARCH_X86
struct iavf_rx_queue *rxq;
int i;
+   int check_ret;
+   bool use_sse = false;
bool use_avx2 = false;
-#ifdef CC_AVX512_SUPPORT
bool use_avx512 = false;
-#endif
+   bool use_flex = false;
 
-   if (!iavf_rx_vec_dev_check(dev) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-   for (i = 0; i < dev->data->nb_rx_queues; i++) {
-   rxq = dev->data->rx_queues[i];
-   (void)iavf_rxq_vec_setup(rxq);
+   check_ret = iavf_rx_vec_dev_check(dev);
+   if (check_ret >= 0 &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+   if (check_ret == IAVF_VECTOR_PATH) {
+   use_sse = true;
+   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+   use_avx2 = true;
}
 
-   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-   use_avx2 = true;
 #ifdef CC_AVX512_SUPPORT
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2415,13 +2416,38 @@
use_avx512 = true;
 #endif
 
+   if (!use_sse && !use_avx2 && !use_avx512)
+   goto normal;
+
+   if (vf->vf_res->vf_cap_flags &
+   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+   use_flex = true;
+   if (use_avx512 && check_ret == IAVF_VECTOR_OFFLOAD_PATH)
+   use_flex = false;
+   }
+
+   for (i = 0; i < dev->data->nb_rx_queues; i++) {
+   rxq = dev->data->rx_queues[i];
+   (void)iavf_rxq_vec_setup(rxq);
+   }
+
if (dev->data->scattered_rx) {
-   PMD_DRV_LOG(DEBUG,
-   "Using %sVector Scattered Rx (port %d).",
-   use_avx2 ? "avx2 " : "",
-   dev->data->port_id);
-   if (vf->vf_res->vf_cap_flags &
-   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+   if (!use_avx512) {
+   PMD_DRV_LOG(DEBUG,
+   "Using %sVector Scattered Rx (port %d).",
+   use_avx2 ? "avx2 " : "",
+   dev->data->port_id);
+   } else {
+   if (check_ret == IAVF_VECTOR_PATH)
+   PMD_DRV_LOG(DEBUG,
+   "Using AVX512 Vector Scattered Rx (port %d).",
+   dev->data->port_id);
+   else
+   PMD_DRV_LOG(DEBUG,
+   "Using AVX512 OFFLOAD Vector Scattered Rx (port %d).",
+   dev->data->port_id);
+   }
+   if (use_flex) {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
iavf_recv_scattered_pkts_vec_flex_rxd;
@@ -2435,17 +2461,32 @@
 
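The selection logic in the hunks above reduces to a small decision table: the
device check classifies the configuration (plain vector path vs. offload
path), and the CPU flags plus rte_vect_get_max_simd_bitwidth() gate the ISA.
A hedged sketch of that logic; the enum values and the helper are stand-ins,
and the real code additionally requires the AVX512BW flag for the AVX512
paths:

#include <stdbool.h>

enum rx_path {
    PATH_SCALAR,          /* fall back to the basic burst function */
    PATH_SSE,
    PATH_AVX2,
    PATH_AVX512,
    PATH_AVX512_OFFLOAD
};

static enum rx_path
pick_rx_path(int check_ret, bool cpu_avx2, bool cpu_avx512,
             unsigned int simd_max)
{
    if (check_ret < 0 || simd_max < 128)
        return PATH_SCALAR;              /* vector Rx not allowed */
    if (cpu_avx512 && simd_max >= 512)   /* AVX512 serves both kinds */
        return check_ret == 0 /* plain vector path */ ?
               PATH_AVX512 : PATH_AVX512_OFFLOAD;
    if (check_ret != 0)
        return PATH_SCALAR;              /* SSE/AVX2 lack an offload path */
    return (cpu_avx2 && simd_max >= 256) ? PATH_AVX2 : PATH_SSE;
}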

[dpdk-dev] [PATCH v4 2/4] net/iavf: add offload path for Tx AVX512

2021-04-08 Thread Wenzhuo Lu
Add a specific Tx AVX512 path.
This path supports the HW offload features checksum insertion and
VLAN insertion, and is chosen automatically according to the
configuration.

'inline' is used so that the compiler generates the duplicated
path bodies instead of them being maintained by hand.

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx.c|  57 +++--
 drivers/net/iavf/iavf_rxtx.h|  14 +++-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 110 +++-
 drivers/net/iavf/iavf_rxtx_vec_common.h |  98 ++--
 4 files changed, 210 insertions(+), 69 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index bd0b7ee..099ede7 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -160,7 +160,7 @@
 static inline bool
 check_tx_vec_allow(struct iavf_tx_queue *txq)
 {
-   if (!(txq->offloads & IAVF_NO_VECTOR_FLAGS) &&
+   if (!(txq->offloads & IAVF_TX_NO_VECTOR_FLAGS) &&
txq->rs_thresh >= IAVF_VPMD_TX_MAX_BURST &&
txq->rs_thresh <= IAVF_VPMD_TX_MAX_FREE_BUF) {
PMD_INIT_LOG(DEBUG, "Vector tx can be enabled on this txq.");
@@ -2498,17 +2498,23 @@
 #ifdef RTE_ARCH_X86
struct iavf_tx_queue *txq;
int i;
+   int check_ret;
+   bool use_sse = false;
bool use_avx2 = false;
-#ifdef CC_AVX512_SUPPORT
bool use_avx512 = false;
-#endif
 
-   if (!iavf_tx_vec_dev_check(dev) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-   use_avx2 = true;
+   check_ret = iavf_tx_vec_dev_check(dev);
+
+   if (check_ret >= 0 &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+   /* SSE and AVX2 not support offload path yet. */
+   if (check_ret == IAVF_VECTOR_PATH) {
+   use_sse = true;
+   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+   use_avx2 = true;
+   }
 #ifdef CC_AVX512_SUPPORT
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2516,15 +2522,29 @@
use_avx512 = true;
 #endif
 
-   PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
-   use_avx2 ? "avx2 " : "",
-   dev->data->port_id);
-   dev->tx_pkt_burst = use_avx2 ?
-   iavf_xmit_pkts_vec_avx2 :
-   iavf_xmit_pkts_vec;
+   if (!use_sse && !use_avx2 && !use_avx512)
+   goto normal;
+
+   if (!use_avx512) {
+   PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
+   use_avx2 ? "avx2 " : "",
+   dev->data->port_id);
+   dev->tx_pkt_burst = use_avx2 ?
+   iavf_xmit_pkts_vec_avx2 :
+   iavf_xmit_pkts_vec;
+   }
 #ifdef CC_AVX512_SUPPORT
-   if (use_avx512)
-   dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+   if (use_avx512) {
+   if (check_ret == IAVF_VECTOR_PATH) {
+   dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+   PMD_DRV_LOG(DEBUG, "Using AVX512 Vector Tx 
(port %d).",
+   dev->data->port_id);
+   } else {
+   dev->tx_pkt_burst = 
iavf_xmit_pkts_vec_avx512_offload;
+   PMD_DRV_LOG(DEBUG, "Using AVX512 OFFLOAD Vector 
Tx (port %d).",
+   dev->data->port_id);
+   }
+   }
 #endif
dev->tx_pkt_prepare = NULL;
 
@@ -2544,8 +2564,9 @@
 
return;
}
-#endif
 
+normal:
+#endif
PMD_DRV_LOG(DEBUG, "Using Basic Tx callback (port=%d).",
dev->data->port_id);
dev->tx_pkt_burst = iavf_xmit_pkts;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/dri

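The IAVF_TX_NO_VECTOR_FLAGS check touched in the first hunk is the admission
test for vector Tx: no disqualifying offload may be enabled, and the RS
threshold must fit the vector path's bookkeeping. A sketch with a placeholder
mask and thresholds (the real values live in iavf_rxtx.h):

#include <stdbool.h>
#include <stdint.h>

#define TX_NO_VECTOR_FLAGS  (UINT64_C(1) << 3)   /* illustrative mask */
#define TX_MAX_BURST        32
#define TX_MAX_FREE_BUF     64

struct txq {
    uint64_t offloads;
    uint16_t rs_thresh;
};

static bool
tx_vec_allowed(const struct txq *q)
{
    /* Vector Tx only when no unsupported offload is requested and the
     * RS threshold fits the burst/free accounting of the vector path. */
    return !(q->offloads & TX_NO_VECTOR_FLAGS) &&
           q->rs_thresh >= TX_MAX_BURST &&
           q->rs_thresh <= TX_MAX_FREE_BUF;
}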
[dpdk-dev] [PATCH v4 1/4] net/iavf: store offload flag of Rx queue

2021-04-08 Thread Wenzhuo Lu
Store the offload flags in the Rx queue structure so that the
driver knows which offload features are enabled on each queue.

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx.c | 4 
 drivers/net/iavf/iavf_rxtx.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 541b444..bd0b7ee 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -523,9 +523,12 @@
uint8_t proto_xtr;
uint16_t len;
uint16_t rx_free_thresh;
+   uint64_t offloads;
 
PMD_INIT_FUNC_TRACE();
 
+   offloads = rx_conf->offloads | dev->data->dev_conf.rxmode.offloads;
+
if (nb_desc % IAVF_ALIGN_RING_DESC != 0 ||
nb_desc > IAVF_MAX_RING_DESC ||
nb_desc < IAVF_MIN_RING_DESC) {
@@ -596,6 +599,7 @@
rxq->rx_deferred_start = rx_conf->rx_deferred_start;
rxq->rx_hdr_len = 0;
rxq->vsi = vsi;
+   rxq->offloads = offloads;
 
if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)
rxq->crc_len = RTE_ETHER_CRC_LEN;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 4fbd847..f56dd74 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -198,6 +198,7 @@ struct iavf_rx_queue {
/* flexible descriptor metadata extraction offload flag */
iavf_rxd_to_pkt_fields_t rxd_to_pkt_fields;
/* handle flexible descriptor by RXDID */
+   uint64_t offloads;
 };
 
 struct iavf_tx_entry {
-- 
1.9.3


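The substance of the patch is in the two '+' lines above: per-queue and
per-port offload requests are merged once at queue setup, so later path
selection can test a single mask. A minimal sketch of that merge, with
simplified types:

#include <stdint.h>

struct rx_queue {
    uint64_t offloads;
    /* ... */
};

static void
rxq_store_offloads(struct rx_queue *rxq,
                   uint64_t queue_offloads, uint64_t port_offloads)
{
    /* A feature is active on this queue if enabled at either level. */
    rxq->offloads = queue_offloads | port_offloads;
}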

[dpdk-dev] [PATCH v4 0/4] add Rx/Tx offload paths for IAVF AVX512

2021-04-08 Thread Wenzhuo Lu
Add specific Rx/Tx AVX512 paths, called offload paths.
These paths support the HW offload features (checksum, VLAN and RSS
offload) and are chosen automatically according to the configuration.

v2:
 - Fixed compile error.

v3:
 - Used 'inline' to drop the duplicate code.
 - Some minor changes.

v4:
 - Rebased on next-net-intel.

Wenzhuo Lu (4):
  net/iavf: store offload flag of Rx queue
  net/iavf: add offload path for Tx AVX512
  net/iavf: add offload path for Rx AVX512
  net/iavf: add offload path for Rx AVX512 flex desc

 doc/guides/rel_notes/release_21_05.rst  |   7 +
 drivers/net/iavf/iavf_rxtx.c| 187 +--
 drivers/net/iavf/iavf_rxtx.h|  33 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 899 +++-
 drivers/net/iavf/iavf_rxtx_vec_common.h | 115 +++-
 5 files changed, 819 insertions(+), 422 deletions(-)

-- 
1.9.3



[dpdk-dev] [PATCH v4 3/3] net/i40e: fix segment fault in AVX512

2021-04-08 Thread Wenzhuo Lu
Fix a segmentation fault that occurs when mbuf allocation from the pool fails.

Fixes: e6a6a138919f ("net/i40e: add AVX512 vector path")
Cc: sta...@dpdk.org

Reported-by: David Coyle 
Signed-off-by: Wenzhuo Lu 
---
 drivers/net/i40e/i40e_rxtx_vec_avx2.c   | 117 +--
 drivers/net/i40e/i40e_rxtx_vec_avx512.c |   5 +-
 drivers/net/i40e/i40e_rxtx_vec_common.h | 201 
 3 files changed, 207 insertions(+), 116 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 15abd9d..133e2fb 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -18,123 +18,10 @@
 #pragma GCC diagnostic ignored "-Wcast-qual"
 #endif
 
-static inline void
+static __rte_always_inline void
 i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 {
-   int i;
-   uint16_t rx_id;
-   volatile union i40e_rx_desc *rxdp;
-   struct i40e_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
-
-   rxdp = rxq->rx_ring + rxq->rxrearm_start;
-
-   /* Pull 'n' more MBUFs into the software ring */
-   if (rte_mempool_get_bulk(rxq->mp,
-(void *)rxep,
-RTE_I40E_RXQ_REARM_THRESH) < 0) {
-   if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
-   rxq->nb_rx_desc) {
-   __m128i dma_addr0;
-   dma_addr0 = _mm_setzero_si128();
-   for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
-   rxep[i].mbuf = &rxq->fake_mbuf;
-   _mm_store_si128((__m128i *)&rxdp[i].read,
-   dma_addr0);
-   }
-   }
-   rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
-   RTE_I40E_RXQ_REARM_THRESH;
-   return;
-   }
-
-#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
-   struct rte_mbuf *mb0, *mb1;
-   __m128i dma_addr0, dma_addr1;
-   __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
-   RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 2 mbufs in one loop */
-   for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
-   __m128i vaddr0, vaddr1;
-
-   mb0 = rxep[0].mbuf;
-   mb1 = rxep[1].mbuf;
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-
-   /* convert pa to dma_addr hdr/data */
-   dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
-   dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
-
-   /* add headroom to pa values */
-   dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
-   dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
-
-   /* flush desc with pa dma_addr */
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
-   }
-#else
-   struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
-   __m256i dma_addr0_1, dma_addr2_3;
-   __m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 4 mbufs in one loop */
-   for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH;
-   i += 4, rxep += 4, rxdp += 4) {
-   __m128i vaddr0, vaddr1, vaddr2, vaddr3;
-   __m256i vaddr0_1, vaddr2_3;
-
-   mb0 = rxep[0].mbuf;
-   mb1 = rxep[1].mbuf;
-   mb2 = rxep[2].mbuf;
-   mb3 = rxep[3].mbuf;
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-   vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
-   vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
-
-   /*
-* merge 0 & 1, by casting 0 to 256-bit and inserting 1
-* into the high lanes. Similarly for 2 & 3
-*/
-   vaddr0_1 = _mm256_inserti128_si256(
-   _mm256_castsi128_si256(vaddr0), vaddr1, 1);
-   vaddr2_3 = _mm256_inserti128_si256(
-   _mm256_castsi128_si

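The bug class fixed here: the rearm routine kept going even when
rte_mempool_get_bulk() failed, leaving stale mbuf pointers for the NIC to DMA
through. The guard pattern, visible as the moved code above, in a simplified
standalone form (the structures and get_bulk() are stand-ins for the driver's
types and rte_mempool_get_bulk()):

#define REARM_THRESH    8
#define DESCS_PER_LOOP  4

struct mbuf { char pad[128]; };
struct desc { volatile unsigned long long read; };

struct rxq {
    struct mbuf *sw_ring[256];
    struct desc ring[256];
    struct mbuf fake_mbuf;
    unsigned int rearm_start;
    unsigned int rearm_nb;
    unsigned int nb_desc;
};

static int
get_bulk(struct mbuf **slots, unsigned int n)
{
    (void)slots;
    (void)n;
    return -1;   /* stub standing in for rte_mempool_get_bulk() */
}

static void
rxq_rearm(struct rxq *q)
{
    struct mbuf **slots = &q->sw_ring[q->rearm_start];
    unsigned int i;

    if (get_bulk(slots, REARM_THRESH) < 0) {
        /* Allocation failed: if the ring is about to wrap, park the
         * unfilled slots on a per-queue dummy mbuf and zero their
         * descriptors instead of leaving stale pointers behind. */
        if (q->rearm_nb + REARM_THRESH >= q->nb_desc) {
            for (i = 0; i < DESCS_PER_LOOP; i++) {
                slots[i] = &q->fake_mbuf;
                q->ring[q->rearm_start + i].read = 0;
            }
        }
        return;  /* account the failure and retry on a later burst */
    }
    /* ... normal path: program descriptors from the fresh mbufs ... */
}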
[dpdk-dev] [PATCH v4 2/3] net/ice: fix segment fault in AVX512

2021-04-08 Thread Wenzhuo Lu
Fix a segmentation fault that occurs when mbuf allocation from the pool fails.

Fixes: 7f85d5ebcfe1 ("net/ice: add AVX512 vector path")
Cc: sta...@dpdk.org

Reported-by: David Coyle 
Signed-off-by: Wenzhuo Lu 
---
 drivers/net/ice/ice_rxtx_vec_avx2.c   | 120 +---
 drivers/net/ice/ice_rxtx_vec_avx512.c |   5 +-
 drivers/net/ice/ice_rxtx_vec_common.h | 203 ++
 3 files changed, 209 insertions(+), 119 deletions(-)

diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c b/drivers/net/ice/ice_rxtx_vec_avx2.c
index 25efd30..7092275 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx2.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
@@ -10,126 +10,10 @@
 #pragma GCC diagnostic ignored "-Wcast-qual"
 #endif
 
-static inline void
+static __rte_always_inline void
 ice_rxq_rearm(struct ice_rx_queue *rxq)
 {
-   int i;
-   uint16_t rx_id;
-   volatile union ice_rx_flex_desc *rxdp;
-   struct ice_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
-
-   rxdp = rxq->rx_ring + rxq->rxrearm_start;
-
-   /* Pull 'n' more MBUFs into the software ring */
-   if (rte_mempool_get_bulk(rxq->mp,
-(void *)rxep,
-ICE_RXQ_REARM_THRESH) < 0) {
-   if (rxq->rxrearm_nb + ICE_RXQ_REARM_THRESH >=
-   rxq->nb_rx_desc) {
-   __m128i dma_addr0;
-
-   dma_addr0 = _mm_setzero_si128();
-   for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
-   rxep[i].mbuf = &rxq->fake_mbuf;
-   _mm_store_si128((__m128i *)&rxdp[i].read,
-   dma_addr0);
-   }
-   }
-   rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
-   ICE_RXQ_REARM_THRESH;
-   return;
-   }
-
-#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
-   struct rte_mbuf *mb0, *mb1;
-   __m128i dma_addr0, dma_addr1;
-   __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
-   RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 2 mbufs in one loop */
-   for (i = 0; i < ICE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
-   __m128i vaddr0, vaddr1;
-
-   mb0 = rxep[0].mbuf;
-   mb1 = rxep[1].mbuf;
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-
-   /* convert pa to dma_addr hdr/data */
-   dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
-   dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
-
-   /* add headroom to pa values */
-   dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
-   dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
-
-   /* flush desc with pa dma_addr */
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
-   }
-#else
-   struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
-   __m256i dma_addr0_1, dma_addr2_3;
-   __m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 4 mbufs in one loop */
-   for (i = 0; i < ICE_RXQ_REARM_THRESH;
-   i += 4, rxep += 4, rxdp += 4) {
-   __m128i vaddr0, vaddr1, vaddr2, vaddr3;
-   __m256i vaddr0_1, vaddr2_3;
-
-   mb0 = rxep[0].mbuf;
-   mb1 = rxep[1].mbuf;
-   mb2 = rxep[2].mbuf;
-   mb3 = rxep[3].mbuf;
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-   vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
-   vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
-
-   /**
-* merge 0 & 1, by casting 0 to 256-bit and inserting 1
-* into the high lanes. Similarly for 2 & 3
-*/
-   vaddr0_1 =
-   _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
-   vaddr1, 1);
-   vaddr2_3 =
-   _mm256_inserti128_si256(_mm256_castsi128_si256(v

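A detail worth noting in the removed code above: the RTE_BUILD_BUG_ON pins
buf_iova to sit 8 bytes after buf_addr, so a single unaligned 16-byte load
fetches both fields and _mm_unpackhi_epi64 then isolates the IOVA. A
standalone sketch of the trick with a simplified mbuf layout:

#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stddef.h>
#include <stdint.h>

/* Simplified stand-in for the head of struct rte_mbuf. */
struct mini_mbuf {
    void *buf_addr;        /* lo 64 bits of the 16-byte load */
    uint64_t buf_iova;     /* hi 64 bits, 8 bytes after buf_addr */
};

static uint64_t
dma_addr_of(const struct mini_mbuf *mb, uint64_t headroom)
{
    _Static_assert(offsetof(struct mini_mbuf, buf_iova) ==
                   offsetof(struct mini_mbuf, buf_addr) + 8,
                   "buf_iova must follow buf_addr");
    /* one load grabs buf_addr and buf_iova together */
    __m128i v = _mm_loadu_si128((const __m128i *)&mb->buf_addr);
    /* duplicate the high lane so both halves hold the IOVA ... */
    __m128i dma = _mm_unpackhi_epi64(v, v);
    /* ... add the packet headroom, then extract the low lane */
    dma = _mm_add_epi64(dma, _mm_set1_epi64x((long long)headroom));
    return (uint64_t)_mm_cvtsi128_si64(dma);
}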
[dpdk-dev] [PATCH v4 1/3] net/iavf: fix segment fault in AVX512

2021-04-08 Thread Wenzhuo Lu
Fix a segmentation fault that occurs when mbuf allocation from the pool fails.

Fixes: 31737f2b66fb ("net/iavf: enable AVX512 for legacy Rx")
Cc: sta...@dpdk.org

Reported-by: David Coyle 
Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx_vec_avx2.c   | 120 +--
 drivers/net/iavf/iavf_rxtx_vec_avx512.c |   5 +-
 drivers/net/iavf/iavf_rxtx_vec_common.h | 203 
 3 files changed, 209 insertions(+), 119 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx2.c b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
index cdb5139..2c2b139 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx2.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
@@ -10,126 +10,10 @@
 #pragma GCC diagnostic ignored "-Wcast-qual"
 #endif
 
-static inline void
+static __rte_always_inline void
 iavf_rxq_rearm(struct iavf_rx_queue *rxq)
 {
-   int i;
-   uint16_t rx_id;
-   volatile union iavf_rx_desc *rxdp;
-   struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start];
-
-   rxdp = rxq->rx_ring + rxq->rxrearm_start;
-
-   /* Pull 'n' more MBUFs into the software ring */
-   if (rte_mempool_get_bulk(rxq->mp,
-(void *)rxp,
-IAVF_RXQ_REARM_THRESH) < 0) {
-   if (rxq->rxrearm_nb + IAVF_RXQ_REARM_THRESH >=
-   rxq->nb_rx_desc) {
-   __m128i dma_addr0;
-
-   dma_addr0 = _mm_setzero_si128();
-   for (i = 0; i < IAVF_VPMD_DESCS_PER_LOOP; i++) {
-   rxp[i] = &rxq->fake_mbuf;
-   _mm_store_si128((__m128i *)&rxdp[i].read,
-   dma_addr0);
-   }
-   }
-   rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
-   IAVF_RXQ_REARM_THRESH;
-   return;
-   }
-
-#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
-   struct rte_mbuf *mb0, *mb1;
-   __m128i dma_addr0, dma_addr1;
-   __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
-   RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 2 mbufs in one loop */
-   for (i = 0; i < IAVF_RXQ_REARM_THRESH; i += 2, rxp += 2) {
-   __m128i vaddr0, vaddr1;
-
-   mb0 = rxp[0];
-   mb1 = rxp[1];
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-
-   /* convert pa to dma_addr hdr/data */
-   dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
-   dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
-
-   /* add headroom to pa values */
-   dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
-   dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
-
-   /* flush desc with pa dma_addr */
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
-   }
-#else
-   struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
-   __m256i dma_addr0_1, dma_addr2_3;
-   __m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 4 mbufs in one loop */
-   for (i = 0; i < IAVF_RXQ_REARM_THRESH;
-   i += 4, rxp += 4, rxdp += 4) {
-   __m128i vaddr0, vaddr1, vaddr2, vaddr3;
-   __m256i vaddr0_1, vaddr2_3;
-
-   mb0 = rxp[0];
-   mb1 = rxp[1];
-   mb2 = rxp[2];
-   mb3 = rxp[3];
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-   vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
-   vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
-
-   /**
-* merge 0 & 1, by casting 0 to 256-bit and inserting 1
-* into the high lanes. Similarly for 2 & 3
-*/
-   vaddr0_1 =
-   _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
-   vaddr1, 1);
-   vaddr2_3 =
-   _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),
-

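The "merge 0 & 1 ... into the high lanes" comment in these rearm loops
describes packing two 128-bit values into one 256-bit register so that each
store covers a pair of 16-byte descriptors. A short sketch (AVX2); the
32-byte alignment of dst is assumed, as it is for the descriptor ring in the
drivers:

#include <immintrin.h>   /* AVX2 intrinsics */

static void
write_desc_pair(void *dst, __m128i d0, __m128i d1)
{
    /* cast d0 into the low 128 bits, insert d1 into the high 128 bits */
    __m256i pair = _mm256_inserti128_si256(_mm256_castsi128_si256(d0),
                                           d1, 1);
    /* one 32-byte store writes two descriptors */
    _mm256_store_si256((__m256i *)dst, pair);
}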
[dpdk-dev] [PATCH v4 0/3] fix segment fault in avx512 code

2021-04-08 Thread Wenzhuo Lu
Fix a segmentation fault, caused by mbuf allocation failure, in the iavf, ice and i40e drivers.

v2:
 - Drop the duplicate code.

v3:
 - Fix compile error on non-x86 platforms.

v4:
 - Minor performance optimization.

Wenzhuo Lu (3):
  net/iavf: fix segment fault in AVX512
  net/ice: fix segment fault in AVX512
  net/i40e: fix segment fault in AVX512

 drivers/net/i40e/i40e_rxtx_vec_avx2.c   | 117 +-
 drivers/net/i40e/i40e_rxtx_vec_avx512.c |   5 +-
 drivers/net/i40e/i40e_rxtx_vec_common.h | 201 +++
 drivers/net/iavf/iavf_rxtx_vec_avx2.c   | 120 +--
 drivers/net/iavf/iavf_rxtx_vec_avx512.c |   5 +-
 drivers/net/iavf/iavf_rxtx_vec_common.h | 203 
 drivers/net/ice/ice_rxtx_vec_avx2.c | 120 +--
 drivers/net/ice/ice_rxtx_vec_avx512.c   |   5 +-
 drivers/net/ice/ice_rxtx_vec_common.h   | 203 
 9 files changed, 625 insertions(+), 354 deletions(-)

-- 
1.9.3



[dpdk-dev] [PATCH v3 3/3] net/i40e: fix segment fault in AVX512

2021-03-29 Thread Wenzhuo Lu
Fix a segmentation fault that occurs when mbuf allocation from the pool fails.

Fixes: e6a6a138919f ("net/i40e: add AVX512 vector path")
Cc: sta...@dpdk.org

Reported-by: David Coyle 
Signed-off-by: Wenzhuo Lu 
---
 drivers/net/i40e/i40e_rxtx_vec_avx2.c   | 117 +--
 drivers/net/i40e/i40e_rxtx_vec_avx512.c |   5 +-
 drivers/net/i40e/i40e_rxtx_vec_common.h | 201 
 3 files changed, 207 insertions(+), 116 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 15abd9d..133e2fb 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -18,123 +18,10 @@
 #pragma GCC diagnostic ignored "-Wcast-qual"
 #endif
 
-static inline void
+static __rte_always_inline void
 i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 {
-   int i;
-   uint16_t rx_id;
-   volatile union i40e_rx_desc *rxdp;
-   struct i40e_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
-
-   rxdp = rxq->rx_ring + rxq->rxrearm_start;
-
-   /* Pull 'n' more MBUFs into the software ring */
-   if (rte_mempool_get_bulk(rxq->mp,
-(void *)rxep,
-RTE_I40E_RXQ_REARM_THRESH) < 0) {
-   if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
-   rxq->nb_rx_desc) {
-   __m128i dma_addr0;
-   dma_addr0 = _mm_setzero_si128();
-   for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
-   rxep[i].mbuf = &rxq->fake_mbuf;
-   _mm_store_si128((__m128i *)&rxdp[i].read,
-   dma_addr0);
-   }
-   }
-   rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
-   RTE_I40E_RXQ_REARM_THRESH;
-   return;
-   }
-
-#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
-   struct rte_mbuf *mb0, *mb1;
-   __m128i dma_addr0, dma_addr1;
-   __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
-   RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 2 mbufs in one loop */
-   for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
-   __m128i vaddr0, vaddr1;
-
-   mb0 = rxep[0].mbuf;
-   mb1 = rxep[1].mbuf;
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-
-   /* convert pa to dma_addr hdr/data */
-   dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
-   dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
-
-   /* add headroom to pa values */
-   dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
-   dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
-
-   /* flush desc with pa dma_addr */
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
-   }
-#else
-   struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
-   __m256i dma_addr0_1, dma_addr2_3;
-   __m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 4 mbufs in one loop */
-   for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH;
-   i += 4, rxep += 4, rxdp += 4) {
-   __m128i vaddr0, vaddr1, vaddr2, vaddr3;
-   __m256i vaddr0_1, vaddr2_3;
-
-   mb0 = rxep[0].mbuf;
-   mb1 = rxep[1].mbuf;
-   mb2 = rxep[2].mbuf;
-   mb3 = rxep[3].mbuf;
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-   vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
-   vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
-
-   /*
-* merge 0 & 1, by casting 0 to 256-bit and inserting 1
-* into the high lanes. Similarly for 2 & 3
-*/
-   vaddr0_1 = _mm256_inserti128_si256(
-   _mm256_castsi128_si256(vaddr0), vaddr1, 1);
-   vaddr2_3 = _mm256_inserti128_si256(
-   _mm256_castsi128_si

[dpdk-dev] [PATCH v3 2/3] net/ice: fix segment fault in AVX512

2021-03-29 Thread Wenzhuo Lu
Fix a segmentation fault that occurs when mbuf allocation from the pool fails.

Fixes: 7f85d5ebcfe1 ("net/ice: add AVX512 vector path")
Cc: sta...@dpdk.org

Reported-by: David Coyle 
Signed-off-by: Wenzhuo Lu 
---
 drivers/net/ice/ice_rxtx_vec_avx2.c   | 120 +---
 drivers/net/ice/ice_rxtx_vec_avx512.c |   5 +-
 drivers/net/ice/ice_rxtx_vec_common.h | 203 ++
 3 files changed, 209 insertions(+), 119 deletions(-)

diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c b/drivers/net/ice/ice_rxtx_vec_avx2.c
index 1cc5490..8d3cf3e 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx2.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
@@ -10,126 +10,10 @@
 #pragma GCC diagnostic ignored "-Wcast-qual"
 #endif
 
-static inline void
+static __rte_always_inline void
 ice_rxq_rearm(struct ice_rx_queue *rxq)
 {
-   int i;
-   uint16_t rx_id;
-   volatile union ice_rx_flex_desc *rxdp;
-   struct ice_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
-
-   rxdp = rxq->rx_ring + rxq->rxrearm_start;
-
-   /* Pull 'n' more MBUFs into the software ring */
-   if (rte_mempool_get_bulk(rxq->mp,
-(void *)rxep,
-ICE_RXQ_REARM_THRESH) < 0) {
-   if (rxq->rxrearm_nb + ICE_RXQ_REARM_THRESH >=
-   rxq->nb_rx_desc) {
-   __m128i dma_addr0;
-
-   dma_addr0 = _mm_setzero_si128();
-   for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
-   rxep[i].mbuf = &rxq->fake_mbuf;
-   _mm_store_si128((__m128i *)&rxdp[i].read,
-   dma_addr0);
-   }
-   }
-   rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
-   ICE_RXQ_REARM_THRESH;
-   return;
-   }
-
-#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
-   struct rte_mbuf *mb0, *mb1;
-   __m128i dma_addr0, dma_addr1;
-   __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
-   RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 2 mbufs in one loop */
-   for (i = 0; i < ICE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
-   __m128i vaddr0, vaddr1;
-
-   mb0 = rxep[0].mbuf;
-   mb1 = rxep[1].mbuf;
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-
-   /* convert pa to dma_addr hdr/data */
-   dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
-   dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
-
-   /* add headroom to pa values */
-   dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
-   dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
-
-   /* flush desc with pa dma_addr */
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
-   }
-#else
-   struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
-   __m256i dma_addr0_1, dma_addr2_3;
-   __m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 4 mbufs in one loop */
-   for (i = 0; i < ICE_RXQ_REARM_THRESH;
-   i += 4, rxep += 4, rxdp += 4) {
-   __m128i vaddr0, vaddr1, vaddr2, vaddr3;
-   __m256i vaddr0_1, vaddr2_3;
-
-   mb0 = rxep[0].mbuf;
-   mb1 = rxep[1].mbuf;
-   mb2 = rxep[2].mbuf;
-   mb3 = rxep[3].mbuf;
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-   vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
-   vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
-
-   /**
-* merge 0 & 1, by casting 0 to 256-bit and inserting 1
-* into the high lanes. Similarly for 2 & 3
-*/
-   vaddr0_1 =
-   _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
-   vaddr1, 1);
-   vaddr2_3 =
-   _mm256_inserti128_si256(_mm256_castsi128_si256(v

[dpdk-dev] [PATCH v3 1/3] net/iavf: fix segment fault in AVX512

2021-03-29 Thread Wenzhuo Lu
Fix a segmentation fault that occurs when mbuf allocation from the pool fails.

Fixes: 31737f2b66fb ("net/iavf: enable AVX512 for legacy Rx")
Cc: sta...@dpdk.org

Reported-by: David Coyle 
Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx_vec_avx2.c   | 120 +--
 drivers/net/iavf/iavf_rxtx_vec_avx512.c |   5 +-
 drivers/net/iavf/iavf_rxtx_vec_common.h | 203 
 3 files changed, 209 insertions(+), 119 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx2.c b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
index cdb5139..2c2b139 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx2.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
@@ -10,126 +10,10 @@
 #pragma GCC diagnostic ignored "-Wcast-qual"
 #endif
 
-static inline void
+static __rte_always_inline void
 iavf_rxq_rearm(struct iavf_rx_queue *rxq)
 {
-   int i;
-   uint16_t rx_id;
-   volatile union iavf_rx_desc *rxdp;
-   struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start];
-
-   rxdp = rxq->rx_ring + rxq->rxrearm_start;
-
-   /* Pull 'n' more MBUFs into the software ring */
-   if (rte_mempool_get_bulk(rxq->mp,
-(void *)rxp,
-IAVF_RXQ_REARM_THRESH) < 0) {
-   if (rxq->rxrearm_nb + IAVF_RXQ_REARM_THRESH >=
-   rxq->nb_rx_desc) {
-   __m128i dma_addr0;
-
-   dma_addr0 = _mm_setzero_si128();
-   for (i = 0; i < IAVF_VPMD_DESCS_PER_LOOP; i++) {
-   rxp[i] = &rxq->fake_mbuf;
-   _mm_store_si128((__m128i *)&rxdp[i].read,
-   dma_addr0);
-   }
-   }
-   rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
-   IAVF_RXQ_REARM_THRESH;
-   return;
-   }
-
-#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
-   struct rte_mbuf *mb0, *mb1;
-   __m128i dma_addr0, dma_addr1;
-   __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
-   RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 2 mbufs in one loop */
-   for (i = 0; i < IAVF_RXQ_REARM_THRESH; i += 2, rxp += 2) {
-   __m128i vaddr0, vaddr1;
-
-   mb0 = rxp[0];
-   mb1 = rxp[1];
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-
-   /* convert pa to dma_addr hdr/data */
-   dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
-   dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
-
-   /* add headroom to pa values */
-   dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
-   dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
-
-   /* flush desc with pa dma_addr */
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
-   }
-#else
-   struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
-   __m256i dma_addr0_1, dma_addr2_3;
-   __m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 4 mbufs in one loop */
-   for (i = 0; i < IAVF_RXQ_REARM_THRESH;
-   i += 4, rxp += 4, rxdp += 4) {
-   __m128i vaddr0, vaddr1, vaddr2, vaddr3;
-   __m256i vaddr0_1, vaddr2_3;
-
-   mb0 = rxp[0];
-   mb1 = rxp[1];
-   mb2 = rxp[2];
-   mb3 = rxp[3];
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-   vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
-   vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
-
-   /**
-* merge 0 & 1, by casting 0 to 256-bit and inserting 1
-* into the high lanes. Similarly for 2 & 3
-*/
-   vaddr0_1 =
-   _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
-   vaddr1, 1);
-   vaddr2_3 =
-   _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),
-

[dpdk-dev] [PATCH v3 0/3] fix segment fault in avx512 code

2021-03-29 Thread Wenzhuo Lu
Fix a segmentation fault, caused by mbuf allocation failure, in the iavf, ice and i40e drivers.

v2:
 - Drop the duplicate code.

v3:
 - Fix compile error on non-x86 platforms.

Wenzhuo Lu (3):
  net/iavf: fix segment fault in AVX512
  net/ice: fix segment fault in AVX512
  net/i40e: fix segment fault in AVX512

 drivers/net/i40e/i40e_rxtx_vec_avx2.c   | 117 +-
 drivers/net/i40e/i40e_rxtx_vec_avx512.c |   5 +-
 drivers/net/i40e/i40e_rxtx_vec_common.h | 201 +++
 drivers/net/iavf/iavf_rxtx_vec_avx2.c   | 120 +--
 drivers/net/iavf/iavf_rxtx_vec_avx512.c |   5 +-
 drivers/net/iavf/iavf_rxtx_vec_common.h | 203 
 drivers/net/ice/ice_rxtx_vec_avx2.c | 120 +--
 drivers/net/ice/ice_rxtx_vec_avx512.c   |   5 +-
 drivers/net/ice/ice_rxtx_vec_common.h   | 203 
 9 files changed, 625 insertions(+), 354 deletions(-)

-- 
1.9.3



[dpdk-dev] [PATCH v2 3/3] net/i40e: fix segment fault in AVX512

2021-03-29 Thread Wenzhuo Lu
Fix a segmentation fault that occurs when mbuf allocation from the pool fails.

Fixes: e6a6a138919f ("net/i40e: add AVX512 vector path")
Cc: sta...@dpdk.org

Reported-by: David Coyle 
Signed-off-by: Wenzhuo Lu 
---
 drivers/net/i40e/i40e_rxtx_vec_avx2.c   | 117 +--
 drivers/net/i40e/i40e_rxtx_vec_avx512.c |   5 +-
 drivers/net/i40e/i40e_rxtx_vec_common.h | 199 
 3 files changed, 205 insertions(+), 116 deletions(-)

diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx2.c b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
index 15abd9d..133e2fb 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx2.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx2.c
@@ -18,123 +18,10 @@
 #pragma GCC diagnostic ignored "-Wcast-qual"
 #endif
 
-static inline void
+static __rte_always_inline void
 i40e_rxq_rearm(struct i40e_rx_queue *rxq)
 {
-   int i;
-   uint16_t rx_id;
-   volatile union i40e_rx_desc *rxdp;
-   struct i40e_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
-
-   rxdp = rxq->rx_ring + rxq->rxrearm_start;
-
-   /* Pull 'n' more MBUFs into the software ring */
-   if (rte_mempool_get_bulk(rxq->mp,
-(void *)rxep,
-RTE_I40E_RXQ_REARM_THRESH) < 0) {
-   if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
-   rxq->nb_rx_desc) {
-   __m128i dma_addr0;
-   dma_addr0 = _mm_setzero_si128();
-   for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
-   rxep[i].mbuf = &rxq->fake_mbuf;
-   _mm_store_si128((__m128i *)&rxdp[i].read,
-   dma_addr0);
-   }
-   }
-   rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
-   RTE_I40E_RXQ_REARM_THRESH;
-   return;
-   }
-
-#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
-   struct rte_mbuf *mb0, *mb1;
-   __m128i dma_addr0, dma_addr1;
-   __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
-   RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 2 mbufs in one loop */
-   for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
-   __m128i vaddr0, vaddr1;
-
-   mb0 = rxep[0].mbuf;
-   mb1 = rxep[1].mbuf;
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-
-   /* convert pa to dma_addr hdr/data */
-   dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
-   dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
-
-   /* add headroom to pa values */
-   dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
-   dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
-
-   /* flush desc with pa dma_addr */
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
-   }
-#else
-   struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
-   __m256i dma_addr0_1, dma_addr2_3;
-   __m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 4 mbufs in one loop */
-   for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH;
-   i += 4, rxep += 4, rxdp += 4) {
-   __m128i vaddr0, vaddr1, vaddr2, vaddr3;
-   __m256i vaddr0_1, vaddr2_3;
-
-   mb0 = rxep[0].mbuf;
-   mb1 = rxep[1].mbuf;
-   mb2 = rxep[2].mbuf;
-   mb3 = rxep[3].mbuf;
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-   vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
-   vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
-
-   /*
-* merge 0 & 1, by casting 0 to 256-bit and inserting 1
-* into the high lanes. Similarly for 2 & 3
-*/
-   vaddr0_1 = _mm256_inserti128_si256(
-   _mm256_castsi128_si256(vaddr0), vaddr1, 1);
-   vaddr2_3 = _mm256_inserti128_si256(
-   _mm256_castsi128_si

[dpdk-dev] [PATCH v2 2/3] net/ice: fix segment fault in AVX512

2021-03-29 Thread Wenzhuo Lu
Fix a segmentation fault that occurs when mbuf allocation from the pool fails.

Fixes: 7f85d5ebcfe1 ("net/ice: add AVX512 vector path")
Cc: sta...@dpdk.org

Reported-by: David Coyle 
Signed-off-by: Wenzhuo Lu 
---
 drivers/net/ice/ice_rxtx_vec_avx2.c   | 120 +---
 drivers/net/ice/ice_rxtx_vec_avx512.c |   5 +-
 drivers/net/ice/ice_rxtx_vec_common.h | 201 ++
 3 files changed, 207 insertions(+), 119 deletions(-)

diff --git a/drivers/net/ice/ice_rxtx_vec_avx2.c b/drivers/net/ice/ice_rxtx_vec_avx2.c
index 1cc5490..8d3cf3e 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx2.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx2.c
@@ -10,126 +10,10 @@
 #pragma GCC diagnostic ignored "-Wcast-qual"
 #endif
 
-static inline void
+static __rte_always_inline void
 ice_rxq_rearm(struct ice_rx_queue *rxq)
 {
-   int i;
-   uint16_t rx_id;
-   volatile union ice_rx_flex_desc *rxdp;
-   struct ice_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start];
-
-   rxdp = rxq->rx_ring + rxq->rxrearm_start;
-
-   /* Pull 'n' more MBUFs into the software ring */
-   if (rte_mempool_get_bulk(rxq->mp,
-(void *)rxep,
-ICE_RXQ_REARM_THRESH) < 0) {
-   if (rxq->rxrearm_nb + ICE_RXQ_REARM_THRESH >=
-   rxq->nb_rx_desc) {
-   __m128i dma_addr0;
-
-   dma_addr0 = _mm_setzero_si128();
-   for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
-   rxep[i].mbuf = &rxq->fake_mbuf;
-   _mm_store_si128((__m128i *)&rxdp[i].read,
-   dma_addr0);
-   }
-   }
-   rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
-   ICE_RXQ_REARM_THRESH;
-   return;
-   }
-
-#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
-   struct rte_mbuf *mb0, *mb1;
-   __m128i dma_addr0, dma_addr1;
-   __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
-   RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 2 mbufs in one loop */
-   for (i = 0; i < ICE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
-   __m128i vaddr0, vaddr1;
-
-   mb0 = rxep[0].mbuf;
-   mb1 = rxep[1].mbuf;
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-
-   /* convert pa to dma_addr hdr/data */
-   dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
-   dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
-
-   /* add headroom to pa values */
-   dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
-   dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
-
-   /* flush desc with pa dma_addr */
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
-   }
-#else
-   struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
-   __m256i dma_addr0_1, dma_addr2_3;
-   __m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 4 mbufs in one loop */
-   for (i = 0; i < ICE_RXQ_REARM_THRESH;
-   i += 4, rxep += 4, rxdp += 4) {
-   __m128i vaddr0, vaddr1, vaddr2, vaddr3;
-   __m256i vaddr0_1, vaddr2_3;
-
-   mb0 = rxep[0].mbuf;
-   mb1 = rxep[1].mbuf;
-   mb2 = rxep[2].mbuf;
-   mb3 = rxep[3].mbuf;
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-   vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
-   vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
-
-   /**
-* merge 0 & 1, by casting 0 to 256-bit and inserting 1
-* into the high lanes. Similarly for 2 & 3
-*/
-   vaddr0_1 =
-   _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
-   vaddr1, 1);
-   vaddr2_3 =
-   _mm256_inserti128_si256(_mm256_castsi128_si256(v

[dpdk-dev] [PATCH v2 1/3] net/iavf: fix segment fault in AVX512

2021-03-29 Thread Wenzhuo Lu
Fix a segmentation fault that occurs when mbuf allocation from the pool fails.

Fixes: 31737f2b66fb ("net/iavf: enable AVX512 for legacy Rx")
Cc: sta...@dpdk.org

Reported-by: David Coyle 
Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx_vec_avx2.c   | 120 +--
 drivers/net/iavf/iavf_rxtx_vec_avx512.c |   5 +-
 drivers/net/iavf/iavf_rxtx_vec_common.h | 201 
 3 files changed, 207 insertions(+), 119 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx2.c b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
index cdb5139..2c2b139 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx2.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx2.c
@@ -10,126 +10,10 @@
 #pragma GCC diagnostic ignored "-Wcast-qual"
 #endif
 
-static inline void
+static __rte_always_inline void
 iavf_rxq_rearm(struct iavf_rx_queue *rxq)
 {
-   int i;
-   uint16_t rx_id;
-   volatile union iavf_rx_desc *rxdp;
-   struct rte_mbuf **rxp = &rxq->sw_ring[rxq->rxrearm_start];
-
-   rxdp = rxq->rx_ring + rxq->rxrearm_start;
-
-   /* Pull 'n' more MBUFs into the software ring */
-   if (rte_mempool_get_bulk(rxq->mp,
-(void *)rxp,
-IAVF_RXQ_REARM_THRESH) < 0) {
-   if (rxq->rxrearm_nb + IAVF_RXQ_REARM_THRESH >=
-   rxq->nb_rx_desc) {
-   __m128i dma_addr0;
-
-   dma_addr0 = _mm_setzero_si128();
-   for (i = 0; i < IAVF_VPMD_DESCS_PER_LOOP; i++) {
-   rxp[i] = &rxq->fake_mbuf;
-   _mm_store_si128((__m128i *)&rxdp[i].read,
-   dma_addr0);
-   }
-   }
-   rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
-   IAVF_RXQ_REARM_THRESH;
-   return;
-   }
-
-#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
-   struct rte_mbuf *mb0, *mb1;
-   __m128i dma_addr0, dma_addr1;
-   __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
-   RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 2 mbufs in one loop */
-   for (i = 0; i < IAVF_RXQ_REARM_THRESH; i += 2, rxp += 2) {
-   __m128i vaddr0, vaddr1;
-
-   mb0 = rxp[0];
-   mb1 = rxp[1];
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-
-   /* convert pa to dma_addr hdr/data */
-   dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
-   dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
-
-   /* add headroom to pa values */
-   dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
-   dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
-
-   /* flush desc with pa dma_addr */
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
-   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
-   }
-#else
-   struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
-   __m256i dma_addr0_1, dma_addr2_3;
-   __m256i hdr_room = _mm256_set1_epi64x(RTE_PKTMBUF_HEADROOM);
-   /* Initialize the mbufs in vector, process 4 mbufs in one loop */
-   for (i = 0; i < IAVF_RXQ_REARM_THRESH;
-   i += 4, rxp += 4, rxdp += 4) {
-   __m128i vaddr0, vaddr1, vaddr2, vaddr3;
-   __m256i vaddr0_1, vaddr2_3;
-
-   mb0 = rxp[0];
-   mb1 = rxp[1];
-   mb2 = rxp[2];
-   mb3 = rxp[3];
-
-   /* load buf_addr(lo 64bit) and buf_iova(hi 64bit) */
-   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
-   offsetof(struct rte_mbuf, buf_addr) + 8);
-   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
-   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
-   vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
-   vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
-
-   /**
-* merge 0 & 1, by casting 0 to 256-bit and inserting 1
-* into the high lanes. Similarly for 2 & 3
-*/
-   vaddr0_1 =
-   _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr0),
-   vaddr1, 1);
-   vaddr2_3 =
-   _mm256_inserti128_si256(_mm256_castsi128_si256(vaddr2),
-

[dpdk-dev] [PATCH v2 0/3] fix segment fault in avx512

2021-03-29 Thread Wenzhuo Lu
Fix a segmentation fault, caused by mbuf allocation failure, in the iavf, ice and i40e drivers.

v2:
 - Drop the duplicate code.

Wenzhuo Lu (3):
  net/iavf: fix segment fault in AVX512
  net/ice: fix segment fault in AVX512
  net/i40e: fix segment fault in AVX512

 drivers/net/i40e/i40e_rxtx_vec_avx2.c   | 117 +--
 drivers/net/i40e/i40e_rxtx_vec_avx512.c |   5 +-
 drivers/net/i40e/i40e_rxtx_vec_common.h | 199 +++
 drivers/net/iavf/iavf_rxtx_vec_avx2.c   | 120 +--
 drivers/net/iavf/iavf_rxtx_vec_avx512.c |   5 +-
 drivers/net/iavf/iavf_rxtx_vec_common.h | 201 
 drivers/net/ice/ice_rxtx_vec_avx2.c | 120 +--
 drivers/net/ice/ice_rxtx_vec_avx512.c   |   5 +-
 drivers/net/ice/ice_rxtx_vec_common.h   | 201 
 9 files changed, 619 insertions(+), 354 deletions(-)

-- 
1.9.3



[dpdk-dev] [PATCH v3 4/4] net/iavf: add offload path for Rx AVX512 flex desc

2021-03-25 Thread Wenzhuo Lu
Add a specific Rx AVX512 path for the flexible descriptor format.
This path supports the HW offload features checksum, VLAN stripping
and RSS hash, and is chosen automatically according to the
configuration.

'inline' is used so that the compiler generates the duplicated
path bodies instead of them being maintained by hand.

Signed-off-by: Wenzhuo Lu 
---
 doc/guides/rel_notes/release_21_05.rst  |   7 +
 drivers/net/iavf/iavf_rxtx.c|  27 +-
 drivers/net/iavf/iavf_rxtx.h|   6 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 436 ++--
 4 files changed, 283 insertions(+), 193 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_05.rst b/doc/guides/rel_notes/release_21_05.rst
index 21dc6d2..a7769f8 100644
--- a/doc/guides/rel_notes/release_21_05.rst
+++ b/doc/guides/rel_notes/release_21_05.rst
@@ -83,6 +83,13 @@ New Features
   * Added command to display Rx queue used descriptor count.
 ``show port (port_id) rxq (queue_id) desc used count``
 
+* **Added the offload paths for IAVF AVX512.**
+
+  * Added the new RX and TX paths to use the HW offload features. When the HW
+offload features are configured to be used, the offload paths are chosen
+automatically.
+  * The code of HW offload features is removed from the legacy paths.
+
 
 Removed Items
 -
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 1a03d97..d72393c 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2395,11 +2395,8 @@
goto normal;
 
if (vf->vf_res->vf_cap_flags &
-   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
use_flex = true;
-   if (use_avx512 && check_ret == IAVF_VECTOR_OFFLOAD_PATH)
-   use_flex = false;
-   }
 
for (i = 0; i < dev->data->nb_rx_queues; i++) {
rxq = dev->data->rx_queues[i];
@@ -2427,9 +2424,14 @@
iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
iavf_recv_scattered_pkts_vec_flex_rxd;
 #ifdef CC_AVX512_SUPPORT
-   if (use_avx512)
-   dev->rx_pkt_burst =
-   iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+   if (use_avx512) {
+   if (check_ret == IAVF_VECTOR_PATH)
+   dev->rx_pkt_burst =
+   iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+   else
+   dev->rx_pkt_burst =
+   iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload;
+   }
 #endif
} else {
dev->rx_pkt_burst = use_avx2 ?
@@ -2466,9 +2468,14 @@
iavf_recv_pkts_vec_avx2_flex_rxd :
iavf_recv_pkts_vec_flex_rxd;
 #ifdef CC_AVX512_SUPPORT
-   if (use_avx512)
-   dev->rx_pkt_burst =
-   iavf_recv_pkts_vec_avx512_flex_rxd;
+   if (use_avx512) {
+   if (check_ret == IAVF_VECTOR_PATH)
+   dev->rx_pkt_burst =
+   iavf_recv_pkts_vec_avx512_flex_rxd;
+   else
+   dev->rx_pkt_burst =
+   iavf_recv_pkts_vec_avx512_flex_rxd_offload;
+   }
 #endif
} else {
dev->rx_pkt_burst = use_avx2 ?
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index d0e5909..fb16d18 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -495,6 +495,9 @@ uint16_t iavf_recv_pkts_vec_avx512_offload(void *rx_queue,
 uint16_t iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue,
struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+   struct rte_mbuf **rx_pkts,
+   uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 struct rte_mbuf **rx_pkts,
 

[dpdk-dev] [PATCH v3 3/4] net/iavf: add offload path for Rx AVX512

2021-03-25 Thread Wenzhuo Lu
Add a specific Rx AVX512 path for the traditional descriptor format.
This path supports the HW offload features checksum, VLAN stripping
and RSS hash, and is chosen automatically according to the
configuration.

'inline' is used so that the compiler generates the duplicated
path bodies instead of them being maintained by hand.

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx.c| 105 +++---
 drivers/net/iavf/iavf_rxtx.h|  12 ++
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 355 
 drivers/net/iavf/iavf_rxtx_vec_common.h |  17 +-
 4 files changed, 325 insertions(+), 164 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 4d9b3b9..1a03d97 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2367,22 +2367,23 @@
 #ifdef RTE_ARCH_X86
struct iavf_rx_queue *rxq;
int i;
+   int check_ret;
+   bool use_sse = false;
bool use_avx2 = false;
-#ifdef CC_AVX512_SUPPORT
bool use_avx512 = false;
-#endif
+   bool use_flex = false;
 
-   if (!iavf_rx_vec_dev_check(dev) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-   for (i = 0; i < dev->data->nb_rx_queues; i++) {
-   rxq = dev->data->rx_queues[i];
-   (void)iavf_rxq_vec_setup(rxq);
+   check_ret = iavf_rx_vec_dev_check(dev);
+   if (check_ret >= 0 &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+   if (check_ret == IAVF_VECTOR_PATH) {
+   use_sse = true;
+   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+   use_avx2 = true;
}
 
-   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-   use_avx2 = true;
 #ifdef CC_AVX512_SUPPORT
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2390,13 +2391,38 @@
use_avx512 = true;
 #endif
 
+   if (!use_sse && !use_avx2 && !use_avx512)
+   goto normal;
+
+   if (vf->vf_res->vf_cap_flags &
+   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+   use_flex = true;
+   if (use_avx512 && check_ret == IAVF_VECTOR_OFFLOAD_PATH)
+   use_flex = false;
+   }
+
+   for (i = 0; i < dev->data->nb_rx_queues; i++) {
+   rxq = dev->data->rx_queues[i];
+   (void)iavf_rxq_vec_setup(rxq);
+   }
+
if (dev->data->scattered_rx) {
-   PMD_DRV_LOG(DEBUG,
-   "Using %sVector Scattered Rx (port %d).",
-   use_avx2 ? "avx2 " : "",
-   dev->data->port_id);
-   if (vf->vf_res->vf_cap_flags &
-   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+   if (!use_avx512) {
+   PMD_DRV_LOG(DEBUG,
+   "Using %sVector Scattered Rx (port %d).",
+   use_avx2 ? "avx2 " : "",
+   dev->data->port_id);
+   } else {
+   if (check_ret == IAVF_VECTOR_PATH)
+   PMD_DRV_LOG(DEBUG,
+   "Using AVX512 Vector Scattered Rx (port %d).",
+   dev->data->port_id);
+   else
+   PMD_DRV_LOG(DEBUG,
+   "Using AVX512 OFFLOAD Vector Scattered Rx (port %d).",
+   dev->data->port_id);
+   }
+   if (use_flex) {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
iavf_recv_scattered_pkts_vec_flex_rxd;
@@ -2410,17 +2436,32 @@
 

[dpdk-dev] [PATCH v3 2/4] net/iavf: add offload path for Tx AVX512

2021-03-25 Thread Wenzhuo Lu
Add a specific Tx AVX512 path.
This path supports the HW offload features checksum insertion and
VLAN insertion, and is chosen automatically according to the
configuration.

'inline' is used so that the compiler generates the duplicated
path bodies instead of them being maintained by hand.

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx.c|  57 +++--
 drivers/net/iavf/iavf_rxtx.h|  14 +++-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 110 +++-
 drivers/net/iavf/iavf_rxtx_vec_common.h |  98 ++--
 4 files changed, 210 insertions(+), 69 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index bf1064d..4d9b3b9 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -135,7 +135,7 @@
 static inline bool
 check_tx_vec_allow(struct iavf_tx_queue *txq)
 {
-   if (!(txq->offloads & IAVF_NO_VECTOR_FLAGS) &&
+   if (!(txq->offloads & IAVF_TX_NO_VECTOR_FLAGS) &&
txq->rs_thresh >= IAVF_VPMD_TX_MAX_BURST &&
txq->rs_thresh <= IAVF_VPMD_TX_MAX_FREE_BUF) {
PMD_INIT_LOG(DEBUG, "Vector tx can be enabled on this txq.");
@@ -2473,17 +2473,23 @@
 #ifdef RTE_ARCH_X86
struct iavf_tx_queue *txq;
int i;
+   int check_ret;
+   bool use_sse = false;
bool use_avx2 = false;
-#ifdef CC_AVX512_SUPPORT
bool use_avx512 = false;
-#endif
 
-   if (!iavf_tx_vec_dev_check(dev) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-   use_avx2 = true;
+   check_ret = iavf_tx_vec_dev_check(dev);
+
+   if (check_ret >= 0 &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+   /* SSE and AVX2 not support offload path yet. */
+   if (check_ret == IAVF_VECTOR_PATH) {
+   use_sse = true;
+   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+   use_avx2 = true;
+   }
 #ifdef CC_AVX512_SUPPORT
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2491,15 +2497,29 @@
use_avx512 = true;
 #endif
 
-   PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
-   use_avx2 ? "avx2 " : "",
-   dev->data->port_id);
-   dev->tx_pkt_burst = use_avx2 ?
-   iavf_xmit_pkts_vec_avx2 :
-   iavf_xmit_pkts_vec;
+   if (!use_sse && !use_avx2 && !use_avx512)
+   goto normal;
+
+   if (!use_avx512) {
+   PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
+   use_avx2 ? "avx2 " : "",
+   dev->data->port_id);
+   dev->tx_pkt_burst = use_avx2 ?
+   iavf_xmit_pkts_vec_avx2 :
+   iavf_xmit_pkts_vec;
+   }
 #ifdef CC_AVX512_SUPPORT
-   if (use_avx512)
-   dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+   if (use_avx512) {
+   if (check_ret == IAVF_VECTOR_PATH) {
+   dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+   PMD_DRV_LOG(DEBUG, "Using AVX512 Vector Tx 
(port %d).",
+   dev->data->port_id);
+   } else {
+   dev->tx_pkt_burst = 
iavf_xmit_pkts_vec_avx512_offload;
+   PMD_DRV_LOG(DEBUG, "Using AVX512 OFFLOAD Vector 
Tx (port %d).",
+   dev->data->port_id);
+   }
+   }
 #endif
dev->tx_pkt_prepare = NULL;
 
@@ -2519,8 +2539,9 @@
 
return;
}
-#endif
 
+normal:
+#endif
PMD_DRV_LOG(DEBUG, "Using Basic Tx callback (port=%d).",
dev->data->port_id);
dev->tx_pkt_burst = iavf_xmit_pkts;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/dri

[dpdk-dev] [PATCH v3 1/4] net/iavf: store offload flag of Rx queue

2021-03-25 Thread Wenzhuo Lu
Add the offload flag to the RX queue structure so that the
code knows which offload features are set.
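
A tiny sketch of how such a per-queue flag is typically consumed at path
selection time (the mask name and bit values here are invented for
illustration, not the driver's real ones):

	#include <stdint.h>

	/* illustrative mask of Rx offloads a vector path cannot handle */
	#define RX_NO_VECTOR_FLAGS 0x30ULL

	/* veto the vector path for this queue if any such bit is set */
	static int
	rx_vec_queue_check(uint64_t queue_offloads)
	{
		return (queue_offloads & RX_NO_VECTOR_FLAGS) ? -1 : 0;
	}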

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx.c | 4 
 drivers/net/iavf/iavf_rxtx.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 8fafe45..bf1064d 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -498,9 +498,12 @@
uint8_t proto_xtr;
uint16_t len;
uint16_t rx_free_thresh;
+   uint64_t offloads;
 
PMD_INIT_FUNC_TRACE();
 
+   offloads = rx_conf->offloads | dev->data->dev_conf.rxmode.offloads;
+
if (nb_desc % IAVF_ALIGN_RING_DESC != 0 ||
nb_desc > IAVF_MAX_RING_DESC ||
nb_desc < IAVF_MIN_RING_DESC) {
@@ -571,6 +574,7 @@
rxq->rx_deferred_start = rx_conf->rx_deferred_start;
rxq->rx_hdr_len = 0;
rxq->vsi = vsi;
+   rxq->offloads = offloads;
 
if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)
rxq->crc_len = RTE_ETHER_CRC_LEN;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 922ddad..06ff528 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -198,6 +198,7 @@ struct iavf_rx_queue {
/* flexible descriptor metadata extraction offload flag */
iavf_rxd_to_pkt_fields_t rxd_to_pkt_fields;
/* handle flexible descriptor by RXDID */
+   uint64_t offloads;
 };
 
 struct iavf_tx_entry {
-- 
1.9.3



[dpdk-dev] [PATCH v3 0/4] add Rx/Tx offload paths for IAVF AVX512

2021-03-25 Thread Wenzhuo Lu
Add specific paths for RX/TX AVX512, called offload paths.
These paths support the HW offload features, such as
checksum, VLAN, and RSS offload.
They are chosen automatically according to the
configuration.

v2:
 - Fixed compile error.

v3:
 - Used 'inline' to drop the duplicate code.
 - Some minor changes.

Wenzhuo Lu (4):
  net/iavf: store offload flag of Rx queue
  net/iavf: add offload path for Tx AVX512
  net/iavf: add offload path for Rx AVX512
  net/iavf: add offload path for Rx AVX512 flex desc

 doc/guides/rel_notes/release_21_05.rst  |   7 +
 drivers/net/iavf/iavf_rxtx.c| 187 +--
 drivers/net/iavf/iavf_rxtx.h|  33 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 901 +++-
 drivers/net/iavf/iavf_rxtx_vec_common.h | 115 +++-
 5 files changed, 820 insertions(+), 423 deletions(-)

-- 
1.9.3



[dpdk-dev] [PATCH v2 4/4] net/iavf: add offload path for Rx AVX512 flex desc

2021-03-17 Thread Wenzhuo Lu
Add a specific path for RX AVX512 (flexible descriptor).
In this path, the HW offload features are supported, such as
checksum, VLAN stripping, and RSS hash.
This path is chosen automatically according to the
configuration.

All the code for the above HW offload features is removed
from the legacy path.
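
As a hedged illustration of the automatic selection this series performs
(function names invented for the sketch, not the driver's real symbols),
the configure step can reduce to a small pure decision over two flags:

	#include <stdint.h>

	typedef uint16_t (*rx_burst_t)(void *rxq, void **pkts, uint16_t nb);

	/* the four AVX512 variants; declarations only, for the sketch */
	uint16_t rx_avx512(void *, void **, uint16_t);
	uint16_t rx_avx512_offload(void *, void **, uint16_t);
	uint16_t rx_avx512_flex(void *, void **, uint16_t);
	uint16_t rx_avx512_flex_offload(void *, void **, uint16_t);

	/* pick one burst function from (flex descriptor?, offloads needed?) */
	static rx_burst_t
	pick_rx_burst(int use_flex, int need_offload)
	{
		if (use_flex)
			return need_offload ? rx_avx512_flex_offload
					    : rx_avx512_flex;
		return need_offload ? rx_avx512_offload : rx_avx512;
	}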

Signed-off-by: Wenzhuo Lu 
---
 doc/guides/rel_notes/release_21_05.rst  |   7 +
 drivers/net/iavf/iavf_rxtx.c|  27 +-
 drivers/net/iavf/iavf_rxtx.h|   6 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 899 +---
 4 files changed, 746 insertions(+), 193 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_05.rst 
b/doc/guides/rel_notes/release_21_05.rst
index 21dc6d2..a7769f8 100644
--- a/doc/guides/rel_notes/release_21_05.rst
+++ b/doc/guides/rel_notes/release_21_05.rst
@@ -83,6 +83,13 @@ New Features
   * Added command to display Rx queue used descriptor count.
 ``show port (port_id) rxq (queue_id) desc used count``
 
+* **Added the offload paths for IAVF AVX512.**
+
+  * Added the new RX and TX paths to use the HW offload features. When the HW
+offload features are configured to be used, the offload paths are chosen
+automatically.
+  * The code of HW offload features is removed from the legacy paths.
+
 
 Removed Items
 -
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 9b3f8be..5aeb19d 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2395,11 +2395,8 @@
goto normal;
 
if (vf->vf_res->vf_cap_flags &
-   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
use_flex = true;
-   if (use_avx512 && check_ret == IAVF_VECTOR_OFFLOAD_PATH)
-   use_flex = false;
-   }
 
for (i = 0; i < dev->data->nb_rx_queues; i++) {
rxq = dev->data->rx_queues[i];
@@ -2417,9 +2414,14 @@
				iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
iavf_recv_scattered_pkts_vec_flex_rxd;
 #ifdef CC_AVX512_SUPPORT
-   if (use_avx512)
-   dev->rx_pkt_burst =
-			iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+		if (use_avx512) {
+			if (check_ret == IAVF_VECTOR_PATH)
+				dev->rx_pkt_burst =
+					iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+			else
+				dev->rx_pkt_burst =
+					iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload;
+   }
 #endif
} else {
dev->rx_pkt_burst = use_avx2 ?
@@ -2446,9 +2448,14 @@
iavf_recv_pkts_vec_avx2_flex_rxd :
iavf_recv_pkts_vec_flex_rxd;
 #ifdef CC_AVX512_SUPPORT
-   if (use_avx512)
-   dev->rx_pkt_burst =
-			iavf_recv_pkts_vec_avx512_flex_rxd;
+		if (use_avx512) {
+			if (check_ret == IAVF_VECTOR_PATH)
+				dev->rx_pkt_burst =
+					iavf_recv_pkts_vec_avx512_flex_rxd;
+			else
+				dev->rx_pkt_burst =
+					iavf_recv_pkts_vec_avx512_flex_rxd_offload;
+   }
 #endif
} else {
dev->rx_pkt_burst = use_avx2 ?
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index b8c90f8..42e9f79 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -494,6 +494,9 @@ uint16_t iavf_recv_pkts_vec_avx512_offload(void *rx_queue,
 uint16_t iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue,
struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+   struct rte_mbuf **rx_pkts,
+   uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 struct rte_mbuf **rx_pkts,

[dpdk-dev] [PATCH v2 3/4] net/iavf: add offload path for Rx AVX512

2021-03-17 Thread Wenzhuo Lu
Add a specific path for RX AVX512 (traditional).
In this path, the HW offload features are supported, such as
checksum, VLAN stripping, and RSS hash.
This path is chosen automatically according to the
configuration.

All the code for the above HW offload features is removed
from the legacy path.

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx.c|  83 ++--
 drivers/net/iavf/iavf_rxtx.h|  12 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 722 ++--
 drivers/net/iavf/iavf_rxtx_vec_common.h |  15 +-
 4 files changed, 682 insertions(+), 150 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 4744c35..9b3f8be 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2367,22 +2367,23 @@
 #ifdef RTE_ARCH_X86
struct iavf_rx_queue *rxq;
int i;
+   int check_ret;
+   bool use_sse = false;
bool use_avx2 = false;
-#ifdef CC_AVX512_SUPPORT
bool use_avx512 = false;
-#endif
+   bool use_flex = false;
 
-   if (!iavf_rx_vec_dev_check(dev) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-   for (i = 0; i < dev->data->nb_rx_queues; i++) {
-   rxq = dev->data->rx_queues[i];
-   (void)iavf_rxq_vec_setup(rxq);
+   check_ret = iavf_rx_vec_dev_check(dev);
+   if (check_ret >= 0 &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+   if (check_ret == IAVF_VECTOR_PATH) {
+   use_sse = true;
+   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+		     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+		    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+   use_avx2 = true;
}
 
-   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-   use_avx2 = true;
 #ifdef CC_AVX512_SUPPORT
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2390,13 +2391,28 @@
use_avx512 = true;
 #endif
 
+   if (!use_sse && !use_avx2 && !use_avx512)
+   goto normal;
+
+   if (vf->vf_res->vf_cap_flags &
+   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+   use_flex = true;
+   if (use_avx512 && check_ret == IAVF_VECTOR_OFFLOAD_PATH)
+   use_flex = false;
+   }
+
+   for (i = 0; i < dev->data->nb_rx_queues; i++) {
+   rxq = dev->data->rx_queues[i];
+   (void)iavf_rxq_vec_setup(rxq);
+   }
+
if (dev->data->scattered_rx) {
-   PMD_DRV_LOG(DEBUG,
-   "Using %sVector Scattered Rx (port %d).",
-   use_avx2 ? "avx2 " : "",
-   dev->data->port_id);
-   if (vf->vf_res->vf_cap_flags &
-   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+   if (!use_avx512)
+   PMD_DRV_LOG(DEBUG,
+   "Using %sVector Scattered Rx (port 
%d).",
+   use_avx2 ? "avx2 " : "",
+   dev->data->port_id);
+   if (use_flex) {
			dev->rx_pkt_burst = use_avx2 ?
				iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
iavf_recv_scattered_pkts_vec_flex_rxd;
@@ -2410,17 +2426,22 @@
iavf_recv_scattered_pkts_vec_avx2 :
iavf_recv_scattered_pkts_vec;
 #ifdef CC_AVX512_SUPPORT
-   if (use_avx512)
-   dev->rx_pkt_burst =
-			iavf_recv_scattered_pkts_vec_avx512;
+		if (use_avx512) {
+			if (check_ret == IAVF_VECTOR_PATH)
+				dev->rx_pkt_burst =
+					iavf_recv_scattered_pkts_vec_avx512;
+   

[dpdk-dev] [PATCH v2 2/4] net/iavf: add offload path for Tx AVX512

2021-03-17 Thread Wenzhuo Lu
Add a specific path for TX AVX512.
In this path, the HW offload features are supported, such as
checksum insertion and VLAN insertion.
This path is chosen automatically according to the
configuration.

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx.c|  50 ++
 drivers/net/iavf/iavf_rxtx.h|  13 ++-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 165 
 drivers/net/iavf/iavf_rxtx_vec_common.h |  98 +--
 4 files changed, 301 insertions(+), 25 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index bf1064d..4744c35 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -135,7 +135,7 @@
 static inline bool
 check_tx_vec_allow(struct iavf_tx_queue *txq)
 {
-   if (!(txq->offloads & IAVF_NO_VECTOR_FLAGS) &&
+   if (!(txq->offloads & IAVF_TX_NO_VECTOR_FLAGS) &&
txq->rs_thresh >= IAVF_VPMD_TX_MAX_BURST &&
txq->rs_thresh <= IAVF_VPMD_TX_MAX_FREE_BUF) {
PMD_INIT_LOG(DEBUG, "Vector tx can be enabled on this txq.");
@@ -2473,17 +2473,23 @@
 #ifdef RTE_ARCH_X86
struct iavf_tx_queue *txq;
int i;
+   int check_ret;
+   bool use_sse = false;
bool use_avx2 = false;
-#ifdef CC_AVX512_SUPPORT
bool use_avx512 = false;
-#endif
 
-   if (!iavf_tx_vec_dev_check(dev) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-   use_avx2 = true;
+   check_ret = iavf_tx_vec_dev_check(dev);
+
+   if (check_ret >= 0 &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+		/* SSE and AVX2 do not support the offload path yet. */
+   if (check_ret == IAVF_VECTOR_PATH) {
+   use_sse = true;
+   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+		     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+		    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+   use_avx2 = true;
+   }
 #ifdef CC_AVX512_SUPPORT
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2491,15 +2497,24 @@
use_avx512 = true;
 #endif
 
-   PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
-   use_avx2 ? "avx2 " : "",
-   dev->data->port_id);
-   dev->tx_pkt_burst = use_avx2 ?
-   iavf_xmit_pkts_vec_avx2 :
-   iavf_xmit_pkts_vec;
+   if (!use_sse && !use_avx2 && !use_avx512)
+   goto normal;
+
+   if (!use_avx512) {
+   PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
+   use_avx2 ? "avx2 " : "",
+   dev->data->port_id);
+   dev->tx_pkt_burst = use_avx2 ?
+   iavf_xmit_pkts_vec_avx2 :
+   iavf_xmit_pkts_vec;
+   }
 #ifdef CC_AVX512_SUPPORT
-   if (use_avx512)
-   dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+   if (use_avx512) {
+   if (check_ret == IAVF_VECTOR_PATH)
+   dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+   else
+			dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512_offload;
+   }
 #endif
dev->tx_pkt_prepare = NULL;
 
@@ -2521,6 +2536,7 @@
}
 #endif
 
+normal:
PMD_DRV_LOG(DEBUG, "Using Basic Tx callback (port=%d).",
dev->data->port_id);
dev->tx_pkt_burst = iavf_xmit_pkts;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 06ff528..da39f78 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -23,14 +23,20 @@
 #define IAVF_VPMD_DESCS_PER_LOOP  4
 #define IAVF_VPMD_TX_MAX_FREE_BUF 64
 
-#define IAVF_NO_VECTOR_FLAGS (  \
+#define IAVF_TX_NO_VECTOR_FLAGS (   \
DEV_TX_OFFLOAD_MULTI_SEGS |  \
+   DEV_TX_OFFLOAD_TCP_TSO)
+
+#define IAVF_TX_VECTOR_OFFLOAD (   

[dpdk-dev] [PATCH v2 1/4] net/iavf: store offload flag of Rx queue

2021-03-17 Thread Wenzhuo Lu
Add the offload flag to the RX queue structure so that the
code knows which offload features are set.

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx.c | 4 
 drivers/net/iavf/iavf_rxtx.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 8fafe45..bf1064d 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -498,9 +498,12 @@
uint8_t proto_xtr;
uint16_t len;
uint16_t rx_free_thresh;
+   uint64_t offloads;
 
PMD_INIT_FUNC_TRACE();
 
+   offloads = rx_conf->offloads | dev->data->dev_conf.rxmode.offloads;
+
if (nb_desc % IAVF_ALIGN_RING_DESC != 0 ||
nb_desc > IAVF_MAX_RING_DESC ||
nb_desc < IAVF_MIN_RING_DESC) {
@@ -571,6 +574,7 @@
rxq->rx_deferred_start = rx_conf->rx_deferred_start;
rxq->rx_hdr_len = 0;
rxq->vsi = vsi;
+   rxq->offloads = offloads;
 
if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)
rxq->crc_len = RTE_ETHER_CRC_LEN;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 922ddad..06ff528 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -198,6 +198,7 @@ struct iavf_rx_queue {
/* flexible descriptor metadata extraction offload flag */
iavf_rxd_to_pkt_fields_t rxd_to_pkt_fields;
/* handle flexible descriptor by RXDID */
+   uint64_t offloads;
 };
 
 struct iavf_tx_entry {
-- 
1.9.3



[dpdk-dev] [PATCH v2 0/4] add Rx/Tx offload paths for IAVF AVX512

2021-03-17 Thread Wenzhuo Lu
Add specific paths for RX/TX AVX512, called offload paths.
These paths support the HW offload features, such as
checksum, VLAN, and RSS offload.
They are chosen automatically according to the
configuration.

The code for the above HW offload features, now handled by the
offload paths, is removed from the legacy path.

v2:
 - Fixed compile error.

Wenzhuo Lu (4):
  net/iavf: store offload flag of Rx queue
  net/iavf: add offload path for Tx AVX512
  net/iavf: add offload path for Rx AVX512
  net/iavf: add offload path for Rx AVX512 flex desc

 doc/guides/rel_notes/release_21_05.rst  |7 +
 drivers/net/iavf/iavf_rxtx.c|  158 ++-
 drivers/net/iavf/iavf_rxtx.h|   32 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 1646 ++-
 drivers/net/iavf/iavf_rxtx_vec_common.h |  113 ++-
 5 files changed, 1661 insertions(+), 295 deletions(-)

-- 
1.9.3



[dpdk-dev] [PATCH 4/4] net/iavf: add offload path for Rx AVX512 flex desc

2021-03-16 Thread Wenzhuo Lu
Add a specific path for RX AVX512 (flexible descriptor).
In this path, the HW offload features are supported, such as
checksum, VLAN stripping, and RSS hash.
This path is chosen automatically according to the
configuration.

All the code for the above HW offload features is removed
from the legacy path.

Signed-off-by: Wenzhuo Lu 
---
 doc/guides/rel_notes/release_21_05.rst  |   7 +
 drivers/net/iavf/iavf_rxtx.c|  27 +-
 drivers/net/iavf/iavf_rxtx.h|   6 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 899 +---
 4 files changed, 746 insertions(+), 193 deletions(-)

diff --git a/doc/guides/rel_notes/release_21_05.rst 
b/doc/guides/rel_notes/release_21_05.rst
index 21dc6d2..a7769f8 100644
--- a/doc/guides/rel_notes/release_21_05.rst
+++ b/doc/guides/rel_notes/release_21_05.rst
@@ -83,6 +83,13 @@ New Features
   * Added command to display Rx queue used descriptor count.
 ``show port (port_id) rxq (queue_id) desc used count``
 
+* **Added the offload paths for IAVF AVX512.**
+
+  * Added the new RX and TX paths to use the HW offload features. When the HW
+offload features are configured to be used, the offload paths are chosen
+automatically.
+  * The code of HW offload features is removed from the legacy paths.
+
 
 Removed Items
 -
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 741c3cd..a39c110 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2397,11 +2397,8 @@
goto normal;
 
if (vf->vf_res->vf_cap_flags &
-   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
use_flex = true;
-   if (use_avx512 && check_ret == IAVF_VECTOR_OFFLOAD_PATH)
-   use_flex = false;
-   }
 
for (i = 0; i < dev->data->nb_rx_queues; i++) {
rxq = dev->data->rx_queues[i];
@@ -2419,9 +2416,14 @@
				iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
iavf_recv_scattered_pkts_vec_flex_rxd;
 #ifdef CC_AVX512_SUPPORT
-   if (use_avx512)
-   dev->rx_pkt_burst =
-			iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+		if (use_avx512) {
+			if (check_ret == IAVF_VECTOR_PATH)
+				dev->rx_pkt_burst =
+					iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+			else
+				dev->rx_pkt_burst =
+					iavf_recv_scattered_pkts_vec_avx512_flex_rxd_offload;
+   }
 #endif
} else {
dev->rx_pkt_burst = use_avx2 ?
@@ -2448,9 +2450,14 @@
iavf_recv_pkts_vec_avx2_flex_rxd :
iavf_recv_pkts_vec_flex_rxd;
 #ifdef CC_AVX512_SUPPORT
-   if (use_avx512)
-   dev->rx_pkt_burst =
-			iavf_recv_pkts_vec_avx512_flex_rxd;
+		if (use_avx512) {
+			if (check_ret == IAVF_VECTOR_PATH)
+				dev->rx_pkt_burst =
+					iavf_recv_pkts_vec_avx512_flex_rxd;
+			else
+				dev->rx_pkt_burst =
+					iavf_recv_pkts_vec_avx512_flex_rxd_offload;
+   }
 #endif
} else {
dev->rx_pkt_burst = use_avx2 ?
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index b8c90f8..42e9f79 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -494,6 +494,9 @@ uint16_t iavf_recv_pkts_vec_avx512_offload(void *rx_queue,
 uint16_t iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue,
struct rte_mbuf **rx_pkts,
uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx512_flex_rxd_offload(void *rx_queue,
+   struct rte_mbuf **rx_pkts,
+   uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 struct rte_mbuf **rx_pkts,

[dpdk-dev] [PATCH 3/4] net/iavf: add offload path for Rx AVX512

2021-03-16 Thread Wenzhuo Lu
Add a specific path for RX AVX512 (traditional).
In this path, the HW offload features are supported, such as
checksum, VLAN stripping, and RSS hash.
This path is chosen automatically according to the
configuration.

All the code for the above HW offload features is removed
from the legacy path.

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx.c|  81 ++--
 drivers/net/iavf/iavf_rxtx.h|  12 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 722 ++--
 drivers/net/iavf/iavf_rxtx_vec_common.h |  15 +-
 4 files changed, 682 insertions(+), 148 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 4744c35..741c3cd 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2367,22 +2367,25 @@
 #ifdef RTE_ARCH_X86
struct iavf_rx_queue *rxq;
int i;
+   int check_ret;
+   bool use_sse = false;
bool use_avx2 = false;
 #ifdef CC_AVX512_SUPPORT
bool use_avx512 = false;
 #endif
+   bool use_flex = false;
 
-   if (!iavf_rx_vec_dev_check(dev) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-   for (i = 0; i < dev->data->nb_rx_queues; i++) {
-   rxq = dev->data->rx_queues[i];
-   (void)iavf_rxq_vec_setup(rxq);
+   check_ret = iavf_rx_vec_dev_check(dev);
+   if (check_ret >= 0 &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+   if (check_ret == IAVF_VECTOR_PATH) {
+   use_sse = true;
+   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+		     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+		    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+   use_avx2 = true;
}
 
-   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-   use_avx2 = true;
 #ifdef CC_AVX512_SUPPORT
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2390,13 +2393,28 @@
use_avx512 = true;
 #endif
 
+   if (!use_sse && !use_avx2 && !use_avx512)
+   goto normal;
+
+   if (vf->vf_res->vf_cap_flags &
+   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+   use_flex = true;
+   if (use_avx512 && check_ret == IAVF_VECTOR_OFFLOAD_PATH)
+   use_flex = false;
+   }
+
+   for (i = 0; i < dev->data->nb_rx_queues; i++) {
+   rxq = dev->data->rx_queues[i];
+   (void)iavf_rxq_vec_setup(rxq);
+   }
+
if (dev->data->scattered_rx) {
-   PMD_DRV_LOG(DEBUG,
-   "Using %sVector Scattered Rx (port %d).",
-   use_avx2 ? "avx2 " : "",
-   dev->data->port_id);
-   if (vf->vf_res->vf_cap_flags &
-   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
+   if (!use_avx512)
+   PMD_DRV_LOG(DEBUG,
+   "Using %sVector Scattered Rx (port 
%d).",
+   use_avx2 ? "avx2 " : "",
+   dev->data->port_id);
+   if (use_flex) {
			dev->rx_pkt_burst = use_avx2 ?
				iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
iavf_recv_scattered_pkts_vec_flex_rxd;
@@ -2410,17 +2428,22 @@
iavf_recv_scattered_pkts_vec_avx2 :
iavf_recv_scattered_pkts_vec;
 #ifdef CC_AVX512_SUPPORT
-   if (use_avx512)
-   dev->rx_pkt_burst =
-			iavf_recv_scattered_pkts_vec_avx512;
+		if (use_avx512) {
+			if (check_ret == IAVF_VECTOR_PATH)
+				dev->rx_pkt_burst =
+					iavf_recv_scattered_pkts_vec_avx512;
+   

[dpdk-dev] [PATCH 2/4] net/iavf: add offload path for Tx AVX512

2021-03-16 Thread Wenzhuo Lu
Add a specific path for TX AVX512.
In this path, the HW offload features are supported, such as
checksum insertion and VLAN insertion.
This path is chosen automatically according to the
configuration.

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx.c|  50 ++
 drivers/net/iavf/iavf_rxtx.h|  13 ++-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 165 
 drivers/net/iavf/iavf_rxtx_vec_common.h |  98 +--
 4 files changed, 301 insertions(+), 25 deletions(-)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index bf1064d..4744c35 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -135,7 +135,7 @@
 static inline bool
 check_tx_vec_allow(struct iavf_tx_queue *txq)
 {
-   if (!(txq->offloads & IAVF_NO_VECTOR_FLAGS) &&
+   if (!(txq->offloads & IAVF_TX_NO_VECTOR_FLAGS) &&
txq->rs_thresh >= IAVF_VPMD_TX_MAX_BURST &&
txq->rs_thresh <= IAVF_VPMD_TX_MAX_FREE_BUF) {
PMD_INIT_LOG(DEBUG, "Vector tx can be enabled on this txq.");
@@ -2473,17 +2473,23 @@
 #ifdef RTE_ARCH_X86
struct iavf_tx_queue *txq;
int i;
+   int check_ret;
+   bool use_sse = false;
bool use_avx2 = false;
-#ifdef CC_AVX512_SUPPORT
bool use_avx512 = false;
-#endif
 
-   if (!iavf_tx_vec_dev_check(dev) &&
-   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-   use_avx2 = true;
+   check_ret = iavf_tx_vec_dev_check(dev);
+
+   if (check_ret >= 0 &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+		/* SSE and AVX2 do not support the offload path yet. */
+   if (check_ret == IAVF_VECTOR_PATH) {
+   use_sse = true;
+   if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+		     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+		    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+   use_avx2 = true;
+   }
 #ifdef CC_AVX512_SUPPORT
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2491,15 +2497,24 @@
use_avx512 = true;
 #endif
 
-   PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
-   use_avx2 ? "avx2 " : "",
-   dev->data->port_id);
-   dev->tx_pkt_burst = use_avx2 ?
-   iavf_xmit_pkts_vec_avx2 :
-   iavf_xmit_pkts_vec;
+   if (!use_sse && !use_avx2 && !use_avx512)
+   goto normal;
+
+   if (!use_avx512) {
+   PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
+   use_avx2 ? "avx2 " : "",
+   dev->data->port_id);
+   dev->tx_pkt_burst = use_avx2 ?
+   iavf_xmit_pkts_vec_avx2 :
+   iavf_xmit_pkts_vec;
+   }
 #ifdef CC_AVX512_SUPPORT
-   if (use_avx512)
-   dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+   if (use_avx512) {
+   if (check_ret == IAVF_VECTOR_PATH)
+   dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+   else
+			dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512_offload;
+   }
 #endif
dev->tx_pkt_prepare = NULL;
 
@@ -2521,6 +2536,7 @@
}
 #endif
 
+normal:
PMD_DRV_LOG(DEBUG, "Using Basic Tx callback (port=%d).",
dev->data->port_id);
dev->tx_pkt_burst = iavf_xmit_pkts;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 06ff528..da39f78 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -23,14 +23,20 @@
 #define IAVF_VPMD_DESCS_PER_LOOP  4
 #define IAVF_VPMD_TX_MAX_FREE_BUF 64
 
-#define IAVF_NO_VECTOR_FLAGS (  \
+#define IAVF_TX_NO_VECTOR_FLAGS (   \
DEV_TX_OFFLOAD_MULTI_SEGS |  \
+   DEV_TX_OFFLOAD_TCP_TSO)
+
+#define IAVF_TX_VECTOR_OFFLOAD (   

[dpdk-dev] [PATCH 1/4] net/iavf: store offload flag of Rx queue

2021-03-16 Thread Wenzhuo Lu
Add the offload flag to the RX queue structure so that the
code knows which offload features are set.

Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx.c | 4 
 drivers/net/iavf/iavf_rxtx.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 8fafe45..bf1064d 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -498,9 +498,12 @@
uint8_t proto_xtr;
uint16_t len;
uint16_t rx_free_thresh;
+   uint64_t offloads;
 
PMD_INIT_FUNC_TRACE();
 
+   offloads = rx_conf->offloads | dev->data->dev_conf.rxmode.offloads;
+
if (nb_desc % IAVF_ALIGN_RING_DESC != 0 ||
nb_desc > IAVF_MAX_RING_DESC ||
nb_desc < IAVF_MIN_RING_DESC) {
@@ -571,6 +574,7 @@
rxq->rx_deferred_start = rx_conf->rx_deferred_start;
rxq->rx_hdr_len = 0;
rxq->vsi = vsi;
+   rxq->offloads = offloads;
 
if (dev->data->dev_conf.rxmode.offloads & DEV_RX_OFFLOAD_KEEP_CRC)
rxq->crc_len = RTE_ETHER_CRC_LEN;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 922ddad..06ff528 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -198,6 +198,7 @@ struct iavf_rx_queue {
/* flexible descriptor metadata extraction offload flag */
iavf_rxd_to_pkt_fields_t rxd_to_pkt_fields;
/* handle flexible descriptor by RXDID */
+   uint64_t offloads;
 };
 
 struct iavf_tx_entry {
-- 
1.9.3



[dpdk-dev] [PATCH 0/4] add Rx/Tx offload paths for IAVF AVX512

2021-03-16 Thread Wenzhuo Lu
Add specific paths for RX/TX AVX512, called offload paths.
These paths support the HW offload features, such as
checksum, VLAN, and RSS offload.
They are chosen automatically according to the
configuration.

The code for the above HW offload features, now handled by the
offload paths, is removed from the legacy path.

Wenzhuo Lu (4):
  net/iavf: store offload flag of Rx queue
  net/iavf: add offload path for Tx AVX512
  net/iavf: add offload path for Rx AVX512
  net/iavf: add offload path for Rx AVX512 flex desc

 doc/guides/rel_notes/release_21_05.rst  |7 +
 drivers/net/iavf/iavf_rxtx.c|  156 ++-
 drivers/net/iavf/iavf_rxtx.h|   32 +-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 1646 ++-
 drivers/net/iavf/iavf_rxtx_vec_common.h |  113 ++-
 5 files changed, 1661 insertions(+), 293 deletions(-)

-- 
1.9.3



[dpdk-dev] [PATCH 3/3] net/i40e: fix segment fault in AVX512

2021-03-11 Thread Wenzhuo Lu
Fix segmentation fault when failing to get memory from the pool.

Fixes: e6a6a138919f ("net/i40e: add AVX512 vector path")
Cc: sta...@dpdk.org
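
A hedged sketch of the shape of this fix, using only the public DPDK
mempool API ('rearm_get_bufs' and its arguments are invented for
illustration): the per-lcore default cache can legitimately be NULL, so
the rearm path must test it before dereferencing and fall back to the
plain bulk get.

	#include <rte_mempool.h>
	#include <rte_lcore.h>

	static int
	rearm_get_bufs(struct rte_mempool *mp, void **objs, unsigned int n)
	{
		struct rte_mempool_cache *cache =
			rte_mempool_default_cache(mp, rte_lcore_id());

		if (cache == NULL)
			/* no usable cache: take the regular bulk path */
			return rte_mempool_get_bulk(mp, objs, n);

		/* cache present: get through the cache; the driver's
		 * vectorized copy out of cache->objs is now safe to run */
		return rte_mempool_generic_get(mp, objs, n, cache);
	}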

Reported-by: David Coyle 
Signed-off-by: Wenzhuo Lu 
---
 drivers/net/i40e/i40e_rxtx_vec_avx512.c | 128 
 1 file changed, 128 insertions(+)

diff --git a/drivers/net/i40e/i40e_rxtx_vec_avx512.c 
b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
index 862c916..36521da 100644
--- a/drivers/net/i40e/i40e_rxtx_vec_avx512.c
+++ b/drivers/net/i40e/i40e_rxtx_vec_avx512.c
@@ -32,6 +32,9 @@
 
rxdp = rxq->rx_ring + rxq->rxrearm_start;
 
+   if (!cache)
+   goto normal;
+
/* We need to pull 'n' more MBUFs into the software ring from mempool
 * We inline the mempool function here, so we can vectorize the copy
 * from the cache into the shadow ring.
@@ -132,7 +135,132 @@
 #endif
rxep += 8, rxdp += 8, cache->len -= 8;
}
+   goto done;
+
+normal:
+   /* Pull 'n' more MBUFs into the software ring */
+   if (rte_mempool_get_bulk(rxq->mp,
+(void *)rxep,
+RTE_I40E_RXQ_REARM_THRESH) < 0) {
+   if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >=
+   rxq->nb_rx_desc) {
+   __m128i dma_addr0;
+
+   dma_addr0 = _mm_setzero_si128();
+   for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) {
+   rxep[i].mbuf = &rxq->fake_mbuf;
+   _mm_store_si128((__m128i *)&rxdp[i].read,
+   dma_addr0);
+   }
+   }
+   rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+   RTE_I40E_RXQ_REARM_THRESH;
+   return;
+   }
+
+#ifndef RTE_LIBRTE_I40E_16BYTE_RX_DESC
+   struct rte_mbuf *mb0, *mb1;
+   __m128i dma_addr0, dma_addr1;
+   __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
+   RTE_PKTMBUF_HEADROOM);
+	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
+   for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) {
+   __m128i vaddr0, vaddr1;
+
+   mb0 = rxep[0].mbuf;
+   mb1 = rxep[1].mbuf;
+
+   /* load buf_addr(lo 64bit) and buf_physaddr(hi 64bit) */
+   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+   offsetof(struct rte_mbuf, buf_addr) + 8);
+   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+   /* convert pa to dma_addr hdr/data */
+   dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+   dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
+
+   /* add headroom to pa values */
+   dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+   dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
+
+   /* flush desc with pa dma_addr */
+   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
+   }
+#else
+   struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
+   struct rte_mbuf *mb4, *mb5, *mb6, *mb7;
+   __m512i dma_addr0_3, dma_addr4_7;
+   __m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
+	/* Initialize the mbufs in vector, process 8 mbufs in one loop */
+   for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH;
+   i += 8, rxep += 8, rxdp += 8) {
+   __m128i vaddr0, vaddr1, vaddr2, vaddr3;
+   __m128i vaddr4, vaddr5, vaddr6, vaddr7;
+   __m256i vaddr0_1, vaddr2_3;
+   __m256i vaddr4_5, vaddr6_7;
+   __m512i vaddr0_3, vaddr4_7;
+
+   mb0 = rxep[0].mbuf;
+   mb1 = rxep[1].mbuf;
+   mb2 = rxep[2].mbuf;
+   mb3 = rxep[3].mbuf;
+   mb4 = rxep[4].mbuf;
+   mb5 = rxep[5].mbuf;
+   mb6 = rxep[6].mbuf;
+   mb7 = rxep[7].mbuf;
+
+   /* load buf_addr(lo 64bit) and buf_physaddr(hi 64bit) */
+   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+   offsetof(struct rte_mbuf, buf_addr) + 8);
+   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+   vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
+   vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
+   vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr);
+   vaddr5 = _mm_loadu_si128((__m128i *)&mb5-

[dpdk-dev] [PATCH 2/3] net/ice: fix segment fault in AVX512

2021-03-11 Thread Wenzhuo Lu
Fix segmentation fault when failing to get memory from the pool.

Fixes: 7f85d5ebcfe1 ("net/ice: add AVX512 vector path")
Cc: sta...@dpdk.org

Reported-by: David Coyle 
Signed-off-by: Wenzhuo Lu 
---
 drivers/net/ice/ice_rxtx_vec_avx512.c | 129 ++
 1 file changed, 129 insertions(+)

diff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c 
b/drivers/net/ice/ice_rxtx_vec_avx512.c
index 0e5a676..7c458d5 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx512.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx512.c
@@ -24,6 +24,9 @@
 
rxdp = rxq->rx_ring + rxq->rxrearm_start;
 
+   if (!cache)
+   goto normal;
+
/* We need to pull 'n' more MBUFs into the software ring */
if (cache->len < ICE_RXQ_REARM_THRESH) {
uint32_t req = ICE_RXQ_REARM_THRESH + (cache->size -
@@ -115,6 +118,132 @@
rxep += 8, rxdp += 8, cache->len -= 8;
}
 
+   goto done;
+
+normal:
+   /* Pull 'n' more MBUFs into the software ring */
+   if (rte_mempool_get_bulk(rxq->mp,
+(void *)rxep,
+ICE_RXQ_REARM_THRESH) < 0) {
+   if (rxq->rxrearm_nb + ICE_RXQ_REARM_THRESH >=
+   rxq->nb_rx_desc) {
+   __m128i dma_addr0;
+
+   dma_addr0 = _mm_setzero_si128();
+   for (i = 0; i < ICE_DESCS_PER_LOOP; i++) {
+   rxep[i].mbuf = &rxq->fake_mbuf;
+   _mm_store_si128((__m128i *)&rxdp[i].read,
+   dma_addr0);
+   }
+   }
+   rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+   ICE_RXQ_REARM_THRESH;
+   return;
+   }
+
+#ifndef RTE_LIBRTE_ICE_16BYTE_RX_DESC
+   struct rte_mbuf *mb0, *mb1;
+   __m128i dma_addr0, dma_addr1;
+   __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
+   RTE_PKTMBUF_HEADROOM);
+	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
+   for (i = 0; i < ICE_RXQ_REARM_THRESH; i += 2, rxep += 2) {
+   __m128i vaddr0, vaddr1;
+
+   mb0 = rxep[0].mbuf;
+   mb1 = rxep[1].mbuf;
+
+   /* load buf_addr(lo 64bit) and buf_physaddr(hi 64bit) */
+   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+   offsetof(struct rte_mbuf, buf_addr) + 8);
+   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+   /* convert pa to dma_addr hdr/data */
+   dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+   dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
+
+   /* add headroom to pa values */
+   dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+   dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
+
+   /* flush desc with pa dma_addr */
+   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
+   }
+#else
+   struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
+   struct rte_mbuf *mb4, *mb5, *mb6, *mb7;
+   __m512i dma_addr0_3, dma_addr4_7;
+   __m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
+	/* Initialize the mbufs in vector, process 8 mbufs in one loop */
+   for (i = 0; i < ICE_RXQ_REARM_THRESH;
+   i += 8, rxep += 8, rxdp += 8) {
+   __m128i vaddr0, vaddr1, vaddr2, vaddr3;
+   __m128i vaddr4, vaddr5, vaddr6, vaddr7;
+   __m256i vaddr0_1, vaddr2_3;
+   __m256i vaddr4_5, vaddr6_7;
+   __m512i vaddr0_3, vaddr4_7;
+
+   mb0 = rxep[0].mbuf;
+   mb1 = rxep[1].mbuf;
+   mb2 = rxep[2].mbuf;
+   mb3 = rxep[3].mbuf;
+   mb4 = rxep[4].mbuf;
+   mb5 = rxep[5].mbuf;
+   mb6 = rxep[6].mbuf;
+   mb7 = rxep[7].mbuf;
+
+   /* load buf_addr(lo 64bit) and buf_physaddr(hi 64bit) */
+   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+   offsetof(struct rte_mbuf, buf_addr) + 8);
+   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+   vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
+   vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
+   vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr);
+   vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr);
+   vaddr6 = _mm_loadu_si128((__m128

[dpdk-dev] [PATCH 1/3] net/iavf: fix segment fault in AVX512

2021-03-11 Thread Wenzhuo Lu
Fix segmentation fault when failing to get memory from the pool.

Fixes: 31737f2b66fb ("net/iavf: enable AVX512 for legacy Rx")
Cc: sta...@dpdk.org

Reported-by: David Coyle 
Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 130 
 1 file changed, 130 insertions(+)

diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c 
b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 5cb4c7c..6134520 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -25,6 +25,9 @@
 
rxdp = rxq->rx_ring + rxq->rxrearm_start;
 
+   if (!cache)
+   goto normal;
+
/* We need to pull 'n' more MBUFs into the software ring from mempool
 * We inline the mempool function here, so we can vectorize the copy
 * from the cache into the shadow ring.
@@ -127,6 +130,133 @@
cache->len -= IAVF_DESCS_PER_LOOP_AVX;
}
 
+   goto done;
+
+normal:
+   /* Pull 'n' more MBUFs into the software ring */
+   if (rte_mempool_get_bulk(rxq->mp,
+(void *)rxp,
+IAVF_RXQ_REARM_THRESH) < 0) {
+   if (rxq->rxrearm_nb + IAVF_RXQ_REARM_THRESH >=
+   rxq->nb_rx_desc) {
+   __m128i dma_addr0;
+
+   dma_addr0 = _mm_setzero_si128();
+   for (i = 0; i < IAVF_DESCS_PER_LOOP_AVX; i++) {
+   rxp[i] = &rxq->fake_mbuf;
+   _mm_store_si128((__m128i *)&rxdp[i].read,
+   dma_addr0);
+   }
+   }
+   rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed +=
+   IAVF_RXQ_REARM_THRESH;
+   return;
+   }
+
+#ifndef RTE_LIBRTE_IAVF_16BYTE_RX_DESC
+   struct rte_mbuf *mb0, *mb1;
+   __m128i dma_addr0, dma_addr1;
+   __m128i hdr_room = _mm_set_epi64x(RTE_PKTMBUF_HEADROOM,
+   RTE_PKTMBUF_HEADROOM);
+	/* Initialize the mbufs in vector, process 2 mbufs in one loop */
+   for (i = 0; i < IAVF_RXQ_REARM_THRESH; i += 2, rxp += 2) {
+   __m128i vaddr0, vaddr1;
+
+   mb0 = rxp[0];
+   mb1 = rxp[1];
+
+   /* load buf_addr(lo 64bit) and buf_physaddr(hi 64bit) */
+   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+   offsetof(struct rte_mbuf, buf_addr) + 8);
+   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+
+   /* convert pa to dma_addr hdr/data */
+   dma_addr0 = _mm_unpackhi_epi64(vaddr0, vaddr0);
+   dma_addr1 = _mm_unpackhi_epi64(vaddr1, vaddr1);
+
+   /* add headroom to pa values */
+   dma_addr0 = _mm_add_epi64(dma_addr0, hdr_room);
+   dma_addr1 = _mm_add_epi64(dma_addr1, hdr_room);
+
+   /* flush desc with pa dma_addr */
+   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr0);
+   _mm_store_si128((__m128i *)&rxdp++->read, dma_addr1);
+   }
+#else
+   struct rte_mbuf *mb0, *mb1, *mb2, *mb3;
+   struct rte_mbuf *mb4, *mb5, *mb6, *mb7;
+   __m512i dma_addr0_3, dma_addr4_7;
+   __m512i hdr_room = _mm512_set1_epi64(RTE_PKTMBUF_HEADROOM);
+	/* Initialize the mbufs in vector, process 8 mbufs in one loop */
+   for (i = 0; i < IAVF_RXQ_REARM_THRESH;
+   i += 8, rxp += 8, rxdp += 8) {
+   __m128i vaddr0, vaddr1, vaddr2, vaddr3;
+   __m128i vaddr4, vaddr5, vaddr6, vaddr7;
+   __m256i vaddr0_1, vaddr2_3;
+   __m256i vaddr4_5, vaddr6_7;
+   __m512i vaddr0_3, vaddr4_7;
+
+   mb0 = rxp[0];
+   mb1 = rxp[1];
+   mb2 = rxp[2];
+   mb3 = rxp[3];
+   mb4 = rxp[4];
+   mb5 = rxp[5];
+   mb6 = rxp[6];
+   mb7 = rxp[7];
+
+   /* load buf_addr(lo 64bit) and buf_physaddr(hi 64bit) */
+   RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, buf_iova) !=
+   offsetof(struct rte_mbuf, buf_addr) + 8);
+   vaddr0 = _mm_loadu_si128((__m128i *)&mb0->buf_addr);
+   vaddr1 = _mm_loadu_si128((__m128i *)&mb1->buf_addr);
+   vaddr2 = _mm_loadu_si128((__m128i *)&mb2->buf_addr);
+   vaddr3 = _mm_loadu_si128((__m128i *)&mb3->buf_addr);
+   vaddr4 = _mm_loadu_si128((__m128i *)&mb4->buf_addr);
+   vaddr5 = _mm_loadu_si128((__m128i *)&mb5->buf_addr);
+   vaddr6 = _mm_loadu_si128((__m128i *)&mb6->buf_addr);
+

[dpdk-dev] [PATCH] net/iavf: fix missed pointer check

2020-11-09 Thread Wenzhuo Lu
The return value of rte_mempool_default_cache should be
checked as it can be NULL.

Fixes: 9ab9514c150e ("net/iavf: enable AVX512 for Tx")
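
A minimal sketch of the guard this fix adds on the Tx free side (helper
name and arguments invented for illustration): both the cache pointer and
its fill level are checked before indexing cache->objs; otherwise the
mbufs are returned with the plain bulk put.

	#include <rte_mempool.h>
	#include <rte_lcore.h>

	static void
	tx_free_bufs(struct rte_mempool *mp, void **bufs, unsigned int n)
	{
		struct rte_mempool_cache *cache =
			rte_mempool_default_cache(mp, rte_lcore_id());

		if (cache == NULL || cache->len == 0) {
			/* no per-lcore cache to refill: plain bulk put */
			rte_mempool_put_bulk(mp, bufs, n);
			return;
		}

		/* cache usable: put through the cache */
		rte_mempool_generic_put(mp, bufs, n, cache);
	}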

Reported-by: Konstantin Ananyev 
Signed-off-by: Wenzhuo Lu 
---
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c 
b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 8680734..584d12e 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -1424,7 +1424,12 @@
struct rte_mempool *mp = txep[0].mbuf->pool;
struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
rte_lcore_id());
-   void **cache_objs = &cache->objs[cache->len];
+   void **cache_objs;
+
+   if (!cache || cache->len == 0)
+   goto normal;
+
+   cache_objs = &cache->objs[cache->len];
 
if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
@@ -1462,6 +1467,7 @@
goto done;
}
 
+normal:
m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
if (likely(m)) {
free[0] = m;
-- 
1.9.3



[dpdk-dev] [PATCH] net/ice: fix missed pointer check

2020-11-09 Thread Wenzhuo Lu
The return value of rte_mempool_default_cache should be
checked as it can be NULL.

Fixes: a4e480de268e ("net/ice: optimize Tx by using AVX512")

Reported-by: Konstantin Ananyev 
Signed-off-by: Wenzhuo Lu 
---
 drivers/net/ice/ice_rxtx_vec_avx512.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ice/ice_rxtx_vec_avx512.c 
b/drivers/net/ice/ice_rxtx_vec_avx512.c
index e5e7cc1..af6b324 100644
--- a/drivers/net/ice/ice_rxtx_vec_avx512.c
+++ b/drivers/net/ice/ice_rxtx_vec_avx512.c
@@ -781,9 +781,14 @@
 
if (txq->offloads & DEV_TX_OFFLOAD_MBUF_FAST_FREE && (n & 31) == 0) {
struct rte_mempool *mp = txep[0].mbuf->pool;
+   void **cache_objs;
struct rte_mempool_cache *cache = rte_mempool_default_cache(mp,
rte_lcore_id());
-   void **cache_objs = &cache->objs[cache->len];
+
+   if (!cache || cache->len == 0)
+   goto normal;
+
+   cache_objs = &cache->objs[cache->len];
 
if (n > RTE_MEMPOOL_CACHE_MAX_SIZE) {
rte_mempool_ops_enqueue_bulk(mp, (void *)txep, n);
@@ -821,6 +826,7 @@
goto done;
}
 
+normal:
m = rte_pktmbuf_prefree_seg(txep[0].mbuf);
if (likely(m)) {
free[0] = m;
-- 
1.9.3



[dpdk-dev] [PATCH v7 3/3] net/iavf: enable AVX512 for Tx

2020-10-28 Thread Wenzhuo Lu
To enhance the per-core performance, this patch adds some AVX512
instructions to the data path to handle the Tx descriptors.
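
To give a flavor of what "AVX512 in the Tx path" means here (a simplified
sketch, not the driver's actual descriptor layout or symbols): one 64-byte
ZMM store can write four 16-byte Tx descriptors at once.

	#include <immintrin.h>
	#include <stdint.h>

	/* write four {address, command/length} descriptor pairs with a
	 * single 512-bit store; compile with -mavx512f */
	static inline void
	write4_tx_desc(void *ring, const uint64_t addr[4],
		       const uint64_t cmd[4])
	{
		__m512i desc = _mm512_set_epi64(cmd[3], addr[3],
						cmd[2], addr[2],
						cmd[1], addr[1],
						cmd[0], addr[0]);
		_mm512_storeu_si512(ring, desc);
	}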

Signed-off-by: Wenzhuo Lu 
Signed-off-by: Bruce Richardson 
Signed-off-by: Leyi Rong 
---
 doc/guides/rel_notes/release_20_11.rst  |   3 +
 drivers/net/iavf/iavf_ethdev.c  |   3 +-
 drivers/net/iavf/iavf_rxtx.c|  34 +++-
 drivers/net/iavf/iavf_rxtx.h|   7 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 304 
 5 files changed, 343 insertions(+), 8 deletions(-)

diff --git a/doc/guides/rel_notes/release_20_11.rst 
b/doc/guides/rel_notes/release_20_11.rst
index 89e0959..6d7c59d 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -20,6 +20,9 @@ DPDK Release 20.11
   make doc-guides-html
   xdg-open build/doc/html/guides/rel_notes/release_20_11.html
 
+   * **Added support of vector instructions on IAVF.**
+
+ Added support of AVX512 instructions in IAVF RX and TX path.
 
 New Features
 
diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c
index 6a67990..7e3c26a 100644
--- a/drivers/net/iavf/iavf_ethdev.c
+++ b/drivers/net/iavf/iavf_ethdev.c
@@ -792,7 +792,8 @@ static int iavf_config_rx_queues_irqs(struct rte_eth_dev *dev,
DEV_TX_OFFLOAD_GRE_TNL_TSO |
DEV_TX_OFFLOAD_IPIP_TNL_TSO |
DEV_TX_OFFLOAD_GENEVE_TNL_TSO |
-   DEV_TX_OFFLOAD_MULTI_SEGS;
+   DEV_TX_OFFLOAD_MULTI_SEGS |
+   DEV_TX_OFFLOAD_MBUF_FAST_FREE;
 
dev_info->default_rxconf = (struct rte_eth_rxconf) {
.rx_free_thresh = IAVF_DEFAULT_RX_FREE_THRESH,
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index f51471f..baac5d6 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2417,20 +2417,22 @@
struct iavf_tx_queue *txq;
int i;
bool use_avx2 = false;
+#ifdef CC_AVX512_SUPPORT
+   bool use_avx512 = false;
+#endif
 
if (!iavf_tx_vec_dev_check(dev) &&
rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-   for (i = 0; i < dev->data->nb_tx_queues; i++) {
-   txq = dev->data->tx_queues[i];
-   if (!txq)
-   continue;
-   iavf_txq_vec_setup(txq);
-   }
-
if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
 rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
use_avx2 = true;
+#ifdef CC_AVX512_SUPPORT
+   if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
+   rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
+   use_avx512 = true;
+#endif
 
PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
use_avx2 ? "avx2 " : "",
@@ -2438,8 +2440,26 @@
dev->tx_pkt_burst = use_avx2 ?
iavf_xmit_pkts_vec_avx2 :
iavf_xmit_pkts_vec;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+   dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+#endif
dev->tx_pkt_prepare = NULL;
 
+   for (i = 0; i < dev->data->nb_tx_queues; i++) {
+   txq = dev->data->tx_queues[i];
+   if (!txq)
+   continue;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+   iavf_txq_vec_setup_avx512(txq);
+   else
+   iavf_txq_vec_setup(txq);
+#else
+   iavf_txq_vec_setup(txq);
+#endif
+   }
+
return;
}
 #endif
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 5bf91df..d4b4935 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -203,6 +203,10 @@ struct iavf_tx_entry {
uint16_t last_id;
 };
 
+struct iavf_tx_vec_entry {
+   struct rte_mbuf *mbuf;
+};
+
 /* Structure associated with each TX queue. */
 struct iavf_tx_queue {
const struct rte_memzone *mz;  /* memzone for Tx ring */
@@ -474,6 +478,9 @@ uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
  struct rte_mbuf **rx_pkts,
  uint16_t nb_pkts);
+uint16_t iavf_xmit_pkts_vec_avx512(void *tx_queue, struc

[dpdk-dev] [PATCH v7 2/3] net/iavf: enable AVX512 for flexible Rx

2020-10-28 Thread Wenzhuo Lu
To enhance the per-core performance, this patch adds some AVX512
instructions to the data path to handle the flexible Rx descriptors.
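
The vectorized flow-director helper this patch adds (visible in the diff
below) is easier to follow next to its scalar equivalent; a sketch of what
it computes per descriptor, with the flag values taken from the patch's
RTE_BUILD_BUG_ON checks:

	#include <stdint.h>

	#define FDID_MIS_MAGIC 0xFFFFFFFF
	#define PKT_RX_FDIR    (1ULL << 2)
	#define PKT_RX_FDIR_ID (1ULL << 13)

	/* a matched flow director ID sets both flags; the magic value
	 * means "no match" and sets neither */
	static inline uint64_t
	fdir_flags_scalar(uint32_t flow_id)
	{
		return (flow_id == FDID_MIS_MAGIC) ?
			0 : (PKT_RX_FDIR | PKT_RX_FDIR_ID);
	}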

Signed-off-by: Wenzhuo Lu 
Signed-off-by: Bruce Richardson 
Signed-off-by: Leyi Rong 
---
 drivers/net/iavf/iavf_rxtx.c|  10 +
 drivers/net/iavf/iavf_rxtx.h|   6 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 707 
 3 files changed, 723 insertions(+)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index abadf0a..f51471f 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2343,6 +2343,11 @@
			dev->rx_pkt_burst = use_avx2 ?
				iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
iavf_recv_scattered_pkts_vec_flex_rxd;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+   dev->rx_pkt_burst =
+				iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+#endif
} else {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_scattered_pkts_vec_avx2 :
@@ -2362,6 +2367,11 @@
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_pkts_vec_avx2_flex_rxd :
iavf_recv_pkts_vec_flex_rxd;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+   dev->rx_pkt_burst =
+				iavf_recv_pkts_vec_avx512_flex_rxd;
+#endif
} else {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_pkts_vec_avx2 :
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 3da7189..5bf91df 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -465,9 +465,15 @@ uint16_t iavf_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 int iavf_txq_vec_setup(struct iavf_tx_queue *txq);
 uint16_t iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
   uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue,
+   struct rte_mbuf **rx_pkts,
+   uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 struct rte_mbuf **rx_pkts,
 uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
+ struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
 
 uint8_t iavf_proto_xtr_type_to_rxdid(uint8_t xtr_type);
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c 
b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 959067c..df0f43b 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -614,6 +614,631 @@
return received;
 }
 
+static inline __m256i
+flex_rxd_to_fdir_flags_vec_avx512(const __m256i fdir_id0_7)
+{
+#define FDID_MIS_MAGIC 0xFFFFFFFF
+   RTE_BUILD_BUG_ON(PKT_RX_FDIR != (1 << 2));
+   RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << 13));
+   const __m256i pkt_fdir_bit = _mm256_set1_epi32(PKT_RX_FDIR |
+  PKT_RX_FDIR_ID);
+	/* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
+   const __m256i fdir_mis_mask = _mm256_set1_epi32(FDID_MIS_MAGIC);
+   __m256i fdir_mask = _mm256_cmpeq_epi32(fdir_id0_7,
+  fdir_mis_mask);
+   /* this XOR op results to bit-reverse the fdir_mask */
+   fdir_mask = _mm256_xor_si256(fdir_mask, fdir_mis_mask);
+   const __m256i fdir_flags = _mm256_and_si256(fdir_mask, pkt_fdir_bit);
+
+   return fdir_flags;
+}
+
+static inline uint16_t
+_iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq,
+   struct rte_mbuf **rx_pkts,
+   uint16_t nb_pkts, uint8_t *split_packet)
+{
+   const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+
+   const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
+   rxq->mbuf_initializer);
+   struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
+   volatile union iavf_rx_flex_desc *rxdp =
+   (union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
+
+   rte_prefetch0(rxdp);
+
+   /* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
+   nb_pkts = RTE_ALIGN

[dpdk-dev] [PATCH v7 0/3] enable AVX512 for iavf

2020-10-28 Thread Wenzhuo Lu
AVX512 instructions are supported by more and more platforms. These
instructions can be used in the data path to enhance the per-core performance
of packet processing.
Compared with the existing implementation, this patch set introduces some
AVX512 instructions into the iavf data path, and we get a better per-core
throughput.

v2:
Update meson.build.
Replace the deprecated 'buf_physaddr' with 'buf_iova'.

v3:
Fix compile errors.

v4:
Fix wrong info in comments.
Trivial adjustment of the arrangement.

v5:
Support "max SIMD bitwidth".

v6:
Rework meson build to fix compile issue for AVX512BW.

v7:
rebased on next-net_intel.

Wenzhuo Lu (3):
  net/iavf: enable AVX512 for legacy Rx
  net/iavf: enable AVX512 for flexible Rx
  net/iavf: enable AVX512 for Tx

 doc/guides/rel_notes/release_20_11.rst  |3 +
 drivers/net/iavf/iavf_ethdev.c  |3 +-
 drivers/net/iavf/iavf_rxtx.c|   73 +-
 drivers/net/iavf/iavf_rxtx.h|   18 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 1702 +++
 drivers/net/iavf/meson.build|   20 +
 6 files changed, 1807 insertions(+), 12 deletions(-)
 create mode 100644 drivers/net/iavf/iavf_rxtx_vec_avx512.c

-- 
1.9.3



[dpdk-dev] [PATCH v7 1/3] net/iavf: enable AVX512 for legacy Rx

2020-10-28 Thread Wenzhuo Lu
To enhance the per-core performance, this patch adds some AVX512
instructions to the data path to handle the legacy Rx descriptors.
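
The run-time gate for this path (shown in the diff below) combines CPU
feature flags with the EAL's SIMD bitwidth limit; a standalone sketch of
the same check:

	#include <stdbool.h>
	#include <rte_cpuflags.h>
	#include <rte_vect.h>

	/* AVX512 is used only if the CPU has AVX512F+BW and the user has
	 * not capped the SIMD width below 512 bits */
	static bool
	can_use_avx512(void)
	{
		return rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
		       rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
		       rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512;
	}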

Signed-off-by: Wenzhuo Lu 
Signed-off-by: Bruce Richardson 
Signed-off-by: Leyi Rong 
---
 drivers/net/iavf/iavf_rxtx.c|  29 +-
 drivers/net/iavf/iavf_rxtx.h|   5 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 691 
 drivers/net/iavf/meson.build|  20 +
 4 files changed, 741 insertions(+), 4 deletions(-)
 create mode 100644 drivers/net/iavf/iavf_rxtx_vec_avx512.c

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index d30aaf8..abadf0a 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2311,6 +2311,9 @@
struct iavf_rx_queue *rxq;
int i;
bool use_avx2 = false;
+#ifdef CC_AVX512_SUPPORT
+   bool use_avx512 = false;
+#endif
 
if (!iavf_rx_vec_dev_check(dev) &&
rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
@@ -2323,6 +2326,12 @@
 rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
use_avx2 = true;
+#ifdef CC_AVX512_SUPPORT
+   if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
+   rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
+   use_avx512 = true;
+#endif
 
if (dev->data->scattered_rx) {
PMD_DRV_LOG(DEBUG,
@@ -2330,27 +2339,39 @@
use_avx2 ? "avx2 " : "",
dev->data->port_id);
if (vf->vf_res->vf_cap_flags &
-   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
+   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
				dev->rx_pkt_burst = use_avx2 ?
					iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
iavf_recv_scattered_pkts_vec_flex_rxd;
-   else
+   } else {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_scattered_pkts_vec_avx2 :
iavf_recv_scattered_pkts_vec;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+   dev->rx_pkt_burst =
+				iavf_recv_scattered_pkts_vec_avx512;
+#endif
+   }
} else {
PMD_DRV_LOG(DEBUG, "Using %sVector Rx (port %d).",
use_avx2 ? "avx2 " : "",
dev->data->port_id);
if (vf->vf_res->vf_cap_flags &
-   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
+   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_pkts_vec_avx2_flex_rxd :
iavf_recv_pkts_vec_flex_rxd;
-   else
+   } else {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_pkts_vec_avx2 :
iavf_recv_pkts_vec;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+   dev->rx_pkt_burst =
+   iavf_recv_pkts_vec_avx512;
+#endif
+   }
}
 
return;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 02945b8..3da7189 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -463,6 +463,11 @@ uint16_t iavf_xmit_pkts_vec_avx2(void *tx_queue, struct rte_mbuf **tx_pkts,
 int iavf_tx_vec_dev_check(struct rte_eth_dev *dev);
 int iavf_rxq_vec_setup(struct iavf_rx_queue *rxq);
 int iavf_txq_vec_setup(struct iavf_tx_queue *txq);
+uint16_t iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
+  uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
+struct rte_mbuf **rx_pkts,
+uint16_t nb_pkts);
 
 uint8_t iavf_proto_xtr_type_to_rxdid(uint8_t xtr_type);
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c 
b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
new file mode 100644
index 000..959067c

[dpdk-dev] [PATCH v6 0/3] enable AVX512 for iavf

2020-10-27 Thread Wenzhuo Lu
AVX512 instructions are supported by more and more platforms. These
instructions can be used in the data path to enhance the per-core performance
of packet processing.
Compared with the existing implementation, this patch set introduces some
AVX512 instructions into the iavf data path, and we get a better per-core
throughput.

v2:
Update meson.build.
Replace the deprecated 'buf_physaddr' with 'buf_iova'.

v3:
Fix compile errors.

v4:
Fix wrong info in comments.
Trivial adjustment of the arrangement.

v5:
Support "max SIMD bitwidth".

v6:
Rework meson build to fix compile issue for AVX512BW.

Wenzhuo Lu (3):
  net/iavf: enable AVX512 for legacy RX
  net/iavf: enable AVX512 for flexible RX
  net/iavf: enable AVX512 for TX

 doc/guides/rel_notes/release_20_11.rst  |3 +
 drivers/net/iavf/iavf_ethdev.c  |3 +-
 drivers/net/iavf/iavf_rxtx.c|   73 +-
 drivers/net/iavf/iavf_rxtx.h|   18 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 1702 +++
 drivers/net/iavf/meson.build|   20 +
 6 files changed, 1807 insertions(+), 12 deletions(-)
 create mode 100644 drivers/net/iavf/iavf_rxtx_vec_avx512.c

-- 
1.9.3



[dpdk-dev] [PATCH v6 2/3] net/iavf: enable AVX512 for flexible RX

2020-10-27 Thread Wenzhuo Lu
To enhance the per-core performance, this patch adds some AVX512
instructions to the data path to handle the flexible RX descriptors.

Signed-off-by: Wenzhuo Lu 
Signed-off-by: Bruce Richardson 
Signed-off-by: Leyi Rong 
---
 drivers/net/iavf/iavf_rxtx.c|  10 +
 drivers/net/iavf/iavf_rxtx.h|   6 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 707 
 3 files changed, 723 insertions(+)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 6eedb12..69a4c3e 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2136,6 +2136,11 @@
			dev->rx_pkt_burst = use_avx2 ?
				iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
				iavf_recv_scattered_pkts_vec_flex_rxd;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+				dev->rx_pkt_burst =
+					iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+#endif
} else {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_scattered_pkts_vec_avx2 :
@@ -2155,6 +2160,11 @@
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_pkts_vec_avx2_flex_rxd :
iavf_recv_pkts_vec_flex_rxd;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+				dev->rx_pkt_burst =
+					iavf_recv_pkts_vec_avx512_flex_rxd;
+#endif
} else {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_pkts_vec_avx2 :
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 7c1f05f..03b095d 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -440,9 +440,15 @@ uint16_t iavf_xmit_pkts_vec_avx2(void *tx_queue, struct 
rte_mbuf **tx_pkts,
 int iavf_txq_vec_setup(struct iavf_tx_queue *txq);
 uint16_t iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
   uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue,
+   struct rte_mbuf **rx_pkts,
+   uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 struct rte_mbuf **rx_pkts,
 uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
+ struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
 
 const uint32_t *iavf_get_default_ptype_table(void);
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c 
b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 959067c..df0f43b 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -614,6 +614,631 @@
return received;
 }
 
+static inline __m256i
+flex_rxd_to_fdir_flags_vec_avx512(const __m256i fdir_id0_7)
+{
+#define FDID_MIS_MAGIC 0xFFFFFFFF
+   RTE_BUILD_BUG_ON(PKT_RX_FDIR != (1 << 2));
+   RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << 13));
+   const __m256i pkt_fdir_bit = _mm256_set1_epi32(PKT_RX_FDIR |
+  PKT_RX_FDIR_ID);
+	/* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
+   const __m256i fdir_mis_mask = _mm256_set1_epi32(FDID_MIS_MAGIC);
+   __m256i fdir_mask = _mm256_cmpeq_epi32(fdir_id0_7,
+  fdir_mis_mask);
+   /* this XOR op results to bit-reverse the fdir_mask */
+   fdir_mask = _mm256_xor_si256(fdir_mask, fdir_mis_mask);
+   const __m256i fdir_flags = _mm256_and_si256(fdir_mask, pkt_fdir_bit);
+
+   return fdir_flags;
+}
+
+static inline uint16_t
+_iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq,
+   struct rte_mbuf **rx_pkts,
+   uint16_t nb_pkts, uint8_t *split_packet)
+{
+   const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+
+   const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
+   rxq->mbuf_initializer);
+   struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
+   volatile union iavf_rx_flex_desc *rxdp =
+   (union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
+
+   rte_prefetch0(rxdp);
+
+   /* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
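
A gloss on flex_rxd_to_fdir_flags_vec_avx512() above, since the XOR trick
is easy to misread: _mm256_cmpeq_epi32() produces all-ones lanes exactly
where flow_id equals the mismatch magic, the XOR against the same all-ones
constant inverts that mask, and the final AND keeps the two flag bits only
in the matched lanes. A scalar per-descriptor equivalent (an illustrative
sketch, not part of the patch):

#include <stdint.h>
#include <rte_mbuf.h>	/* PKT_RX_FDIR, PKT_RX_FDIR_ID */

/* Scalar equivalent of the vector FDIR-flag computation. */
static inline uint64_t
fdir_flags_scalar(uint32_t flow_id)
{
	if (flow_id == UINT32_MAX)	/* mismatch magic: no filter hit */
		return 0;
	return PKT_RX_FDIR | PKT_RX_FDIR_ID;
}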

[dpdk-dev] [PATCH v6 1/3] net/iavf: enable AVX512 for legacy RX

2020-10-27 Thread Wenzhuo Lu
To enhance per-core performance, this patch adds AVX512 instructions
to the data path to handle the legacy RX descriptors.

Signed-off-by: Wenzhuo Lu 
Signed-off-by: Bruce Richardson 
Signed-off-by: Leyi Rong 
---
 drivers/net/iavf/iavf_rxtx.c|  29 +-
 drivers/net/iavf/iavf_rxtx.h|   5 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 691 
 drivers/net/iavf/meson.build|  20 +
 4 files changed, 741 insertions(+), 4 deletions(-)
 create mode 100644 drivers/net/iavf/iavf_rxtx_vec_avx512.c

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index edb2dc3..6eedb12 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2104,6 +2104,9 @@
struct iavf_rx_queue *rxq;
int i;
bool use_avx2 = false;
+#ifdef CC_AVX512_SUPPORT
+   bool use_avx512 = false;
+#endif
 
if (!iavf_rx_vec_dev_check(dev) &&
rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
@@ -2116,6 +2119,12 @@
 rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
rte_vect_get_max_simd_bitwidth() >= 
RTE_VECT_SIMD_256)
use_avx2 = true;
+#ifdef CC_AVX512_SUPPORT
+   if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
+   rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
+   use_avx512 = true;
+#endif
 
if (dev->data->scattered_rx) {
PMD_DRV_LOG(DEBUG,
@@ -2123,27 +2132,39 @@
use_avx2 ? "avx2 " : "",
dev->data->port_id);
if (vf->vf_res->vf_cap_flags &
-   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
+   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
			dev->rx_pkt_burst = use_avx2 ?
				iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
				iavf_recv_scattered_pkts_vec_flex_rxd;
-   else
+   } else {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_scattered_pkts_vec_avx2 :
iavf_recv_scattered_pkts_vec;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+				dev->rx_pkt_burst =
+					iavf_recv_scattered_pkts_vec_avx512;
+#endif
+   }
} else {
PMD_DRV_LOG(DEBUG, "Using %sVector Rx (port %d).",
use_avx2 ? "avx2 " : "",
dev->data->port_id);
if (vf->vf_res->vf_cap_flags &
-   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
+   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_pkts_vec_avx2_flex_rxd :
iavf_recv_pkts_vec_flex_rxd;
-   else
+   } else {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_pkts_vec_avx2 :
iavf_recv_pkts_vec;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+   dev->rx_pkt_burst =
+   iavf_recv_pkts_vec_avx512;
+#endif
+   }
}
 
return;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 3d02c65..7c1f05f 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -438,6 +438,11 @@ uint16_t iavf_xmit_pkts_vec_avx2(void *tx_queue, struct 
rte_mbuf **tx_pkts,
 int iavf_tx_vec_dev_check(struct rte_eth_dev *dev);
 int iavf_rxq_vec_setup(struct iavf_rx_queue *rxq);
 int iavf_txq_vec_setup(struct iavf_tx_queue *txq);
+uint16_t iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
+  uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
+struct rte_mbuf **rx_pkts,
+uint16_t nb_pkts);
 
 const uint32_t *iavf_get_default_ptype_table(void);
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c 
b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
new file mode 100644
index 000..959067c
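
A note on the dispatch pattern this series uses throughout: CC_AVX512_SUPPORT
is defined at build time only when the toolchain can emit AVX512, and at
runtime both the CPU flags and the user-configurable SIMD-width cap are
consulted. A condensed sketch of the non-scattered Rx selection as it looks
after the whole series (the burst function names come from the patches; the
wrapper itself is illustrative only):

#include <stdbool.h>
#include <rte_cpuflags.h>
#include <rte_ethdev.h>
#include <rte_vect.h>

#include "iavf_rxtx.h"

static eth_rx_burst_t
select_rx_burst(bool flex_desc)
{
	bool use_avx2 = false;

	if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
	     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
		use_avx2 = true;
#ifdef CC_AVX512_SUPPORT
	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
	    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
		return flex_desc ? iavf_recv_pkts_vec_avx512_flex_rxd :
				   iavf_recv_pkts_vec_avx512;
#endif
	if (flex_desc)
		return use_avx2 ? iavf_recv_pkts_vec_avx2_flex_rxd :
				  iavf_recv_pkts_vec_flex_rxd;
	return use_avx2 ? iavf_recv_pkts_vec_avx2 : iavf_recv_pkts_vec;
}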

[dpdk-dev] [PATCH v6 3/3] net/iavf: enable AVX512 for TX

2020-10-27 Thread Wenzhuo Lu
To enhance per-core performance, this patch adds AVX512 instructions
to the data path to handle the TX descriptors.

Signed-off-by: Wenzhuo Lu 
Signed-off-by: Bruce Richardson 
Signed-off-by: Leyi Rong 
---
 doc/guides/rel_notes/release_20_11.rst  |   3 +
 drivers/net/iavf/iavf_ethdev.c  |   3 +-
 drivers/net/iavf/iavf_rxtx.c|  34 +++-
 drivers/net/iavf/iavf_rxtx.h|   7 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 304 
 5 files changed, 343 insertions(+), 8 deletions(-)

diff --git a/doc/guides/rel_notes/release_20_11.rst 
b/doc/guides/rel_notes/release_20_11.rst
index 0d45b50..c981b64 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -20,6 +20,9 @@ DPDK Release 20.11
   make doc-guides-html
   xdg-open build/doc/html/guides/rel_notes/release_20_11.html
 
+   * **Added support of vector instructions on IAVF.**
+
+ Added support of AVX512 instructions in IAVF RX and TX path.
 
 New Features
 
diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c
index 0ef023c..fe6c8cb 100644
--- a/drivers/net/iavf/iavf_ethdev.c
+++ b/drivers/net/iavf/iavf_ethdev.c
@@ -606,7 +606,8 @@ static int iavf_config_rx_queues_irqs(struct rte_eth_dev 
*dev,
DEV_TX_OFFLOAD_GRE_TNL_TSO |
DEV_TX_OFFLOAD_IPIP_TNL_TSO |
DEV_TX_OFFLOAD_GENEVE_TNL_TSO |
-   DEV_TX_OFFLOAD_MULTI_SEGS;
+   DEV_TX_OFFLOAD_MULTI_SEGS |
+   DEV_TX_OFFLOAD_MBUF_FAST_FREE;
 
dev_info->default_rxconf = (struct rte_eth_rxconf) {
.rx_free_thresh = IAVF_DEFAULT_RX_FREE_THRESH,
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 69a4c3e..582afe7 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2210,20 +2210,22 @@
struct iavf_tx_queue *txq;
int i;
bool use_avx2 = false;
+#ifdef CC_AVX512_SUPPORT
+   bool use_avx512 = false;
+#endif
 
if (!iavf_tx_vec_dev_check(dev) &&
rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-   for (i = 0; i < dev->data->nb_tx_queues; i++) {
-   txq = dev->data->tx_queues[i];
-   if (!txq)
-   continue;
-   iavf_txq_vec_setup(txq);
-   }
-
if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
 rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
rte_vect_get_max_simd_bitwidth() >= 
RTE_VECT_SIMD_256)
use_avx2 = true;
+#ifdef CC_AVX512_SUPPORT
+   if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
+   rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
+   use_avx512 = true;
+#endif
 
PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
use_avx2 ? "avx2 " : "",
@@ -2231,8 +2233,26 @@
dev->tx_pkt_burst = use_avx2 ?
iavf_xmit_pkts_vec_avx2 :
iavf_xmit_pkts_vec;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+   dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+#endif
dev->tx_pkt_prepare = NULL;
 
+   for (i = 0; i < dev->data->nb_tx_queues; i++) {
+   txq = dev->data->tx_queues[i];
+   if (!txq)
+   continue;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+   iavf_txq_vec_setup_avx512(txq);
+   else
+   iavf_txq_vec_setup(txq);
+#else
+   iavf_txq_vec_setup(txq);
+#endif
+   }
+
return;
}
 #endif
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 03b095d..b22ccc4 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -122,6 +122,10 @@ struct iavf_tx_entry {
uint16_t last_id;
 };
 
+struct iavf_tx_vec_entry {
+   struct rte_mbuf *mbuf;
+};
+
 /* Structure associated with each TX queue. */
 struct iavf_tx_queue {
const struct rte_memzone *mz;  /* memzone for Tx ring */
@@ -449,6 +453,9 @@ uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
  struct rte_mbuf **rx_pkts,
  uint16_t nb_pkts);
+uint16_t iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
+				   uint16_t nb_pkts);
+int iavf_txq_vec_setup_avx512(struct iavf_tx_queue *txq);
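
The iavf_ethdev.c hunk above also advertises DEV_TX_OFFLOAD_MBUF_FAST_FREE.
An application that sets this offload guarantees that every transmitted mbuf
is direct, non-segmented, has a reference count of one, and comes from a
single mempool, so Tx completion may skip rte_pktmbuf_prefree_seg() and
return buffers in bulk. A minimal sketch of that fast path (the helper name
is illustrative):

#include <rte_mbuf.h>
#include <rte_mempool.h>

/* Bulk-return completed Tx mbufs; valid only under MBUF_FAST_FREE. */
static inline void
tx_fast_free(struct rte_mbuf **txep, unsigned int n)
{
	rte_mempool_put_bulk(txep[0]->pool, (void **)txep, n);
}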

[dpdk-dev] [PATCH v5 0/3] enable AVX512 for iavf

2020-10-21 Thread Wenzhuo Lu
AVX512 instructions are supported by more and more platforms. These
instructions can be used in the data path to enhance the per-core
performance of packet processing.
Compared with the existing implementation, this patch set introduces some
AVX512 instructions into the iavf data path, and we get better per-core
throughput.

v2:
Update meson.build.
Replace the deprecated 'buf_physaddr' with 'buf_iova'.

v3:
Fix compile errors.

v4:
Fix wrong info in comments.
Trivial adjustment of the arrangement.

v5:
Support "max SIMD bitwidth".

Wenzhuo Lu (3):
  net/iavf: enable AVX512 for legacy RX
  net/iavf: enable AVX512 for flexible RX
  net/iavf: enable AVX512 for TX

 doc/guides/rel_notes/release_20_11.rst  |3 +
 drivers/net/iavf/iavf_ethdev.c  |3 +-
 drivers/net/iavf/iavf_rxtx.c|   71 +-
 drivers/net/iavf/iavf_rxtx.h|   18 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 1702 +++
 drivers/net/iavf/meson.build|   17 +
 6 files changed, 1802 insertions(+), 12 deletions(-)
 create mode 100644 drivers/net/iavf/iavf_rxtx_vec_avx512.c

-- 
1.9.3



[dpdk-dev] [PATCH v5 3/3] net/iavf: enable AVX512 for TX

2020-10-21 Thread Wenzhuo Lu
To enhance per-core performance, this patch adds AVX512 instructions
to the data path to handle the TX descriptors.

Signed-off-by: Wenzhuo Lu 
Signed-off-by: Bruce Richardson 
Signed-off-by: Leyi Rong 
---
 doc/guides/rel_notes/release_20_11.rst  |   3 +
 drivers/net/iavf/iavf_ethdev.c  |   3 +-
 drivers/net/iavf/iavf_rxtx.c|  33 +++-
 drivers/net/iavf/iavf_rxtx.h|   7 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 304 
 5 files changed, 342 insertions(+), 8 deletions(-)

diff --git a/doc/guides/rel_notes/release_20_11.rst 
b/doc/guides/rel_notes/release_20_11.rst
index 0d45b50..c981b64 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -20,6 +20,9 @@ DPDK Release 20.11
   make doc-guides-html
   xdg-open build/doc/html/guides/rel_notes/release_20_11.html
 
+   * **Added support of vector instructions on IAVF.**
+
+ Added support of AVX512 instructions in IAVF RX and TX path.
 
 New Features
 
diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c
index 0ef023c..fe6c8cb 100644
--- a/drivers/net/iavf/iavf_ethdev.c
+++ b/drivers/net/iavf/iavf_ethdev.c
@@ -606,7 +606,8 @@ static int iavf_config_rx_queues_irqs(struct rte_eth_dev 
*dev,
DEV_TX_OFFLOAD_GRE_TNL_TSO |
DEV_TX_OFFLOAD_IPIP_TNL_TSO |
DEV_TX_OFFLOAD_GENEVE_TNL_TSO |
-   DEV_TX_OFFLOAD_MULTI_SEGS;
+   DEV_TX_OFFLOAD_MULTI_SEGS |
+   DEV_TX_OFFLOAD_MBUF_FAST_FREE;
 
dev_info->default_rxconf = (struct rte_eth_rxconf) {
.rx_free_thresh = IAVF_DEFAULT_RX_FREE_THRESH,
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index fbcddd3..a94f646 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2209,20 +2209,21 @@
struct iavf_tx_queue *txq;
int i;
bool use_avx2 = false;
+#ifdef CC_AVX512_SUPPORT
+   bool use_avx512 = false;
+#endif
 
if (!iavf_tx_vec_dev_check(dev) &&
rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-   for (i = 0; i < dev->data->nb_tx_queues; i++) {
-   txq = dev->data->tx_queues[i];
-   if (!txq)
-   continue;
-   iavf_txq_vec_setup(txq);
-   }
-
if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
 rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
rte_vect_get_max_simd_bitwidth() >= 
RTE_VECT_SIMD_256)
use_avx2 = true;
+#ifdef CC_AVX512_SUPPORT
+   if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
+   use_avx512 = true;
+#endif
 
PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
use_avx2 ? "avx2 " : "",
@@ -2230,8 +2231,26 @@
dev->tx_pkt_burst = use_avx2 ?
iavf_xmit_pkts_vec_avx2 :
iavf_xmit_pkts_vec;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+   dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+#endif
dev->tx_pkt_prepare = NULL;
 
+   for (i = 0; i < dev->data->nb_tx_queues; i++) {
+   txq = dev->data->tx_queues[i];
+   if (!txq)
+   continue;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+   iavf_txq_vec_setup_avx512(txq);
+   else
+   iavf_txq_vec_setup(txq);
+#else
+   iavf_txq_vec_setup(txq);
+#endif
+   }
+
return;
}
 #endif
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 03b095d..b22ccc4 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -122,6 +122,10 @@ struct iavf_tx_entry {
uint16_t last_id;
 };
 
+struct iavf_tx_vec_entry {
+   struct rte_mbuf *mbuf;
+};
+
 /* Structure associated with each TX queue. */
 struct iavf_tx_queue {
const struct rte_memzone *mz;  /* memzone for Tx ring */
@@ -449,6 +453,9 @@ uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
  struct rte_mbuf **rx_pkts,
  uint16_t nb_pkts);
+uint16_t iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
+  uint16_t nb_pkts);
+int iavf_txq_vec_setup_avx512(struct iavf_tx_queue *txq);
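
The new struct iavf_tx_vec_entry above deliberately carries only the mbuf
pointer (struct iavf_tx_entry also has next_id/last_id), so the AVX512 Tx
path can treat its software ring as a flat pointer array and fill it with
vector stores. A sketch of that trick, assuming 64-bit pointers (the helper
and struct names here are illustrative):

#include <immintrin.h>
#include <rte_mbuf.h>

struct tx_vec_entry {	/* mirrors the patch's iavf_tx_vec_entry */
	struct rte_mbuf *mbuf;
};

/* Copy 8 mbuf pointers into the SW ring with one 512-bit store. */
static inline void
tx_backlog_entry_avx512(struct tx_vec_entry *txep, struct rte_mbuf **tx_pkts)
{
	const __m512i ptrs = _mm512_loadu_si512((const void *)tx_pkts);

	_mm512_storeu_si512((void *)txep, ptrs);
}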

[dpdk-dev] [PATCH v5 2/3] net/iavf: enable AVX512 for flexible RX

2020-10-21 Thread Wenzhuo Lu
To enhance per-core performance, this patch adds AVX512 instructions
to the data path to handle the flexible RX descriptors.

Signed-off-by: Wenzhuo Lu 
Signed-off-by: Bruce Richardson 
Signed-off-by: Leyi Rong 
---
 drivers/net/iavf/iavf_rxtx.c|  10 +
 drivers/net/iavf/iavf_rxtx.h|   6 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 707 
 3 files changed, 723 insertions(+)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 0067b64..fbcddd3 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2135,6 +2135,11 @@
			dev->rx_pkt_burst = use_avx2 ?
				iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
				iavf_recv_scattered_pkts_vec_flex_rxd;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+				dev->rx_pkt_burst =
+					iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+#endif
} else {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_scattered_pkts_vec_avx2 :
@@ -2154,6 +2159,11 @@
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_pkts_vec_avx2_flex_rxd :
iavf_recv_pkts_vec_flex_rxd;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+				dev->rx_pkt_burst =
+					iavf_recv_pkts_vec_avx512_flex_rxd;
+#endif
} else {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_pkts_vec_avx2 :
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 7c1f05f..03b095d 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -440,9 +440,15 @@ uint16_t iavf_xmit_pkts_vec_avx2(void *tx_queue, struct 
rte_mbuf **tx_pkts,
 int iavf_txq_vec_setup(struct iavf_tx_queue *txq);
 uint16_t iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
   uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue,
+   struct rte_mbuf **rx_pkts,
+   uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 struct rte_mbuf **rx_pkts,
 uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
+ struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
 
 const uint32_t *iavf_get_default_ptype_table(void);
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c 
b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 959067c..df0f43b 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -614,6 +614,631 @@
return received;
 }
 
+static inline __m256i
+flex_rxd_to_fdir_flags_vec_avx512(const __m256i fdir_id0_7)
+{
+#define FDID_MIS_MAGIC 0xFFFFFFFF
+   RTE_BUILD_BUG_ON(PKT_RX_FDIR != (1 << 2));
+   RTE_BUILD_BUG_ON(PKT_RX_FDIR_ID != (1 << 13));
+   const __m256i pkt_fdir_bit = _mm256_set1_epi32(PKT_RX_FDIR |
+  PKT_RX_FDIR_ID);
+	/* desc->flow_id field == 0xFFFFFFFF means fdir mismatch */
+   const __m256i fdir_mis_mask = _mm256_set1_epi32(FDID_MIS_MAGIC);
+   __m256i fdir_mask = _mm256_cmpeq_epi32(fdir_id0_7,
+  fdir_mis_mask);
+   /* this XOR op results to bit-reverse the fdir_mask */
+   fdir_mask = _mm256_xor_si256(fdir_mask, fdir_mis_mask);
+   const __m256i fdir_flags = _mm256_and_si256(fdir_mask, pkt_fdir_bit);
+
+   return fdir_flags;
+}
+
+static inline uint16_t
+_iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq,
+   struct rte_mbuf **rx_pkts,
+   uint16_t nb_pkts, uint8_t *split_packet)
+{
+   const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+
+   const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
+   rxq->mbuf_initializer);
+   struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
+   volatile union iavf_rx_flex_desc *rxdp =
+   (union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
+
+   rte_prefetch0(rxdp);
+
+   /* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
+	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
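
On the RTE_ALIGN_FLOOR() call that ends the fragment above: the vector loop
consumes descriptors in fixed-size groups, so the burst size is rounded down
to a multiple of the loop width and the remainder is left for the next call.
For example, assuming IAVF_DESCS_PER_LOOP_AVX is 8 (as in the AVX2 path):

#include <stdint.h>
#include <rte_common.h>

static inline uint16_t
trim_burst(uint16_t nb_pkts)
{
	/* trim_burst(61) == 56; the 5 leftover descriptors wait for
	 * the caller's next invocation */
	return RTE_ALIGN_FLOOR(nb_pkts, 8);
}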

[dpdk-dev] [PATCH v5 1/3] net/iavf: enable AVX512 for legacy RX

2020-10-21 Thread Wenzhuo Lu
To enhance per-core performance, this patch adds AVX512 instructions
to the data path to handle the legacy RX descriptors.

Signed-off-by: Wenzhuo Lu 
Signed-off-by: Bruce Richardson 
Signed-off-by: Leyi Rong 
---
 drivers/net/iavf/iavf_rxtx.c|  28 +-
 drivers/net/iavf/iavf_rxtx.h|   5 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 691 
 drivers/net/iavf/meson.build|  17 +
 4 files changed, 737 insertions(+), 4 deletions(-)
 create mode 100644 drivers/net/iavf/iavf_rxtx_vec_avx512.c

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index edb2dc3..0067b64 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2104,6 +2104,9 @@
struct iavf_rx_queue *rxq;
int i;
bool use_avx2 = false;
+#ifdef CC_AVX512_SUPPORT
+   bool use_avx512 = false;
+#endif
 
if (!iavf_rx_vec_dev_check(dev) &&
rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
@@ -2116,6 +2119,11 @@
 rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
rte_vect_get_max_simd_bitwidth() >= 
RTE_VECT_SIMD_256)
use_avx2 = true;
+#ifdef CC_AVX512_SUPPORT
+   if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
+   rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_512)
+   use_avx512 = true;
+#endif
 
if (dev->data->scattered_rx) {
PMD_DRV_LOG(DEBUG,
@@ -2123,27 +2131,39 @@
use_avx2 ? "avx2 " : "",
dev->data->port_id);
if (vf->vf_res->vf_cap_flags &
-   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
+   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
			dev->rx_pkt_burst = use_avx2 ?
				iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
				iavf_recv_scattered_pkts_vec_flex_rxd;
-   else
+   } else {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_scattered_pkts_vec_avx2 :
iavf_recv_scattered_pkts_vec;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+				dev->rx_pkt_burst =
+					iavf_recv_scattered_pkts_vec_avx512;
+#endif
+   }
} else {
PMD_DRV_LOG(DEBUG, "Using %sVector Rx (port %d).",
use_avx2 ? "avx2 " : "",
dev->data->port_id);
if (vf->vf_res->vf_cap_flags &
-   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
+   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_pkts_vec_avx2_flex_rxd :
iavf_recv_pkts_vec_flex_rxd;
-   else
+   } else {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_pkts_vec_avx2 :
iavf_recv_pkts_vec;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+   dev->rx_pkt_burst =
+   iavf_recv_pkts_vec_avx512;
+#endif
+   }
}
 
return;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 3d02c65..7c1f05f 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -438,6 +438,11 @@ uint16_t iavf_xmit_pkts_vec_avx2(void *tx_queue, struct 
rte_mbuf **tx_pkts,
 int iavf_tx_vec_dev_check(struct rte_eth_dev *dev);
 int iavf_rxq_vec_setup(struct iavf_rx_queue *rxq);
 int iavf_txq_vec_setup(struct iavf_tx_queue *txq);
+uint16_t iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
+  uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
+struct rte_mbuf **rx_pkts,
+uint16_t nb_pkts);
 
 const uint32_t *iavf_get_default_ptype_table(void);
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c 
b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
new file mode 100644
index 000..959067c
--- /dev/null
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -0,0 +1,691 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */

[dpdk-dev] [PATCH v4 2/3] net/iavf: enable AVX512 for flexible RX

2020-09-26 Thread Wenzhuo Lu
To enhance per-core performance, this patch adds AVX512 instructions
to the data path to handle the flexible RX descriptors.

Signed-off-by: Wenzhuo Lu 
Signed-off-by: Bruce Richardson 
Signed-off-by: Leyi Rong 
---
 drivers/net/iavf/iavf_rxtx.c|  10 +
 drivers/net/iavf/iavf_rxtx.h|   6 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 688 
 3 files changed, 704 insertions(+)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index c36e809..0818107 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2132,6 +2132,11 @@
			dev->rx_pkt_burst = use_avx2 ?
				iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
				iavf_recv_scattered_pkts_vec_flex_rxd;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+				dev->rx_pkt_burst =
+					iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+#endif
} else {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_scattered_pkts_vec_avx2 :
@@ -2151,6 +2156,11 @@
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_pkts_vec_avx2_flex_rxd :
iavf_recv_pkts_vec_flex_rxd;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+				dev->rx_pkt_burst =
+					iavf_recv_pkts_vec_avx512_flex_rxd;
+#endif
} else {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_pkts_vec_avx2 :
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index cb12888..9653e0c 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -439,9 +439,15 @@ uint16_t iavf_xmit_pkts_vec_avx2(void *tx_queue, struct 
rte_mbuf **tx_pkts,
 int iavf_txq_vec_setup(struct iavf_tx_queue *txq);
 uint16_t iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
   uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue,
+   struct rte_mbuf **rx_pkts,
+   uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 struct rte_mbuf **rx_pkts,
 uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
+ struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
 
 const uint32_t *iavf_get_default_ptype_table(void);
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c 
b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index a28c39b..63320e6 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -633,6 +633,612 @@
return fdir_flags;
 }
 
+static inline uint16_t
+_iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq,
+   struct rte_mbuf **rx_pkts,
+   uint16_t nb_pkts, uint8_t *split_packet)
+{
+   const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+
+   const __m256i mbuf_init = _mm256_set_epi64x(0, 0, 0,
+   rxq->mbuf_initializer);
+   struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
+   volatile union iavf_rx_flex_desc *rxdp =
+   (union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
+
+   rte_prefetch0(rxdp);
+
+   /* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
+   nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
+
+   /* See if we need to rearm the RX queue - gives the prefetch a bit
+* of time to act
+*/
+   if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH)
+   iavf_rxq_rearm(rxq);
+
+   /* Before we start moving massive data around, check to see if
+* there is actually a packet available
+*/
+   if (!(rxdp->wb.status_error0 &
+ rte_cpu_to_le_32(1 << IAVF_RX_FLEX_DESC_STATUS0_DD_S)))
+   return 0;
+
+   /* constants used in processing loop */
+   const __m512i crc_adjust =
+   _mm512_set_epi32
+   (/* 1st descriptor */
+0, /* ignore non-length fields */
+-rxq->crc_len, /* sub crc on data_len */
+		 -rxq->crc_len, /* sub crc on pkt_len */
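
The crc_adjust constants being built in the fragment above encode a simple
rule: when CRC stripping is disabled, the hardware length fields still
include the 4-byte Ethernet CRC, so the driver subtracts rxq->crc_len from
both lengths of the first segment. The scalar equivalent (illustrative):

#include <rte_mbuf.h>

/* Scalar meaning of the vectorized CRC length adjustment. */
static inline void
crc_adjust_scalar(struct rte_mbuf *m, uint16_t crc_len)
{
	m->pkt_len -= crc_len;	/* crc_len is 0 when CRC strip is on */
	m->data_len -= crc_len;
}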

[dpdk-dev] [PATCH v4 1/3] net/iavf: enable AVX512 for legacy RX

2020-09-26 Thread Wenzhuo Lu
To enhance per-core performance, this patch adds AVX512 instructions
to the data path to handle the legacy RX descriptors.

Signed-off-by: Wenzhuo Lu 
Signed-off-by: Bruce Richardson 
Signed-off-by: Leyi Rong 
---
 drivers/net/iavf/iavf_rxtx.c|  27 +-
 drivers/net/iavf/iavf_rxtx.h|   5 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 710 
 drivers/net/iavf/meson.build|  17 +
 4 files changed, 755 insertions(+), 4 deletions(-)
 create mode 100644 drivers/net/iavf/iavf_rxtx_vec_avx512.c

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 05a7dd8..c36e809 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2104,6 +2104,9 @@
struct iavf_rx_queue *rxq;
int i;
bool use_avx2 = false;
+#ifdef CC_AVX512_SUPPORT
+   bool use_avx512 = false;
+#endif
 
if (!iavf_rx_vec_dev_check(dev)) {
for (i = 0; i < dev->data->nb_rx_queues; i++) {
@@ -2114,6 +2117,10 @@
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1)
use_avx2 = true;
+#ifdef CC_AVX512_SUPPORT
+   if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1)
+   use_avx512 = true;
+#endif
 
if (dev->data->scattered_rx) {
PMD_DRV_LOG(DEBUG,
@@ -2121,27 +2128,39 @@
use_avx2 ? "avx2 " : "",
dev->data->port_id);
if (vf->vf_res->vf_cap_flags &
-   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
+   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
			dev->rx_pkt_burst = use_avx2 ?
				iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
				iavf_recv_scattered_pkts_vec_flex_rxd;
-   else
+   } else {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_scattered_pkts_vec_avx2 :
iavf_recv_scattered_pkts_vec;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+				dev->rx_pkt_burst =
+					iavf_recv_scattered_pkts_vec_avx512;
+#endif
+   }
} else {
PMD_DRV_LOG(DEBUG, "Using %sVector Rx (port %d).",
use_avx2 ? "avx2 " : "",
dev->data->port_id);
if (vf->vf_res->vf_cap_flags &
-   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
+   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_pkts_vec_avx2_flex_rxd :
iavf_recv_pkts_vec_flex_rxd;
-   else
+   } else {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_pkts_vec_avx2 :
iavf_recv_pkts_vec;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+   dev->rx_pkt_burst =
+   iavf_recv_pkts_vec_avx512;
+#endif
+   }
}
 
return;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 59625a9..cb12888 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -437,6 +437,11 @@ uint16_t iavf_xmit_pkts_vec_avx2(void *tx_queue, struct 
rte_mbuf **tx_pkts,
 int iavf_tx_vec_dev_check(struct rte_eth_dev *dev);
 int iavf_rxq_vec_setup(struct iavf_rx_queue *rxq);
 int iavf_txq_vec_setup(struct iavf_tx_queue *txq);
+uint16_t iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
+  uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
+struct rte_mbuf **rx_pkts,
+uint16_t nb_pkts);
 
 const uint32_t *iavf_get_default_ptype_table(void);
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c 
b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
new file mode 100644
index 000..a28c39b
--- /dev/null
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -0,0 +1,710 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#include "iavf_rxtx_vec_common.h"
+
+#include 
+

[dpdk-dev] [PATCH v4 3/3] net/iavf: enable AVX512 for TX

2020-09-26 Thread Wenzhuo Lu
To enhance per-core performance, this patch adds AVX512 instructions
to the data path to handle the TX descriptors.

Signed-off-by: Wenzhuo Lu 
Signed-off-by: Bruce Richardson 
Signed-off-by: Leyi Rong 
---
 doc/guides/rel_notes/release_20_11.rst  |   3 +
 drivers/net/iavf/iavf_ethdev.c  |   3 +-
 drivers/net/iavf/iavf_rxtx.c|  32 +++-
 drivers/net/iavf/iavf_rxtx.h|   7 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 304 
 5 files changed, 341 insertions(+), 8 deletions(-)

diff --git a/doc/guides/rel_notes/release_20_11.rst 
b/doc/guides/rel_notes/release_20_11.rst
index df227a1..d40b8d6 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -55,6 +55,9 @@ New Features
  Also, make sure to start the actual text at the margin.
  ===
 
+   * **Added support of vector instructions on IAVF.**
+
+ Added support of AVX512 instructions in IAVF RX and TX path.
 
 Removed Items
 -
diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c
index c3aa4cd..5bc2851 100644
--- a/drivers/net/iavf/iavf_ethdev.c
+++ b/drivers/net/iavf/iavf_ethdev.c
@@ -528,7 +528,8 @@ static int iavf_config_rx_queues_irqs(struct rte_eth_dev 
*dev,
DEV_TX_OFFLOAD_GRE_TNL_TSO |
DEV_TX_OFFLOAD_IPIP_TNL_TSO |
DEV_TX_OFFLOAD_GENEVE_TNL_TSO |
-   DEV_TX_OFFLOAD_MULTI_SEGS;
+   DEV_TX_OFFLOAD_MULTI_SEGS |
+   DEV_TX_OFFLOAD_MBUF_FAST_FREE;
 
dev_info->default_rxconf = (struct rte_eth_rxconf) {
.rx_free_thresh = IAVF_DEFAULT_RX_FREE_THRESH,
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 0818107..04dcd48 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2206,18 +2206,18 @@
struct iavf_tx_queue *txq;
int i;
bool use_avx2 = false;
+#ifdef CC_AVX512_SUPPORT
+   bool use_avx512 = false;
+#endif
 
if (!iavf_tx_vec_dev_check(dev)) {
-   for (i = 0; i < dev->data->nb_tx_queues; i++) {
-   txq = dev->data->tx_queues[i];
-   if (!txq)
-   continue;
-   iavf_txq_vec_setup(txq);
-   }
-
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1)
use_avx2 = true;
+#ifdef CC_AVX512_SUPPORT
+   if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1)
+   use_avx512 = true;
+#endif
 
PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
use_avx2 ? "avx2 " : "",
@@ -2225,8 +2225,26 @@
dev->tx_pkt_burst = use_avx2 ?
iavf_xmit_pkts_vec_avx2 :
iavf_xmit_pkts_vec;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+   dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+#endif
dev->tx_pkt_prepare = NULL;
 
+   for (i = 0; i < dev->data->nb_tx_queues; i++) {
+   txq = dev->data->tx_queues[i];
+   if (!txq)
+   continue;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+   iavf_txq_vec_setup_avx512(txq);
+   else
+   iavf_txq_vec_setup(txq);
+#else
+   iavf_txq_vec_setup(txq);
+#endif
+   }
+
return;
}
 #endif
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 9653e0c..08eebb0 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -122,6 +122,10 @@ struct iavf_tx_entry {
uint16_t last_id;
 };
 
+struct iavf_tx_vec_entry {
+   struct rte_mbuf *mbuf;
+};
+
 /* Structure associated with each TX queue. */
 struct iavf_tx_queue {
const struct rte_memzone *mz;  /* memzone for Tx ring */
@@ -448,6 +452,9 @@ uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
  struct rte_mbuf **rx_pkts,
  uint16_t nb_pkts);
+uint16_t iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
+  uint16_t nb_pkts);
+int iavf_txq_vec_setup_avx512(struct iavf_tx_queue *txq);
 
 const uint32_t *iavf_get_default_ptype_table(void);
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c 
b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 63320e6..0de34f0 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c

[dpdk-dev] [PATCH v4 0/3] enable AVX512 for iavf

2020-09-26 Thread Wenzhuo Lu
AVX512 instructions are supported by more and more platforms. These
instructions can be used in the data path to enhance the per-core
performance of packet processing.
Compared with the existing implementation, this patch set introduces some
AVX512 instructions into the iavf data path, and we get better per-core
throughput.

v2:
Update meson.build.
Replace the deprecated 'buf_physaddr' with 'buf_iova'.

v3:
Fix compile errors.

v4:
Fix wrong info in comments.
Trivial adjustment of the arrangement.

Wenzhuo Lu (3):
  net/iavf: enable AVX512 for legacy RX
  net/iavf: enable AVX512 for flexible RX
  net/iavf: enable AVX512 for TX

 doc/guides/rel_notes/release_20_11.rst  |3 +
 drivers/net/iavf/iavf_ethdev.c  |3 +-
 drivers/net/iavf/iavf_rxtx.c|   69 +-
 drivers/net/iavf/iavf_rxtx.h|   18 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 1702 +++
 drivers/net/iavf/meson.build|   17 +
 6 files changed, 1800 insertions(+), 12 deletions(-)
 create mode 100644 drivers/net/iavf/iavf_rxtx_vec_avx512.c

-- 
1.9.3



[dpdk-dev] [PATCH v3 3/3] net/iavf: enable AVX512 for TX

2020-09-21 Thread Wenzhuo Lu
To enhance per-core performance, this patch adds AVX512 instructions
to the data path to handle the TX descriptors.

Signed-off-by: Wenzhuo Lu 
Signed-off-by: Bruce Richardson 
Signed-off-by: Leyi Rong 
---
 doc/guides/rel_notes/release_20_11.rst  |   3 +
 drivers/net/iavf/iavf_ethdev.c  |   3 +-
 drivers/net/iavf/iavf_rxtx.c|  32 +++-
 drivers/net/iavf/iavf_rxtx.h|   7 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 304 
 5 files changed, 341 insertions(+), 8 deletions(-)

diff --git a/doc/guides/rel_notes/release_20_11.rst 
b/doc/guides/rel_notes/release_20_11.rst
index df227a1..d40b8d6 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -55,6 +55,9 @@ New Features
  Also, make sure to start the actual text at the margin.
  ===
 
+   * **Added support of vector instructions on IAVF.**
+
+ Added support of AVX512 instructions in IAVF RX and TX path.
 
 Removed Items
 -
diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c
index c3aa4cd..5bc2851 100644
--- a/drivers/net/iavf/iavf_ethdev.c
+++ b/drivers/net/iavf/iavf_ethdev.c
@@ -528,7 +528,8 @@ static int iavf_config_rx_queues_irqs(struct rte_eth_dev 
*dev,
DEV_TX_OFFLOAD_GRE_TNL_TSO |
DEV_TX_OFFLOAD_IPIP_TNL_TSO |
DEV_TX_OFFLOAD_GENEVE_TNL_TSO |
-   DEV_TX_OFFLOAD_MULTI_SEGS;
+   DEV_TX_OFFLOAD_MULTI_SEGS |
+   DEV_TX_OFFLOAD_MBUF_FAST_FREE;
 
dev_info->default_rxconf = (struct rte_eth_rxconf) {
.rx_free_thresh = IAVF_DEFAULT_RX_FREE_THRESH,
diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 0818107..04dcd48 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2206,18 +2206,18 @@
struct iavf_tx_queue *txq;
int i;
bool use_avx2 = false;
+#ifdef CC_AVX512_SUPPORT
+   bool use_avx512 = false;
+#endif
 
if (!iavf_tx_vec_dev_check(dev)) {
-   for (i = 0; i < dev->data->nb_tx_queues; i++) {
-   txq = dev->data->tx_queues[i];
-   if (!txq)
-   continue;
-   iavf_txq_vec_setup(txq);
-   }
-
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1)
use_avx2 = true;
+#ifdef CC_AVX512_SUPPORT
+   if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1)
+   use_avx512 = true;
+#endif
 
PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
use_avx2 ? "avx2 " : "",
@@ -2225,8 +2225,26 @@
dev->tx_pkt_burst = use_avx2 ?
iavf_xmit_pkts_vec_avx2 :
iavf_xmit_pkts_vec;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+   dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+#endif
dev->tx_pkt_prepare = NULL;
 
+   for (i = 0; i < dev->data->nb_tx_queues; i++) {
+   txq = dev->data->tx_queues[i];
+   if (!txq)
+   continue;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+   iavf_txq_vec_setup_avx512(txq);
+   else
+   iavf_txq_vec_setup(txq);
+#else
+   iavf_txq_vec_setup(txq);
+#endif
+   }
+
return;
}
 #endif
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 9653e0c..08eebb0 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -122,6 +122,10 @@ struct iavf_tx_entry {
uint16_t last_id;
 };
 
+struct iavf_tx_vec_entry {
+   struct rte_mbuf *mbuf;
+};
+
 /* Structure associated with each TX queue. */
 struct iavf_tx_queue {
const struct rte_memzone *mz;  /* memzone for Tx ring */
@@ -448,6 +452,9 @@ uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
  struct rte_mbuf **rx_pkts,
  uint16_t nb_pkts);
+uint16_t iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
+  uint16_t nb_pkts);
+int iavf_txq_vec_setup_avx512(struct iavf_tx_queue *txq);
 
 const uint32_t *iavf_get_default_ptype_table(void);
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c 
b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 2b6c99f..4a33930 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c

[dpdk-dev] [PATCH v3 2/3] net/iavf: enable AVX512 for flexible RX

2020-09-21 Thread Wenzhuo Lu
To enhance per-core performance, this patch adds AVX512 instructions
to the data path to handle the flexible RX descriptors.

Signed-off-by: Wenzhuo Lu 
Signed-off-by: Bruce Richardson 
Signed-off-by: Leyi Rong 
---
 drivers/net/iavf/iavf_rxtx.c|  10 +
 drivers/net/iavf/iavf_rxtx.h|   6 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 687 
 3 files changed, 703 insertions(+)

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index c36e809..0818107 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2132,6 +2132,11 @@
			dev->rx_pkt_burst = use_avx2 ?
				iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
				iavf_recv_scattered_pkts_vec_flex_rxd;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+				dev->rx_pkt_burst =
+					iavf_recv_scattered_pkts_vec_avx512_flex_rxd;
+#endif
} else {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_scattered_pkts_vec_avx2 :
@@ -2151,6 +2156,11 @@
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_pkts_vec_avx2_flex_rxd :
iavf_recv_pkts_vec_flex_rxd;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+				dev->rx_pkt_burst =
+					iavf_recv_pkts_vec_avx512_flex_rxd;
+#endif
} else {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_pkts_vec_avx2 :
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index cb12888..9653e0c 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -439,9 +439,15 @@ uint16_t iavf_xmit_pkts_vec_avx2(void *tx_queue, struct 
rte_mbuf **tx_pkts,
 int iavf_txq_vec_setup(struct iavf_tx_queue *txq);
 uint16_t iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
   uint16_t nb_pkts);
+uint16_t iavf_recv_pkts_vec_avx512_flex_rxd(void *rx_queue,
+   struct rte_mbuf **rx_pkts,
+   uint16_t nb_pkts);
 uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
 struct rte_mbuf **rx_pkts,
 uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
+ struct rte_mbuf **rx_pkts,
+ uint16_t nb_pkts);
 
 const uint32_t *iavf_get_default_ptype_table(void);
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c 
b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 04c2df8..2b6c99f 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -634,6 +634,612 @@
return fdir_flags;
 }
 
+static inline uint16_t
+_iavf_recv_raw_pkts_vec_avx512_flex_rxd(struct iavf_rx_queue *rxq,
+   struct rte_mbuf **rx_pkts,
+   uint16_t nb_pkts, uint8_t *split_packet)
+{
+   const uint32_t *type_table = rxq->vsi->adapter->ptype_tbl;
+
+   const __m256i mbuf_init = _mm256_set_epi64x(0, 0,
+   0, rxq->mbuf_initializer);
+   struct rte_mbuf **sw_ring = &rxq->sw_ring[rxq->rx_tail];
+   volatile union iavf_rx_flex_desc *rxdp =
+   (union iavf_rx_flex_desc *)rxq->rx_ring + rxq->rx_tail;
+
+   rte_prefetch0(rxdp);
+
+   /* nb_pkts has to be floor-aligned to IAVF_DESCS_PER_LOOP_AVX */
+   nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, IAVF_DESCS_PER_LOOP_AVX);
+
+   /* See if we need to rearm the RX queue - gives the prefetch a bit
+* of time to act
+*/
+   if (rxq->rxrearm_nb > IAVF_RXQ_REARM_THRESH)
+   iavf_rxq_rearm(rxq);
+
+   /* Before we start moving massive data around, check to see if
+* there is actually a packet available
+*/
+   if (!(rxdp->wb.status_error0 &
+   rte_cpu_to_le_32(1 << IAVF_RX_FLEX_DESC_STATUS0_DD_S)))
+   return 0;
+
+   /* constants used in processing loop */
+   const __m512i crc_adjust =
+   _mm512_set_epi32
+   (/* 1st descriptor */
+0, /* ignore non-length fields */
+-rxq->crc_len, /* sub crc on data_len */
+		 -rxq->crc_len, /* sub crc on pkt_len */
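
On the iavf_rxq_rearm() call gated by IAVF_RXQ_REARM_THRESH above: the Rx
path defers refilling the descriptor ring until enough slots are free, then
refills them in one bulk mempool allocation. A sketch of the pattern, with
field names taken from the patch and error handling elided (illustrative,
not the patch's implementation):

#include <rte_mbuf.h>
#include <rte_mempool.h>

#include "iavf_rxtx.h"

#define REARM_THRESH 32	/* stand-in for IAVF_RXQ_REARM_THRESH */

static inline void
rxq_rearm_sketch(struct iavf_rx_queue *rxq)
{
	struct rte_mbuf **rxep = &rxq->sw_ring[rxq->rxrearm_start];

	if (rte_mempool_get_bulk(rxq->mp, (void **)rxep, REARM_THRESH) < 0)
		return;	/* allocation failed; retry on a later burst */
	/* ...write the new buffers' IOVAs into the descriptor ring,
	 * advance rxrearm_start, and update the hardware tail... */
	rxq->rxrearm_nb -= REARM_THRESH;
}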

[dpdk-dev] [PATCH v3 1/3] net/iavf: enable AVX512 for legacy RX

2020-09-21 Thread Wenzhuo Lu
To enhance per-core performance, this patch adds AVX512 instructions
to the data path to handle the legacy RX descriptors.

Signed-off-by: Wenzhuo Lu 
Signed-off-by: Bruce Richardson 
Signed-off-by: Leyi Rong 
---
 drivers/net/iavf/iavf_rxtx.c|  27 +-
 drivers/net/iavf/iavf_rxtx.h|   5 +
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 711 
 drivers/net/iavf/meson.build|  17 +
 4 files changed, 756 insertions(+), 4 deletions(-)
 create mode 100644 drivers/net/iavf/iavf_rxtx_vec_avx512.c

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index 05a7dd8..c36e809 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -2104,6 +2104,9 @@
struct iavf_rx_queue *rxq;
int i;
bool use_avx2 = false;
+#ifdef CC_AVX512_SUPPORT
+   bool use_avx512 = false;
+#endif
 
if (!iavf_rx_vec_dev_check(dev)) {
for (i = 0; i < dev->data->nb_rx_queues; i++) {
@@ -2114,6 +2117,10 @@
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1)
use_avx2 = true;
+#ifdef CC_AVX512_SUPPORT
+   if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1)
+   use_avx512 = true;
+#endif
 
if (dev->data->scattered_rx) {
PMD_DRV_LOG(DEBUG,
@@ -2121,27 +2128,39 @@
use_avx2 ? "avx2 " : "",
dev->data->port_id);
if (vf->vf_res->vf_cap_flags &
-   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
+   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
			dev->rx_pkt_burst = use_avx2 ?
				iavf_recv_scattered_pkts_vec_avx2_flex_rxd :
				iavf_recv_scattered_pkts_vec_flex_rxd;
-   else
+   } else {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_scattered_pkts_vec_avx2 :
iavf_recv_scattered_pkts_vec;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+				dev->rx_pkt_burst =
+					iavf_recv_scattered_pkts_vec_avx512;
+#endif
+   }
} else {
PMD_DRV_LOG(DEBUG, "Using %sVector Rx (port %d).",
use_avx2 ? "avx2 " : "",
dev->data->port_id);
if (vf->vf_res->vf_cap_flags &
-   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC)
+   VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC) {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_pkts_vec_avx2_flex_rxd :
iavf_recv_pkts_vec_flex_rxd;
-   else
+   } else {
dev->rx_pkt_burst = use_avx2 ?
iavf_recv_pkts_vec_avx2 :
iavf_recv_pkts_vec;
+#ifdef CC_AVX512_SUPPORT
+   if (use_avx512)
+   dev->rx_pkt_burst =
+   iavf_recv_pkts_vec_avx512;
+#endif
+   }
}
 
return;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index 59625a9..cb12888 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -437,6 +437,11 @@ uint16_t iavf_xmit_pkts_vec_avx2(void *tx_queue, struct 
rte_mbuf **tx_pkts,
 int iavf_tx_vec_dev_check(struct rte_eth_dev *dev);
 int iavf_rxq_vec_setup(struct iavf_rx_queue *rxq);
 int iavf_txq_vec_setup(struct iavf_tx_queue *txq);
+uint16_t iavf_recv_pkts_vec_avx512(void *rx_queue, struct rte_mbuf **rx_pkts,
+  uint16_t nb_pkts);
+uint16_t iavf_recv_scattered_pkts_vec_avx512(void *rx_queue,
+struct rte_mbuf **rx_pkts,
+uint16_t nb_pkts);
 
 const uint32_t *iavf_get_default_ptype_table(void);
 
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c 
b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
new file mode 100644
index 000..04c2df8
--- /dev/null
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -0,0 +1,711 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#include "iavf_rxtx_vec_common.h"
+
+#include 
+
