Improve Tx fastpath performance for the MBUF_NOFF flag when
tx_compl_ena is false and the mbuf has an external buffer.
In that case, instead of freeing each external mbuf individually
before LMTST, external mbufs are collected into a chain and freed
all at once after LMTST.
This not only improves performance but also fixes SQ corruption.

CN10k performance improvement is ~14%.
CN9k performance improvement is ~20%.
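
For reference, a rough sketch of the deferred-free approach described
above (illustrative only; the helper names below are hypothetical, the
actual implementation is cn10k_nix_free_extmbuf()/cn9k_nix_free_extmbuf()
in the diff):

  #include <rte_mbuf.h>

  /* Instead of calling rte_pktmbuf_free_seg() per external mbuf before
   * LMTST, prepend the segment to a software chain.
   */
  static inline void
  defer_extmbuf_free(struct rte_mbuf *m, struct rte_mbuf **chain)
  {
          m->next = *chain;
          *chain = m;
  }

  /* After the post-LMTST rte_io_wmb(), walk the chain and free all
   * deferred segments in one pass.
   */
  static inline void
  flush_extmbuf_chain(struct rte_mbuf *chain)
  {
          struct rte_mbuf *next;

          while (chain != NULL) {
                  next = chain->next;
                  rte_pktmbuf_free_seg(chain);
                  chain = next;
          }
  }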

Fixes: 51a636528515 ("net/cnxk: fix crash during Tx completion")
Cc: sta...@dpdk.org

Signed-off-by: Rahul Bhansali <rbhans...@marvell.com>
---
Changes in v2: updated release_24_03.rst for SW mbuf free optimization

 doc/guides/rel_notes/release_24_03.rst |  1 +
 drivers/event/cnxk/cn10k_tx_worker.h   |  8 ++-
 drivers/event/cnxk/cn9k_worker.h       |  9 ++-
 drivers/net/cnxk/cn10k_tx.h            | 97 ++++++++++++++++++--------
 drivers/net/cnxk/cn9k_tx.h             | 88 +++++++++++++++--------
 5 files changed, 136 insertions(+), 67 deletions(-)

diff --git a/doc/guides/rel_notes/release_24_03.rst b/doc/guides/rel_notes/release_24_03.rst
index 8d440d56a5..39ffef11b0 100644
--- a/doc/guides/rel_notes/release_24_03.rst
+++ b/doc/guides/rel_notes/release_24_03.rst
@@ -111,6 +111,7 @@ New Features
   * Added support for ``RTE_FLOW_ITEM_TYPE_PPPOES`` flow item.
   * Added support for ``RTE_FLOW_ACTION_TYPE_SAMPLE`` flow item.
   * Added support for Rx inject.
+  * Optimized SW external mbuf free for better performance and to avoid SQ corruption.

 * **Updated Marvell OCTEON EP driver.**

diff --git a/drivers/event/cnxk/cn10k_tx_worker.h b/drivers/event/cnxk/cn10k_tx_worker.h
index 53e0dde20c..256237b895 100644
--- a/drivers/event/cnxk/cn10k_tx_worker.h
+++ b/drivers/event/cnxk/cn10k_tx_worker.h
@@ -70,6 +70,7 @@ cn10k_sso_tx_one(struct cn10k_sso_hws *ws, struct rte_mbuf *m, uint64_t *cmd,
                 const uint64_t *txq_data, const uint32_t flags)
 {
        uint8_t lnum = 0, loff = 0, shft = 0;
+       struct rte_mbuf *extm = NULL;
        struct cn10k_eth_txq *txq;
        uintptr_t laddr;
        uint16_t segdw;
@@ -90,7 +91,7 @@ cn10k_sso_tx_one(struct cn10k_sso_hws *ws, struct rte_mbuf *m, uint64_t *cmd,
        if (flags & NIX_TX_OFFLOAD_TSO_F)
                cn10k_nix_xmit_prepare_tso(m, flags);

-       cn10k_nix_xmit_prepare(txq, m, cmd, flags, txq->lso_tun_fmt, &sec,
+       cn10k_nix_xmit_prepare(txq, m, &extm, cmd, flags, txq->lso_tun_fmt, &sec,
                               txq->mark_flag, txq->mark_fmt);

        laddr = lmt_addr;
@@ -105,7 +106,7 @@ cn10k_sso_tx_one(struct cn10k_sso_hws *ws, struct rte_mbuf *m, uint64_t *cmd,
        cn10k_nix_xmit_mv_lmt_base(laddr, cmd, flags);

        if (flags & NIX_TX_MULTI_SEG_F)
-               segdw = cn10k_nix_prepare_mseg(txq, m, (uint64_t *)laddr, flags);
+               segdw = cn10k_nix_prepare_mseg(txq, m, &extm, (uint64_t *)laddr, flags);
        else
                segdw = cn10k_nix_tx_ext_subs(flags) + 2;

@@ -127,6 +128,9 @@ cn10k_sso_tx_one(struct cn10k_sso_hws *ws, struct rte_mbuf *m, uint64_t *cmd,
        /* Memory barrier to make sure lmtst store completes */
        rte_io_wmb();

+       if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && !txq->tx_compl.ena)
+               cn10k_nix_free_extmbuf(extm);
+
        return 1;
 }

diff --git a/drivers/event/cnxk/cn9k_worker.h b/drivers/event/cnxk/cn9k_worker.h
index 0451157812..107265d54b 100644
--- a/drivers/event/cnxk/cn9k_worker.h
+++ b/drivers/event/cnxk/cn9k_worker.h
@@ -746,7 +746,7 @@ static __rte_always_inline uint16_t
 cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
                      uint64_t *txq_data, const uint32_t flags)
 {
-       struct rte_mbuf *m = ev->mbuf;
+       struct rte_mbuf *m = ev->mbuf, *extm = NULL;
        struct cn9k_eth_txq *txq;

        /* Perform header writes before barrier for TSO */
@@ -767,7 +767,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
        if (cn9k_sso_sq_depth(txq) <= 0)
                return 0;
        cn9k_nix_tx_skeleton(txq, cmd, flags, 0);
-       cn9k_nix_xmit_prepare(txq, m, cmd, flags, txq->lso_tun_fmt, txq->mark_flag,
+       cn9k_nix_xmit_prepare(txq, m, &extm, cmd, flags, txq->lso_tun_fmt, txq->mark_flag,
                              txq->mark_fmt);

        if (flags & NIX_TX_OFFLOAD_SECURITY_F) {
@@ -789,7 +789,7 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
        }

        if (flags & NIX_TX_MULTI_SEG_F) {
-               const uint16_t segdw = cn9k_nix_prepare_mseg(txq, m, cmd, flags);
+               const uint16_t segdw = cn9k_nix_prepare_mseg(txq, m, &extm, cmd, flags);
                cn9k_nix_xmit_prepare_tstamp(txq, cmd, m->ol_flags, segdw,
                                             flags);
                if (!CNXK_TT_FROM_EVENT(ev->event)) {
@@ -819,6 +819,9 @@ cn9k_sso_hws_event_tx(uint64_t base, struct rte_event *ev, uint64_t *cmd,
        }

 done:
+       if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && !txq->tx_compl.ena)
+               cn9k_nix_free_extmbuf(extm);
+
        return 1;
 }

diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index 266c899a05..5c4b9e559e 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -733,8 +733,19 @@ cn10k_nix_prep_sec(struct rte_mbuf *m, uint64_t *cmd, uintptr_t *nixtx_addr,
 }
 #endif

+static inline void
+cn10k_nix_free_extmbuf(struct rte_mbuf *m)
+{
+       struct rte_mbuf *m_next;
+       while (m != NULL) {
+               m_next = m->next;
+               rte_pktmbuf_free_seg(m);
+               m = m_next;
+       }
+}
+
 static __rte_always_inline uint64_t
-cn10k_nix_prefree_seg(struct rte_mbuf *m, struct cn10k_eth_txq *txq,
+cn10k_nix_prefree_seg(struct rte_mbuf *m, struct rte_mbuf **extm, struct cn10k_eth_txq *txq,
                      struct nix_send_hdr_s *send_hdr, uint64_t *aura)
 {
        struct rte_mbuf *prev = NULL;
@@ -742,7 +753,8 @@ cn10k_nix_prefree_seg(struct rte_mbuf *m, struct cn10k_eth_txq *txq,

        if (RTE_MBUF_HAS_EXTBUF(m)) {
                if (unlikely(txq->tx_compl.ena == 0)) {
-                       rte_pktmbuf_free_seg(m);
+                       m->next = *extm;
+                       *extm = m;
                        return 1;
                }
                if (send_hdr->w0.pnc) {
@@ -766,7 +778,8 @@ cn10k_nix_prefree_seg(struct rte_mbuf *m, struct cn10k_eth_txq *txq,
 #if defined(RTE_ARCH_ARM64)
 /* Only called for first segments of single segmented mbufs */
 static __rte_always_inline void
-cn10k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct cn10k_eth_txq *txq,
+cn10k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct rte_mbuf **extm,
+                         struct cn10k_eth_txq *txq,
                          uint64x2_t *senddesc01_w0, uint64x2_t *senddesc23_w0,
                          uint64x2_t *senddesc01_w1, uint64x2_t *senddesc23_w1)
 {
@@ -790,7 +803,8 @@ cn10k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct cn10k_eth_txq *txq,
                w1 = vgetq_lane_u64(*senddesc01_w1, 0);
                w1 &= ~0xFFFF000000000000UL;
                if (unlikely(!tx_compl_ena)) {
-                       rte_pktmbuf_free_seg(m0);
+                       m0->next = *extm;
+                       *extm = m0;
                } else {
                        sqe_id = rte_atomic_fetch_add_explicit(&txq->tx_compl.sqe_id, 1,
                                                               rte_memory_order_relaxed);
@@ -820,7 +834,8 @@ cn10k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct cn10k_eth_txq *txq,
                w1 = vgetq_lane_u64(*senddesc01_w1, 1);
                w1 &= ~0xFFFF000000000000UL;
                if (unlikely(!tx_compl_ena)) {
-                       rte_pktmbuf_free_seg(m1);
+                       m1->next = *extm;
+                       *extm = m1;
                } else {
                        sqe_id = rte_atomic_fetch_add_explicit(&txq->tx_compl.sqe_id, 1,
                                                               rte_memory_order_relaxed);
@@ -850,7 +865,8 @@ cn10k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct cn10k_eth_txq *txq,
                w1 = vgetq_lane_u64(*senddesc23_w1, 0);
                w1 &= ~0xFFFF000000000000UL;
                if (unlikely(!tx_compl_ena)) {
-                       rte_pktmbuf_free_seg(m2);
+                       m2->next = *extm;
+                       *extm = m2;
                } else {
                        sqe_id = rte_atomic_fetch_add_explicit(&txq->tx_compl.sqe_id, 1,
                                                               rte_memory_order_relaxed);
@@ -880,7 +896,8 @@ cn10k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct cn10k_eth_txq *txq,
                w1 = vgetq_lane_u64(*senddesc23_w1, 1);
                w1 &= ~0xFFFF000000000000UL;
                if (unlikely(!tx_compl_ena)) {
-                       rte_pktmbuf_free_seg(m3);
+                       m3->next = *extm;
+                       *extm = m3;
                } else {
                        sqe_id = rte_atomic_fetch_add_explicit(&txq->tx_compl.sqe_id, 1,
                                                               rte_memory_order_relaxed);
@@ -962,9 +979,9 @@ cn10k_nix_xmit_prepare_tso(struct rte_mbuf *m, const uint64_t flags)

 static __rte_always_inline void
 cn10k_nix_xmit_prepare(struct cn10k_eth_txq *txq,
-                      struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags,
-                      const uint64_t lso_tun_fmt, bool *sec, uint8_t mark_flag,
-                      uint64_t mark_fmt)
+                      struct rte_mbuf *m, struct rte_mbuf **extm, uint64_t *cmd,
+                      const uint16_t flags, const uint64_t lso_tun_fmt, bool *sec,
+                      uint8_t mark_flag, uint64_t mark_fmt)
 {
        uint8_t mark_off = 0, mark_vlan = 0, markptr = 0;
        struct nix_send_ext_s *send_hdr_ext;
@@ -1164,7 +1181,7 @@ cn10k_nix_xmit_prepare(struct cn10k_eth_txq *txq,
                         * DF bit = 0 otherwise
                         */
                        aura = send_hdr->w0.aura;
-                       send_hdr->w0.df = cn10k_nix_prefree_seg(m, txq, send_hdr, &aura);
+                       send_hdr->w0.df = cn10k_nix_prefree_seg(m, extm, txq, send_hdr, &aura);
                        send_hdr->w0.aura = aura;
                }
 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
@@ -1240,8 +1257,8 @@ cn10k_nix_xmit_prepare_tstamp(struct cn10k_eth_txq *txq, uintptr_t lmt_addr,
 }

 static __rte_always_inline uint16_t
-cn10k_nix_prepare_mseg(struct cn10k_eth_txq *txq,
-                      struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
+cn10k_nix_prepare_mseg(struct cn10k_eth_txq *txq, struct rte_mbuf *m, struct rte_mbuf **extm,
+                      uint64_t *cmd, const uint16_t flags)
 {
        uint64_t prefree = 0, aura0, aura, nb_segs, segdw;
        struct nix_send_hdr_s *send_hdr;
@@ -1284,7 +1301,7 @@ cn10k_nix_prepare_mseg(struct cn10k_eth_txq *txq,
        /* Set invert df if buffer is not to be freed by H/W */
        if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
                aura = send_hdr->w0.aura;
-               prefree = cn10k_nix_prefree_seg(m, txq, send_hdr, &aura);
+               prefree = cn10k_nix_prefree_seg(m, extm, txq, send_hdr, &aura);
                send_hdr->w0.aura = aura;
                l_sg.i1 = prefree;
        }
@@ -1331,7 +1348,7 @@ cn10k_nix_prepare_mseg(struct cn10k_eth_txq *txq,
                cookie = RTE_MBUF_DIRECT(m) ? m : rte_mbuf_from_indirect(m);
                if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
                        aura = roc_npa_aura_handle_to_aura(m->pool->pool_id);
-                       prefree = cn10k_nix_prefree_seg(m, txq, send_hdr, &aura);
+                       prefree = cn10k_nix_prefree_seg(m, extm, txq, send_hdr, &aura);
                        is_sg2 = aura != aura0 && !prefree;
                }

@@ -1425,6 +1442,7 @@ cn10k_nix_xmit_pkts(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts,
        uint8_t lnum, c_lnum, c_shft, c_loff;
        uintptr_t pa, lbase = txq->lmt_base;
        uint16_t lmt_id, burst, left, i;
+       struct rte_mbuf *extm = NULL;
        uintptr_t c_lbase = lbase;
        uint64_t lso_tun_fmt = 0;
        uint64_t mark_fmt = 0;
@@ -1479,7 +1497,7 @@ cn10k_nix_xmit_pkts(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts,
                if (flags & NIX_TX_OFFLOAD_TSO_F)
                        cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);

-               cn10k_nix_xmit_prepare(txq, tx_pkts[i], cmd, flags, lso_tun_fmt,
+               cn10k_nix_xmit_prepare(txq, tx_pkts[i], &extm, cmd, flags, lso_tun_fmt,
                                       &sec, mark_flag, mark_fmt);

                laddr = (uintptr_t)LMT_OFF(lbase, lnum, 0);
@@ -1554,6 +1572,11 @@ cn10k_nix_xmit_pkts(void *tx_queue, uint64_t *ws, struct rte_mbuf **tx_pkts,
        }

        rte_io_wmb();
+       if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && !txq->tx_compl.ena) {
+               cn10k_nix_free_extmbuf(extm);
+               extm = NULL;
+       }
+
        if (left)
                goto again;

@@ -1569,6 +1592,7 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws,
        uintptr_t pa0, pa1, lbase = txq->lmt_base;
        const rte_iova_t io_addr = txq->io_addr;
        uint16_t segdw, lmt_id, burst, left, i;
+       struct rte_mbuf *extm = NULL;
        uint8_t lnum, c_lnum, c_loff;
        uintptr_t c_lbase = lbase;
        uint64_t lso_tun_fmt = 0;
@@ -1630,7 +1654,7 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws,
                if (flags & NIX_TX_OFFLOAD_TSO_F)
                        cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);

-               cn10k_nix_xmit_prepare(txq, tx_pkts[i], cmd, flags, lso_tun_fmt,
+               cn10k_nix_xmit_prepare(txq, tx_pkts[i], &extm, cmd, flags, lso_tun_fmt,
                                       &sec, mark_flag, mark_fmt);

                laddr = (uintptr_t)LMT_OFF(lbase, lnum, 0);
@@ -1644,7 +1668,7 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws,
                /* Move NIX desc to LMT/NIXTX area */
                cn10k_nix_xmit_mv_lmt_base(laddr, cmd, flags);
                /* Store sg list directly on lmt line */
-               segdw = cn10k_nix_prepare_mseg(txq, tx_pkts[i], (uint64_t *)laddr,
+               segdw = cn10k_nix_prepare_mseg(txq, tx_pkts[i], &extm, (uint64_t *)laddr,
                                               flags);
                cn10k_nix_xmit_prepare_tstamp(txq, laddr, tx_pkts[i]->ol_flags,
                                              segdw, flags);
@@ -1717,6 +1741,11 @@ cn10k_nix_xmit_pkts_mseg(void *tx_queue, uint64_t *ws,
        }

        rte_io_wmb();
+       if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && !txq->tx_compl.ena) {
+               cn10k_nix_free_extmbuf(extm);
+               extm = NULL;
+       }
+
        if (left)
                goto again;

@@ -1767,7 +1796,7 @@ cn10k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,

 static __rte_always_inline uint16_t
 cn10k_nix_prepare_mseg_vec_noff(struct cn10k_eth_txq *txq,
-                               struct rte_mbuf *m, uint64_t *cmd,
+                               struct rte_mbuf *m, struct rte_mbuf **extm, uint64_t *cmd,
                                uint64x2_t *cmd0, uint64x2_t *cmd1,
                                uint64x2_t *cmd2, uint64x2_t *cmd3,
                                const uint32_t flags)
@@ -1782,7 +1811,7 @@ cn10k_nix_prepare_mseg_vec_noff(struct cn10k_eth_txq *txq,
                vst1q_u64(cmd + 2, *cmd1); /* sg */
        }

-       segdw = cn10k_nix_prepare_mseg(txq, m, cmd, flags);
+       segdw = cn10k_nix_prepare_mseg(txq, m, extm, cmd, flags);

        if (flags & NIX_TX_OFFLOAD_TSTAMP_F)
                vst1q_u64(cmd + segdw * 2 - 2, *cmd3);
@@ -1892,7 +1921,7 @@ cn10k_nix_prepare_mseg_vec(struct rte_mbuf *m, uint64_t *cmd, uint64x2_t *cmd0,

 static __rte_always_inline uint8_t
 cn10k_nix_prep_lmt_mseg_vector(struct cn10k_eth_txq *txq,
-                              struct rte_mbuf **mbufs, uint64x2_t *cmd0,
+                              struct rte_mbuf **mbufs, struct rte_mbuf **extm, uint64x2_t *cmd0,
                               uint64x2_t *cmd1, uint64x2_t *cmd2,
                               uint64x2_t *cmd3, uint8_t *segdw,
                               uint64_t *lmt_addr, __uint128_t *data128,
@@ -1910,7 +1939,7 @@ cn10k_nix_prep_lmt_mseg_vector(struct cn10k_eth_txq *txq,
                                lmt_addr += 16;
                                off = 0;
                        }
-                       off += cn10k_nix_prepare_mseg_vec_noff(txq, mbufs[j],
+                       off += cn10k_nix_prepare_mseg_vec_noff(txq, mbufs[j], extm,
                                        lmt_addr + off * 2, &cmd0[j], &cmd1[j],
                                        &cmd2[j], &cmd3[j], flags);
                }
@@ -2063,14 +2092,14 @@ cn10k_nix_lmt_next(uint8_t dw, uintptr_t laddr, uint8_t *lnum, uint8_t *loff,

 static __rte_always_inline void
 cn10k_nix_xmit_store(struct cn10k_eth_txq *txq,
-                    struct rte_mbuf *mbuf, uint8_t segdw, uintptr_t laddr,
+                    struct rte_mbuf *mbuf, struct rte_mbuf **extm, uint8_t segdw, uintptr_t laddr,
                     uint64x2_t cmd0, uint64x2_t cmd1, uint64x2_t cmd2,
                     uint64x2_t cmd3, const uint16_t flags)
 {
        uint8_t off;

        if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
-               cn10k_nix_prepare_mseg_vec_noff(txq, mbuf, LMT_OFF(laddr, 0, 0),
+               cn10k_nix_prepare_mseg_vec_noff(txq, mbuf, extm, LMT_OFF(laddr, 0, 0),
                                                &cmd0, &cmd1, &cmd2, &cmd3,
                                                flags);
                return;
@@ -2154,6 +2183,7 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
                __uint128_t data128;
                uint64_t data[2];
        } wd;
+       struct rte_mbuf *extm = NULL;

        if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && txq->tx_compl.ena)
                handle_tx_completion_pkts(txq, flags & NIX_TX_VWQE_F);
@@ -3003,8 +3033,8 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
                    !(flags & NIX_TX_MULTI_SEG_F) &&
                    !(flags & NIX_TX_OFFLOAD_SECURITY_F)) {
                        /* Set don't free bit if reference count > 1 */
-                       cn10k_nix_prefree_seg_vec(tx_pkts, txq, &senddesc01_w0, &senddesc23_w0,
-                                                 &senddesc01_w1, &senddesc23_w1);
+                       cn10k_nix_prefree_seg_vec(tx_pkts, &extm, txq, &senddesc01_w0,
+                                                 &senddesc23_w0, &senddesc01_w1, &senddesc23_w1);
                } else if (!(flags & NIX_TX_MULTI_SEG_F) &&
                           !(flags & NIX_TX_OFFLOAD_SECURITY_F)) {
                        /* Move mbufs to iova */
@@ -3076,7 +3106,7 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
                                                   &shift, &wd.data128, &next);

                        /* Store mbuf0 to LMTLINE/CPT NIXTX area */
-                       cn10k_nix_xmit_store(txq, tx_pkts[0], segdw[0], next,
+                       cn10k_nix_xmit_store(txq, tx_pkts[0], &extm, segdw[0], next,
                                             cmd0[0], cmd1[0], cmd2[0], cmd3[0],
                                             flags);

@@ -3092,7 +3122,7 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
                                                   &shift, &wd.data128, &next);

                        /* Store mbuf1 to LMTLINE/CPT NIXTX area */
-                       cn10k_nix_xmit_store(txq, tx_pkts[1], segdw[1], next,
+                       cn10k_nix_xmit_store(txq, tx_pkts[1], &extm, segdw[1], next,
                                             cmd0[1], cmd1[1], cmd2[1], cmd3[1],
                                             flags);

@@ -3108,7 +3138,7 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
                                                   &shift, &wd.data128, &next);

                        /* Store mbuf2 to LMTLINE/CPT NIXTX area */
-                       cn10k_nix_xmit_store(txq, tx_pkts[2], segdw[2], next,
+                       cn10k_nix_xmit_store(txq, tx_pkts[2], &extm, segdw[2], next,
                                             cmd0[2], cmd1[2], cmd2[2], cmd3[2],
                                             flags);

@@ -3124,7 +3154,7 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
                                                   &shift, &wd.data128, &next);

                        /* Store mbuf3 to LMTLINE/CPT NIXTX area */
-                       cn10k_nix_xmit_store(txq, tx_pkts[3], segdw[3], next,
+                       cn10k_nix_xmit_store(txq, tx_pkts[3], &extm, segdw[3], next,
                                             cmd0[3], cmd1[3], cmd2[3], cmd3[3],
                                             flags);

@@ -3132,7 +3162,7 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
                        uint8_t j;

                        segdw[4] = 8;
-                       j = cn10k_nix_prep_lmt_mseg_vector(txq, tx_pkts, cmd0, cmd1,
+                       j = cn10k_nix_prep_lmt_mseg_vector(txq, tx_pkts, &extm, cmd0, cmd1,
                                                          cmd2, cmd3, segdw,
                                                          (uint64_t *)
                                                          LMT_OFF(laddr, lnum,
@@ -3282,6 +3312,11 @@ cn10k_nix_xmit_pkts_vector(void *tx_queue, uint64_t *ws,
        }

        rte_io_wmb();
+       if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && !txq->tx_compl.ena) {
+               cn10k_nix_free_extmbuf(extm);
+               extm = NULL;
+       }
+
        if (left)
                goto again;

diff --git a/drivers/net/cnxk/cn9k_tx.h b/drivers/net/cnxk/cn9k_tx.h
index 94acbe64fa..018fae2eb7 100644
--- a/drivers/net/cnxk/cn9k_tx.h
+++ b/drivers/net/cnxk/cn9k_tx.h
@@ -82,16 +82,28 @@ cn9k_nix_tx_skeleton(struct cn9k_eth_txq *txq, uint64_t *cmd,
        }
 }

+static __rte_always_inline void
+cn9k_nix_free_extmbuf(struct rte_mbuf *m)
+{
+       struct rte_mbuf *m_next;
+       while (m != NULL) {
+               m_next = m->next;
+               rte_pktmbuf_free_seg(m);
+               m = m_next;
+       }
+}
+
 static __rte_always_inline uint64_t
-cn9k_nix_prefree_seg(struct rte_mbuf *m, struct cn9k_eth_txq *txq, struct nix_send_hdr_s *send_hdr,
-                    uint64_t *aura)
+cn9k_nix_prefree_seg(struct rte_mbuf *m, struct rte_mbuf **extm, struct cn9k_eth_txq *txq,
+                    struct nix_send_hdr_s *send_hdr, uint64_t *aura)
 {
        struct rte_mbuf *prev;
        uint32_t sqe_id;

        if (RTE_MBUF_HAS_EXTBUF(m)) {
                if (unlikely(txq->tx_compl.ena == 0)) {
-                       rte_pktmbuf_free_seg(m);
+                       m->next = *extm;
+                       *extm = m;
                        return 1;
                }
                if (send_hdr->w0.pnc) {
@@ -115,7 +127,7 @@ cn9k_nix_prefree_seg(struct rte_mbuf *m, struct cn9k_eth_txq *txq, struct nix_se
 #if defined(RTE_ARCH_ARM64)
 /* Only called for first segments of single segmented mbufs */
 static __rte_always_inline void
-cn9k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct cn9k_eth_txq *txq,
+cn9k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct rte_mbuf **extm, struct cn9k_eth_txq *txq,
                         uint64x2_t *senddesc01_w0, uint64x2_t *senddesc23_w0,
                         uint64x2_t *senddesc01_w1, uint64x2_t *senddesc23_w1)
 {
@@ -139,7 +151,8 @@ cn9k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct cn9k_eth_txq *txq,
                w1 = vgetq_lane_u64(*senddesc01_w1, 0);
                w1 &= ~0xFFFF000000000000UL;
                if (unlikely(!tx_compl_ena)) {
-                       rte_pktmbuf_free_seg(m0);
+                       m0->next = *extm;
+                       *extm = m0;
                } else {
                        sqe_id = rte_atomic_fetch_add_explicit(&txq->tx_compl.sqe_id, 1,
                                                               rte_memory_order_relaxed);
@@ -169,7 +182,8 @@ cn9k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct cn9k_eth_txq *txq,
                w1 = vgetq_lane_u64(*senddesc01_w1, 1);
                w1 &= ~0xFFFF000000000000UL;
                if (unlikely(!tx_compl_ena)) {
-                       rte_pktmbuf_free_seg(m1);
+                       m1->next = *extm;
+                       *extm = m1;
                } else {
                        sqe_id = rte_atomic_fetch_add_explicit(&txq->tx_compl.sqe_id, 1,
                                                               rte_memory_order_relaxed);
@@ -199,7 +213,8 @@ cn9k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct cn9k_eth_txq *txq,
                w1 = vgetq_lane_u64(*senddesc23_w1, 0);
                w1 &= ~0xFFFF000000000000UL;
                if (unlikely(!tx_compl_ena)) {
-                       rte_pktmbuf_free_seg(m2);
+                       m2->next = *extm;
+                       *extm = m2;
                } else {
                        sqe_id = rte_atomic_fetch_add_explicit(&txq->tx_compl.sqe_id, 1,
                                                               rte_memory_order_relaxed);
@@ -229,7 +244,8 @@ cn9k_nix_prefree_seg_vec(struct rte_mbuf **mbufs, struct cn9k_eth_txq *txq,
                w1 = vgetq_lane_u64(*senddesc23_w1, 1);
                w1 &= ~0xFFFF000000000000UL;
                if (unlikely(!tx_compl_ena)) {
-                       rte_pktmbuf_free_seg(m3);
+                       m3->next = *extm;
+                       *extm = m3;
                } else {
                        sqe_id = rte_atomic_fetch_add_explicit(&txq->tx_compl.sqe_id, 1,
                                                               rte_memory_order_relaxed);
@@ -310,10 +326,9 @@ cn9k_nix_xmit_prepare_tso(struct rte_mbuf *m, const uint64_t flags)
 }

 static __rte_always_inline void
-cn9k_nix_xmit_prepare(struct cn9k_eth_txq *txq,
-                     struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags,
-                     const uint64_t lso_tun_fmt, uint8_t mark_flag,
-                     uint64_t mark_fmt)
+cn9k_nix_xmit_prepare(struct cn9k_eth_txq *txq, struct rte_mbuf *m, struct rte_mbuf **extm,
+                     uint64_t *cmd, const uint16_t flags, const uint64_t lso_tun_fmt,
+                     uint8_t mark_flag, uint64_t mark_fmt)
 {
        uint8_t mark_off = 0, mark_vlan = 0, markptr = 0;
        struct nix_send_ext_s *send_hdr_ext;
@@ -509,7 +524,7 @@ cn9k_nix_xmit_prepare(struct cn9k_eth_txq *txq,
                         * DF bit = 0 otherwise
                         */
                        aura = send_hdr->w0.aura;
-                       send_hdr->w0.df = cn9k_nix_prefree_seg(m, txq, send_hdr, &aura);
+                       send_hdr->w0.df = cn9k_nix_prefree_seg(m, extm, txq, send_hdr, &aura);
                        send_hdr->w0.aura = aura;
                        /* Ensuring mbuf fields which got updated in
                         * cnxk_nix_prefree_seg are written before LMTST.
@@ -600,8 +615,8 @@ cn9k_nix_xmit_submit_lmt_release(const rte_iova_t io_addr)
 }

 static __rte_always_inline uint16_t
-cn9k_nix_prepare_mseg(struct cn9k_eth_txq *txq,
-                     struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
+cn9k_nix_prepare_mseg(struct cn9k_eth_txq *txq, struct rte_mbuf *m, struct rte_mbuf **extm,
+                     uint64_t *cmd, const uint16_t flags)
 {
        struct nix_send_hdr_s *send_hdr;
        uint64_t prefree = 0, aura;
@@ -634,7 +649,7 @@ cn9k_nix_prepare_mseg(struct cn9k_eth_txq *txq,
        /* Set invert df if buffer is not to be freed by H/W */
        if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
                aura = send_hdr->w0.aura;
-               prefree = (cn9k_nix_prefree_seg(m, txq, send_hdr, &aura) << 55);
+               prefree = (cn9k_nix_prefree_seg(m, extm, txq, send_hdr, &aura) << 55);
                send_hdr->w0.aura = aura;
                sg_u |= prefree;
                rte_io_wmb();
@@ -664,7 +679,7 @@ cn9k_nix_prepare_mseg(struct cn9k_eth_txq *txq,
                cookie = RTE_MBUF_DIRECT(m) ? m : rte_mbuf_from_indirect(m);
                /* Set invert df if buffer is not to be freed by H/W */
                if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
-                       sg_u |= (cn9k_nix_prefree_seg(m, txq, send_hdr, NULL) << (i + 55));
+                       sg_u |= (cn9k_nix_prefree_seg(m, extm, txq, send_hdr, NULL) << (i + 55));
                        /* Commit changes to mbuf */
                        rte_io_wmb();
                }
@@ -748,6 +763,7 @@ cn9k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
        const rte_iova_t io_addr = txq->io_addr;
        uint64_t lso_tun_fmt = 0, mark_fmt = 0;
        void *lmt_addr = txq->lmt_addr;
+       struct rte_mbuf *extm = NULL;
        uint8_t mark_flag = 0;
        uint16_t i;

@@ -778,13 +794,16 @@ cn9k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
                rte_io_wmb();

        for (i = 0; i < pkts; i++) {
-               cn9k_nix_xmit_prepare(txq, tx_pkts[i], cmd, flags, lso_tun_fmt,
+               cn9k_nix_xmit_prepare(txq, tx_pkts[i], &extm, cmd, flags, lso_tun_fmt,
                                      mark_flag, mark_fmt);
                cn9k_nix_xmit_prepare_tstamp(txq, cmd, tx_pkts[i]->ol_flags, 4,
                                             flags);
                cn9k_nix_xmit_one(cmd, lmt_addr, io_addr, flags);
        }

+       if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && !txq->tx_compl.ena)
+               cn9k_nix_free_extmbuf(extm);
+
        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

@@ -799,6 +818,7 @@ cn9k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
        const rte_iova_t io_addr = txq->io_addr;
        uint64_t lso_tun_fmt = 0, mark_fmt = 0;
        void *lmt_addr = txq->lmt_addr;
+       struct rte_mbuf *extm = NULL;
        uint8_t mark_flag = 0;
        uint16_t segdw;
        uint64_t i;
@@ -830,14 +850,17 @@ cn9k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
                rte_io_wmb();

        for (i = 0; i < pkts; i++) {
-               cn9k_nix_xmit_prepare(txq, tx_pkts[i], cmd, flags, lso_tun_fmt,
+               cn9k_nix_xmit_prepare(txq, tx_pkts[i], &extm, cmd, flags, lso_tun_fmt,
                                      mark_flag, mark_fmt);
-               segdw = cn9k_nix_prepare_mseg(txq, tx_pkts[i], cmd, flags);
+               segdw = cn9k_nix_prepare_mseg(txq, tx_pkts[i], &extm, cmd, flags);
                cn9k_nix_xmit_prepare_tstamp(txq, cmd, tx_pkts[i]->ol_flags,
                                             segdw, flags);
                cn9k_nix_xmit_mseg_one(cmd, lmt_addr, io_addr, segdw);
        }

+       if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && !txq->tx_compl.ena)
+               cn9k_nix_free_extmbuf(extm);
+
        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

@@ -885,7 +908,7 @@ cn9k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,

 static __rte_always_inline uint8_t
 cn9k_nix_prepare_mseg_vec_list(struct cn9k_eth_txq *txq,
-                              struct rte_mbuf *m, uint64_t *cmd,
+                              struct rte_mbuf *m, struct rte_mbuf **extm, uint64_t *cmd,
                               struct nix_send_hdr_s *send_hdr,
                               union nix_send_sg_s *sg, const uint32_t flags)
 {
@@ -910,7 +933,7 @@ cn9k_nix_prepare_mseg_vec_list(struct cn9k_eth_txq *txq,
        cookie = RTE_MBUF_DIRECT(m) ? m : rte_mbuf_from_indirect(m);
        if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
                aura = send_hdr->w0.aura;
-               sg_u |= (cn9k_nix_prefree_seg(m, txq, send_hdr, &aura) << 55);
+               sg_u |= (cn9k_nix_prefree_seg(m, extm, txq, send_hdr, &aura) << 55);
                send_hdr->w0.aura = aura;
        }
        /* Mark mempool object as "put" since it is freed by NIX */
@@ -935,7 +958,7 @@ cn9k_nix_prepare_mseg_vec_list(struct cn9k_eth_txq *txq,
                cookie = RTE_MBUF_DIRECT(m) ? m : rte_mbuf_from_indirect(m);
                /* Set invert df if buffer is not to be freed by H/W */
                if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
-                       sg_u |= (cn9k_nix_prefree_seg(m, txq, send_hdr, &aura) << (i + 55));
+                       sg_u |= (cn9k_nix_prefree_seg(m, extm, txq, send_hdr, &aura) << (i + 55));
                        /* Mark mempool object as "put" since it is freed by NIX
                         */
 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
@@ -981,9 +1004,8 @@ cn9k_nix_prepare_mseg_vec_list(struct cn9k_eth_txq *txq,
 }

 static __rte_always_inline uint8_t
-cn9k_nix_prepare_mseg_vec(struct cn9k_eth_txq *txq,
-                         struct rte_mbuf *m, uint64_t *cmd, uint64x2_t *cmd0,
-                         uint64x2_t *cmd1, const uint32_t flags)
+cn9k_nix_prepare_mseg_vec(struct cn9k_eth_txq *txq, struct rte_mbuf *m, struct rte_mbuf **extm,
+                         uint64_t *cmd, uint64x2_t *cmd0, uint64x2_t *cmd1, const uint32_t flags)
 {
        struct nix_send_hdr_s send_hdr;
        struct rte_mbuf *cookie;
@@ -998,7 +1020,7 @@ cn9k_nix_prepare_mseg_vec(struct cn9k_eth_txq *txq,
                        send_hdr.w1.u = vgetq_lane_u64(cmd0[0], 1);
                        sg.u = vgetq_lane_u64(cmd1[0], 0);
                        aura = send_hdr.w0.aura;
-                       sg.u |= (cn9k_nix_prefree_seg(m, txq, &send_hdr, &aura) << 55);
+                       sg.u |= (cn9k_nix_prefree_seg(m, extm, txq, &send_hdr, &aura) << 55);
                        send_hdr.w0.aura = aura;
                        cmd1[0] = vsetq_lane_u64(sg.u, cmd1[0], 0);
                        cmd0[0] = vsetq_lane_u64(send_hdr.w0.u, cmd0[0], 0);
@@ -1021,7 +1043,7 @@ cn9k_nix_prepare_mseg_vec(struct cn9k_eth_txq *txq,
        send_hdr.w1.u = vgetq_lane_u64(cmd0[0], 1);
        sg.u = vgetq_lane_u64(cmd1[0], 0);

-       ret = cn9k_nix_prepare_mseg_vec_list(txq, m, cmd, &send_hdr, &sg, flags);
+       ret = cn9k_nix_prepare_mseg_vec_list(txq, m, extm, cmd, &send_hdr, &sg, flags);

        cmd0[0] = vsetq_lane_u64(send_hdr.w0.u, cmd0[0], 0);
        cmd0[0] = vsetq_lane_u64(send_hdr.w1.u, cmd0[0], 1);
@@ -1168,6 +1190,7 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
        uint64_t *lmt_addr = txq->lmt_addr;
        rte_iova_t io_addr = txq->io_addr;
        uint64x2_t ltypes01, ltypes23;
+       struct rte_mbuf *extm = NULL;
        uint64x2_t xtmp128, ytmp128;
        uint64x2_t xmask01, xmask23;
        uint64_t lmt_status, i;
@@ -1933,8 +1956,8 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
                if ((flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) &&
                    !(flags & NIX_TX_MULTI_SEG_F)) {
                        /* Set don't free bit if reference count > 1 */
-                       cn9k_nix_prefree_seg_vec(tx_pkts, txq, &senddesc01_w0, &senddesc23_w0,
-                                                &senddesc01_w1, &senddesc23_w1);
+                       cn9k_nix_prefree_seg_vec(tx_pkts, &extm, txq, &senddesc01_w0,
+                                                &senddesc23_w0, &senddesc01_w1, &senddesc23_w1);
                        /* Ensuring mbuf fields which got updated in
                         * cnxk_nix_prefree_seg are written before LMTST.
                         */
@@ -1995,7 +2018,7 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
                        /* Build mseg list for each packet individually. */
                        for (j = 0; j < NIX_DESCS_PER_LOOP; j++)
                                segdw[j] = cn9k_nix_prepare_mseg_vec(txq,
-                                                       tx_pkts[j],
+                                                       tx_pkts[j], &extm,
                                                        seg_list[j], &cmd0[j],
                                                        &cmd1[j], flags);
                        segdw[4] = 8;
@@ -2070,6 +2093,9 @@ cn9k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
                tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
        }

+       if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F && !txq->tx_compl.ena)
+               cn9k_nix_free_extmbuf(extm);
+
        if (unlikely(pkts_left)) {
                if (flags & NIX_TX_MULTI_SEG_F)
                        pkts += cn9k_nix_xmit_pkts_mseg(tx_queue, tx_pkts,
--
2.25.1
