-----Original Message-----
From: Richardson, Bruce <bruce.richard...@intel.com>
Sent: 09 June 2025 20:09
To: Hore, Soumyadeep <soumyadeep.h...@intel.com>
Cc: dev@dpdk.org; Singh, Aman Deep <aman.deep.si...@intel.com>; Subbarao, Manoj
Kumar <manoj.kumar.subba...@intel.com>
Subject: Re: [PATCH v3 3/6] net/intel: add TxPP Support for E830
On Sun, Jun 08, 2025 at 11:32:20AM +0000, Soumyadeep Hore wrote:
> Add support for Tx Time based queues. This is used to schedule packets
> based on Tx timestamp.
>
> Signed-off-by: Soumyadeep Hore <soumyadeep.h...@intel.com>
Hi,
More review comments inline below.
> ---
> drivers/net/intel/common/tx.h | 14 ++
> drivers/net/intel/ice/base/ice_lan_tx_rx.h | 4 +
> drivers/net/intel/ice/ice_ethdev.c | 3 +-
> drivers/net/intel/ice/ice_ethdev.h | 12 ++
> drivers/net/intel/ice/ice_rxtx.c | 232 ++++++++++++++++++++-
> drivers/net/intel/ice/ice_rxtx.h | 9 +
> 6 files changed, 265 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/net/intel/common/tx.h
> b/drivers/net/intel/common/tx.h index b0a68bae44..8b958bf8e5 100644
> --- a/drivers/net/intel/common/tx.h
> +++ b/drivers/net/intel/common/tx.h
> @@ -30,6 +30,19 @@ struct ci_tx_entry_vec {
>
> typedef void (*ice_tx_release_mbufs_t)(struct ci_tx_queue *txq);
>
> +/**
> + * Structure associated with Tx Time based queue */ struct
> +ice_txtime {
> + volatile struct ice_ts_desc *ice_ts_ring; /* Tx time ring virtual
> address */
> + uint16_t nb_ts_desc; /* number of Tx Time descriptors */
> + uint16_t ts_tail; /* current value of tail register */
> + rte_iova_t ts_ring_dma; /* TX time ring DMA address */
> + const struct rte_memzone *ts_mz;
> + int ts_offset; /* dynamic mbuf Tx timestamp field offset */
> + uint64_t ts_flag; /* dynamic mbuf Tx timestamp flag */
> +};
> +
> struct ci_tx_queue {
> union { /* TX ring virtual address */
> volatile struct i40e_tx_desc *i40e_tx_ring; @@ -77,6 +90,7 @@
> struct ci_tx_queue {
> union {
> struct { /* ICE driver specific values */
> uint32_t q_teid; /* TX schedule node id. */
> + struct ice_txtime tsq; /* Tx Time based queue */
> };
> struct { /* I40E driver specific values */
> uint8_t dcb_tc;
> diff --git a/drivers/net/intel/ice/base/ice_lan_tx_rx.h
> b/drivers/net/intel/ice/base/ice_lan_tx_rx.h
> index f92382346f..8b6c1a07a3 100644
> --- a/drivers/net/intel/ice/base/ice_lan_tx_rx.h
> +++ b/drivers/net/intel/ice/base/ice_lan_tx_rx.h
> @@ -1278,6 +1278,8 @@ struct ice_ts_desc {
> #define ICE_TXTIME_MAX_QUEUE 2047
> #define ICE_SET_TXTIME_MAX_Q_AMOUNT 127
> #define ICE_OP_TXTIME_MAX_Q_AMOUNT 2047
> +#define ICE_TXTIME_FETCH_TS_DESC_DFLT 8
> +#define ICE_TXTIME_FETCH_PROFILE_CNT 16
> /* Tx Time queue context data
> *
> * The sizes of the variables may be larger than needed due to
> crossing byte @@ -1303,8 +1305,10 @@ struct ice_txtime_ctx {
> u8 drbell_mode_32;
> #define ICE_TXTIME_CTX_DRBELL_MODE_32 1
> u8 ts_res;
> +#define ICE_TXTIME_CTX_RESOLUTION_128NS 7
> u8 ts_round_type;
> u8 ts_pacing_slot;
> +#define ICE_TXTIME_CTX_FETCH_PROF_ID_0 0
> u8 merging_ena;
> u8 ts_fetch_prof_id;
> u8 ts_fetch_cache_line_aln_thld;
> diff --git a/drivers/net/intel/ice/ice_ethdev.c
> b/drivers/net/intel/ice/ice_ethdev.c
> index 9478ba92df..3af9f6ba38 100644
> --- a/drivers/net/intel/ice/ice_ethdev.c
> +++ b/drivers/net/intel/ice/ice_ethdev.c
> @@ -4139,7 +4139,8 @@ ice_dev_info_get(struct rte_eth_dev *dev, struct
> rte_eth_dev_info *dev_info)
> RTE_ETH_TX_OFFLOAD_VXLAN_TNL_TSO |
> RTE_ETH_TX_OFFLOAD_GRE_TNL_TSO |
> RTE_ETH_TX_OFFLOAD_IPIP_TNL_TSO |
> - RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO;
> + RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO |
> + RTE_ETH_TX_OFFLOAD_SEND_ON_TIMESTAMP;
> dev_info->flow_type_rss_offloads |= ICE_RSS_OFFLOAD_ALL;
> }
>
> diff --git a/drivers/net/intel/ice/ice_ethdev.h
> b/drivers/net/intel/ice/ice_ethdev.h
> index bfe093afca..dd86bd030c 100644
> --- a/drivers/net/intel/ice/ice_ethdev.h
> +++ b/drivers/net/intel/ice/ice_ethdev.h
> @@ -17,6 +17,18 @@
> #include "base/ice_flow.h"
> #include "base/ice_sched.h"
>
> +#define __bf_shf(x) rte_bsf32(x)
> +#define FIELD_GET(_mask, _reg) \
> + (__extension__ ({ \
> + typeof(_mask) _x = (_mask); \
> + (typeof(_x))(((_reg) & (_x)) >> __bf_shf(_x)); \
> + }))
> +#define FIELD_PREP(_mask, _val) \
> + (__extension__ ({ \
> + typeof(_mask) _x = (_mask); \
> + ((typeof(_x))(_val) << __bf_shf(_x)) & (_x); \
> + }))
> +
The __bf_shf() macro is only used in this driver by these two macros. There is
therefore no reason to have the alias at all - just call rte_bsf32() directly,
which makes the code more readable.
> #define ICE_ADMINQ_LEN 32
> #define ICE_SBIOQ_LEN 32
> #define ICE_MAILBOXQ_LEN 32
> diff --git a/drivers/net/intel/ice/ice_rxtx.c
> b/drivers/net/intel/ice/ice_rxtx.c
> index ba1435b9de..0c5844e067 100644
> --- a/drivers/net/intel/ice/ice_rxtx.c
> +++ b/drivers/net/intel/ice/ice_rxtx.c
> @@ -740,6 +740,53 @@ ice_rx_queue_stop(struct rte_eth_dev *dev, uint16_t
> rx_queue_id)
> return 0;
> }
>
> +/**
> + * ice_setup_txtime_ctx - setup a struct ice_txtime_ctx instance
> + * @txq: The queue on which tstamp ring to configure
> + * @txtime_ctx: Pointer to the Tx time queue context structure to be
> +initialized
> + * @txtime_ena: Tx time enable flag, set to true if Tx time should be
> +enabled */ static int ice_setup_txtime_ctx(struct ci_tx_queue *txq,
> + struct ice_txtime_ctx *txtime_ctx, bool txtime_ena) {
> + struct ice_vsi *vsi = txq->ice_vsi;
> + struct ice_hw *hw = ICE_VSI_TO_HW(vsi);
> +
> + txtime_ctx->base = txq->tsq.ts_ring_dma >>
> +ICE_TX_CMPLTNQ_CTX_BASE_S;
> +
> + /* Tx time Queue Length */
> + txtime_ctx->qlen = txq->tsq.nb_ts_desc;
> +
> + if (txtime_ena)
> + txtime_ctx->txtime_ena_q = 1;
> +
> + /* PF number */
> + txtime_ctx->pf_num = hw->pf_id;
> +
> + switch (vsi->type) {
> + case ICE_VSI_LB:
> + case ICE_VSI_CTRL:
> + case ICE_VSI_ADI:
> + case ICE_VSI_PF:
> + txtime_ctx->vmvf_type = ICE_TLAN_CTX_VMVF_TYPE_PF;
> + break;
> + default:
> + PMD_DRV_LOG(ERR, "Unable to set VMVF type for VSI type %d",
> + vsi->type);
> + return -EINVAL;
> + }
> +
> + /* make sure the context is associated with the right VSI */
> + txtime_ctx->src_vsi = vsi->vsi_id;
> +
> + txtime_ctx->ts_res = ICE_TXTIME_CTX_RESOLUTION_128NS;
> + txtime_ctx->drbell_mode_32 = ICE_TXTIME_CTX_DRBELL_MODE_32;
> + txtime_ctx->ts_fetch_prof_id = ICE_TXTIME_CTX_FETCH_PROF_ID_0;
> +
> + return 0;
> +}
> +
> int
> ice_tx_queue_start(struct rte_eth_dev *dev, uint16_t tx_queue_id) {
> @@ -799,11 +846,6 @@ ice_tx_queue_start(struct rte_eth_dev *dev, uint16_t
> tx_queue_id)
> ice_set_ctx(hw, (uint8_t *)&tx_ctx, txq_elem->txqs[0].txq_ctx,
> ice_tlan_ctx_info);
>
> - txq->qtx_tail = hw->hw_addr + QTX_COMM_DBELL(txq->reg_idx);
> -
> - /* Init the Tx tail register*/
> - ICE_PCI_REG_WRITE(txq->qtx_tail, 0);
> -
> /* Fix me, we assume TC always 0 here */
> err = ice_ena_vsi_txq(hw->port_info, vsi->idx, 0, tx_queue_id, 1,
> txq_elem, buf_len, NULL);
> @@ -826,6 +868,40 @@ ice_tx_queue_start(struct rte_eth_dev *dev, uint16_t
> tx_queue_id)
> /* record what kind of descriptor cleanup we need on teardown */
> txq->vector_tx = ad->tx_vec_allowed;
>
> + if (txq->tsq.ts_flag > 0) {
> + struct ice_aqc_set_txtime_qgrp *ts_elem;
> + u8 ts_buf_len = ice_struct_size(ts_elem, txtimeqs, 1);
> + struct ice_txtime_ctx txtime_ctx = { 0 };
> +
> + ts_elem = ice_malloc(hw, ts_buf_len);
> + ice_setup_txtime_ctx(txq, &txtime_ctx,
> + true);
> + ice_set_ctx(hw, (u8 *)&txtime_ctx,
> + ts_elem->txtimeqs[0].txtime_ctx,
> + ice_txtime_ctx_info);
> +
> + txq->qtx_tail = hw->hw_addr +
> +
> E830_GLQTX_TXTIME_DBELL_LSB(txq->reg_idx);
> +
> + /* Init the Tx time tail register*/
> + ICE_PCI_REG_WRITE(txq->qtx_tail, 0);
> +
> + err = ice_aq_set_txtimeq(hw, txq->reg_idx, 1, ts_elem,
> + ts_buf_len,
> NULL);
> + if (err) {
> + PMD_DRV_LOG(ERR, "Failed to set Tx Time queue context,
> error: %d", err);
> + rte_free(txq_elem);
> + rte_free(ts_elem);
> + return err;
> + }
> + rte_free(ts_elem);
Small suggestion - if you move the rte_free up immediately after the
set_txtimeq call, you can avoid having to repeat the same op in the error leg
and in the non-error case.
> + } else {
> + txq->qtx_tail = hw->hw_addr + QTX_COMM_DBELL(txq->reg_idx);
> +
> + /* Init the Tx tail register*/
> + ICE_PCI_REG_WRITE(txq->qtx_tail, 0);
> + }
> +
> dev->data->tx_queue_state[tx_queue_id] =
> RTE_ETH_QUEUE_STATE_STARTED;
>
> rte_free(txq_elem);
> @@ -1046,6 +1122,20 @@ ice_reset_tx_queue(struct ci_tx_queue *txq)
>
> txq->last_desc_cleaned = (uint16_t)(txq->nb_tx_desc - 1);
> txq->nb_tx_free = (uint16_t)(txq->nb_tx_desc - 1);
> +
> + if (txq->tsq.ts_flag > 0) {
> + size = sizeof(struct ice_ts_desc) * txq->tsq.nb_ts_desc;
> + for (i = 0; i < size; i++)
> + ((volatile char *)txq->tsq.ice_ts_ring)[i] = 0;
Why not just use memset here?
I have used the same mechanism that is used for clearing the Tx ring in the
code just above.
> +
> + for (i = 0; i < txq->tsq.nb_ts_desc; i++) {
> + volatile struct ice_ts_desc *tsd =
> +
> &txq->tsq.ice_ts_ring[i];
> + tsd->tx_desc_idx_tstamp = 0;
> + }
Should the tx_desc_idx_tstamp not already be zero from the clearing op done
just above?
> +
> + txq->tsq.ts_tail = 0;
> + }
> }
>
> int
> @@ -1080,6 +1170,19 @@ ice_tx_queue_stop(struct rte_eth_dev *dev, uint16_t
> tx_queue_id)
> q_ids[0] = txq->reg_idx;
> q_teids[0] = txq->q_teid;
>
> + if (txq->tsq.ts_flag > 0) {
> + struct ice_aqc_ena_dis_txtime_qgrp txtime_pg;
> + status = ice_aq_ena_dis_txtimeq(hw, q_ids[0], 1, 0,
> + &txtime_pg, NULL);
> + if (status != ICE_SUCCESS) {
> + PMD_DRV_LOG(DEBUG, "Failed to disable Tx time queue");
> + return -EINVAL;
> + }
> + txq->tsq.ts_flag = 0;
> + txq->tsq.ts_offset = -1;
> + dev->dev_ops->timesync_disable(dev);
> + }
> +
> /* Fix me, we assume TC always 0 here */
> status = ice_dis_vsi_txq(hw->port_info, vsi->idx, 0, 1, &q_handle,
> q_ids, q_teids, ICE_NO_RESET, 0, NULL); @@
> -1166,6 +1269,7 @@
> ice_rx_queue_setup(struct rte_eth_dev *dev,
> struct rte_mempool *mp)
> {
> struct ice_pf *pf = ICE_DEV_PRIVATE_TO_PF(dev->data->dev_private);
> + struct ice_hw *hw = ICE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> struct ice_adapter *ad =
> ICE_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
> struct ice_vsi *vsi = pf->main_vsi;
> @@ -1249,7 +1353,7 @@ ice_rx_queue_setup(struct rte_eth_dev *dev,
> rxq->xtr_field_offs = ad->devargs.xtr_field_offs;
>
> /* Allocate the maximum number of RX ring hardware descriptor. */
> - len = ICE_MAX_RING_DESC;
> + len = ICE_MAX_NUM_DESC_BY_MAC(hw);
>
> /**
> * Allocating a little more memory because vectorized/bulk_alloc Rx
> @@ -1337,6 +1441,36 @@ ice_rx_queue_release(void *rxq)
> rte_free(q);
> }
>
> +/**
> + * ice_calc_ts_ring_count - Calculate the number of timestamp
> +descriptors
> + * @hw: pointer to the hardware structure
> + * @tx_desc_count: number of Tx descriptors in the ring
> + *
> + * Return: the number of timestamp descriptors */ static uint16_t
> +ice_calc_ts_ring_count(struct ice_hw *hw, u16 tx_desc_count)
Use DPDK style for declarations, putting the "static uint16_t" on its own line.
> +{
> + u16 prof = ICE_TXTIME_CTX_FETCH_PROF_ID_0;
> + u16 max_fetch_desc = 0;
> + u16 fetch;
> + u32 reg;
> + u16 i;
> +
> + for (i = 0; i < ICE_TXTIME_FETCH_PROFILE_CNT; i++) {
> + reg = rd32(hw, E830_GLTXTIME_FETCH_PROFILE(prof, 0));
> + fetch = FIELD_GET(E830_GLTXTIME_FETCH_PROFILE_FETCH_TS_DESC_M,
> + reg);
> + max_fetch_desc = max(fetch, max_fetch_desc);
> + }
> +
> + if (!max_fetch_desc)
> + max_fetch_desc = ICE_TXTIME_FETCH_TS_DESC_DFLT;
> +
> + max_fetch_desc = RTE_ALIGN(max_fetch_desc, ICE_REQ_DESC_MULTIPLE);
> +
> + return tx_desc_count + max_fetch_desc; }
> +
> int
> ice_tx_queue_setup(struct rte_eth_dev *dev,
> uint16_t queue_idx,
> @@ -1345,6 +1479,7 @@ ice_tx_queue_setup(struct rte_eth_dev *dev,
> const struct rte_eth_txconf *tx_conf) {
> struct ice_pf *pf = ICE_DEV_PRIVATE_TO_PF(dev->data->dev_private);
> + struct ice_hw *hw = ICE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
> struct ice_vsi *vsi = pf->main_vsi;
> struct ci_tx_queue *txq;
> const struct rte_memzone *tz;
> @@ -1469,7 +1604,8 @@ ice_tx_queue_setup(struct rte_eth_dev *dev,
> }
>
> /* Allocate TX hardware ring descriptors. */
> - ring_size = sizeof(struct ice_tx_desc) * ICE_MAX_RING_DESC;
> + ring_size = sizeof(struct ice_tx_desc) *
> + ICE_MAX_NUM_DESC_BY_MAC(hw);
> ring_size = RTE_ALIGN(ring_size, ICE_DMA_MEM_ALIGN);
> tz = rte_eth_dma_zone_reserve(dev, "ice_tx_ring", queue_idx,
> ring_size, ICE_RING_BASE_ALIGN, @@
> -1507,6 +1643,42 @@
> ice_tx_queue_setup(struct rte_eth_dev *dev,
> return -ENOMEM;
> }
>
> + if (vsi->type == ICE_VSI_PF &&
> + (offloads & RTE_ETH_TX_OFFLOAD_SEND_ON_TIMESTAMP) &&
> + txq->tsq.ts_offset == 0 && hw->phy_model == ICE_PHY_E830) {
> + int ret =
> + rte_mbuf_dyn_tx_timestamp_register(&txq->tsq.ts_offset,
> + &txq->tsq.ts_flag);
> + if (ret) {
> + PMD_INIT_LOG(ERR, "Cannot register Tx mbuf field/flag "
> + "for timestamp");
> + return -EINVAL;
> + }
> + dev->dev_ops->timesync_enable(dev);
> +
> + ring_size = sizeof(struct ice_ts_desc) *
> + ICE_MAX_NUM_DESC_BY_MAC(hw);
> + ring_size = RTE_ALIGN(ring_size, ICE_DMA_MEM_ALIGN);
> + const struct rte_memzone *ts_z =
> + rte_eth_dma_zone_reserve(dev,
> "ice_tstamp_ring",
> + queue_idx, ring_size,
> ICE_RING_BASE_ALIGN,
> + socket_id);
> + if (!ts_z) {
> + ice_tx_queue_release(txq);
> + PMD_INIT_LOG(ERR, "Failed to reserve DMA memory "
> + "for TX timestamp");
> + return -ENOMEM;
> + }
> + txq->tsq.ts_mz = ts_z;
> + txq->tsq.ice_ts_ring = ts_z->addr;
> + txq->tsq.ts_ring_dma = ts_z->iova;
> + txq->tsq.nb_ts_desc =
> + ice_calc_ts_ring_count(ICE_VSI_TO_HW(vsi),
> + txq->nb_tx_desc);
This looks wrong, and is likely to lead to an overflow. What happens if
txq->nb_tx_desc == ICE_MAX_NUM_DESC_BY_MAC(hw)? Since ice_calc_ts_ring_count()
always returns a value greater than txq->nb_tx_desc, you will overwrite other
data on reset.
No, we store the number of ts descriptors separately, and it is always greater
than the number of Tx ring descriptors. This is a HW workaround that fixes an
MDD event.
> + } else {
> + txq->tsq.ice_ts_ring = NULL;
> + }
> +
> ice_reset_tx_queue(txq);
> txq->q_set = true;
> dev->data->tx_queues[queue_idx] = txq; @@ -1539,6 +1711,8 @@
> ice_tx_queue_release(void *txq)
>
> ci_txq_release_all_mbufs(q, false);
> rte_free(q->sw_ring);
> + if (q->tsq.ts_mz)
> + rte_memzone_free(q->tsq.ts_mz);
> rte_memzone_free(q->mz);
> rte_free(q);
> }
> @@ -2961,6 +3135,7 @@ ice_xmit_pkts(void *tx_queue, struct rte_mbuf
> **tx_pkts, uint16_t nb_pkts)
> struct rte_mbuf *m_seg;
> uint32_t cd_tunneling_params;
> uint16_t tx_id;
> + uint16_t ts_id = -1;
> uint16_t nb_tx;
> uint16_t nb_used;
> uint16_t nb_ctx;
> @@ -2979,6 +3154,9 @@ ice_xmit_pkts(void *tx_queue, struct rte_mbuf
> **tx_pkts, uint16_t nb_pkts)
> tx_id = txq->tx_tail;
> txe = &sw_ring[tx_id];
>
> + if (txq->tsq.ts_flag > 0)
> + ts_id = txq->tsq.ts_tail;
> +
> /* Check if the descriptor ring needs to be cleaned. */
> if (txq->nb_tx_free < txq->tx_free_thresh)
> (void)ice_xmit_cleanup(txq);
> @@ -3166,10 +3344,48 @@ ice_xmit_pkts(void *tx_queue, struct rte_mbuf
> **tx_pkts, uint16_t nb_pkts)
> txd->cmd_type_offset_bsz |=
> rte_cpu_to_le_64(((uint64_t)td_cmd) <<
> ICE_TXD_QW1_CMD_S);
> +
> + if (txq->tsq.ts_flag > 0) {
> + uint64_t txtime = *RTE_MBUF_DYNFIELD(tx_pkt,
> + txq->tsq.ts_offset, uint64_t *);
> + uint32_t tstamp = (uint32_t)(txtime % NS_PER_S) >>
> + ICE_TXTIME_CTX_RESOLUTION_128NS;
> + if (tx_id == 0)
> + txq->tsq.ice_ts_ring[ts_id].tx_desc_idx_tstamp =
> +
> rte_cpu_to_le_32(FIELD_PREP(ICE_TXTIME_TX_DESC_IDX_M,
> + txq->nb_tx_desc) |
> FIELD_PREP(ICE_TXTIME_STAMP_M,
> + tstamp));
This indentation needs fixing. It looks like a block of 4 statements!
> + else
> + txq->tsq.ice_ts_ring[ts_id].tx_desc_idx_tstamp =
> +
> rte_cpu_to_le_32(FIELD_PREP(ICE_TXTIME_TX_DESC_IDX_M,
> + tx_id) | FIELD_PREP(ICE_TXTIME_STAMP_M,
> tstamp));
Same here, and below also.
> + ts_id++;
> + /* Handling MDD issue causing Tx Hang */
> + if (ts_id == txq->tsq.nb_ts_desc) {
> + uint16_t fetch = txq->tsq.nb_ts_desc -
> txq->nb_tx_desc;
> + ts_id = 0;
> + for (; ts_id < fetch; ts_id++) {
> + if (tx_id == 0)
> +
> txq->tsq.ice_ts_ring[ts_id].tx_desc_idx_tstamp =
> +
> rte_cpu_to_le_32(FIELD_PREP(ICE_TXTIME_TX_DESC_IDX_M,
> + txq->nb_tx_desc) |
> FIELD_PREP(ICE_TXTIME_STAMP_M,
> + tstamp));
> + else
> +
> txq->tsq.ice_ts_ring[ts_id].tx_desc_idx_tstamp =
> +
> rte_cpu_to_le_32(FIELD_PREP(ICE_TXTIME_TX_DESC_IDX_M,
> + tx_id) |
> FIELD_PREP(ICE_TXTIME_STAMP_M, tstamp));
> + }
> + }
> + }
> }
<snip>