Implement TSO (TCP segmentation offload) in the ixgbe driver. The driver can now use the PKT_TX_TCP_SEG mbuf flag and the mbuf hardware offload information (l2_len, l3_len, l4_len, tso_segsz) to configure hardware TCP segmentation.
In ixgbe, when doing TSO, the IP length must not be included in the TCP pseudo header checksum. A new function ixgbe_fix_tcp_phdr_cksum() is used to fix the pseudo header checksum of the packet before giving it to the hardware. In the patch, the tx_desc_cksum_flags_to_olinfo() and tx_desc_ol_flags_to_cmdtype() functions have been reworked to make them clearer. This should not impact performance as gcc (version 4.8 in my case) is smart enough to convert the tests into a code that does not contain any branch instruction. Signed-off-by: Olivier Matz <olivier.matz at 6wind.com> --- lib/librte_pmd_ixgbe/ixgbe_ethdev.c | 3 +- lib/librte_pmd_ixgbe/ixgbe_rxtx.c | 220 +++++++++++++++++++++++++----------- lib/librte_pmd_ixgbe/ixgbe_rxtx.h | 19 ++-- 3 files changed, 167 insertions(+), 75 deletions(-) diff --git a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c b/lib/librte_pmd_ixgbe/ixgbe_ethdev.c index 9c73a30..1ab433a 100644 --- a/lib/librte_pmd_ixgbe/ixgbe_ethdev.c +++ b/lib/librte_pmd_ixgbe/ixgbe_ethdev.c @@ -1961,7 +1961,8 @@ ixgbe_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info) DEV_TX_OFFLOAD_IPV4_CKSUM | DEV_TX_OFFLOAD_UDP_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM | - DEV_TX_OFFLOAD_SCTP_CKSUM; + DEV_TX_OFFLOAD_SCTP_CKSUM | + DEV_TX_OFFLOAD_TCP_TSO; dev_info->default_rxconf = (struct rte_eth_rxconf) { .rx_thresh = { diff --git a/lib/librte_pmd_ixgbe/ixgbe_rxtx.c b/lib/librte_pmd_ixgbe/ixgbe_rxtx.c index 54a0fc1..79f7395 100644 --- a/lib/librte_pmd_ixgbe/ixgbe_rxtx.c +++ b/lib/librte_pmd_ixgbe/ixgbe_rxtx.c @@ -354,62 +354,132 @@ ixgbe_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts, return nb_tx; } +/* When doing TSO, the IP length must not be included in the pseudo + * header checksum of the packet given to the hardware */ +static inline void +ixgbe_fix_tcp_phdr_cksum(struct rte_mbuf *m) +{ + char *data; + uint16_t *cksum_ptr; + uint16_t prev_cksum; + uint16_t new_cksum; + uint16_t ip_len, ip_paylen; + uint32_t tmp; + uint8_t ip_version; + + /* get phdr 
cksum at offset 16 of TCP header */ + data = rte_pktmbuf_mtod(m, char *); + cksum_ptr = (uint16_t *)(data + m->l2_len + m->l3_len + 16); + prev_cksum = *cksum_ptr; + + /* get ip_version */ + ip_version = (*(uint8_t *)(data + m->l2_len)) >> 4; + + /* get ip_len at offset 2 of IP header or offset 4 of IPv6 header */ + if (ip_version == 4) { + /* override ip cksum to 0 */ + data[m->l2_len + 10] = 0; + data[m->l2_len + 11] = 0; + + ip_len = *(uint16_t *)(data + m->l2_len + 2); + ip_paylen = rte_cpu_to_be_16(rte_be_to_cpu_16(ip_len) - + m->l3_len); + } else { + ip_paylen = *(uint16_t *)(data + m->l2_len + 4); + } + + /* calculate the new phdr checksum that doesn't include ip_paylen */ + tmp = prev_cksum; + if (tmp < ip_paylen) + tmp += 0xffff; + tmp -= ip_paylen; + new_cksum = tmp; + + /* replace it in the packet */ + *cksum_ptr = new_cksum; +} + static inline void ixgbe_set_xmit_ctx(struct igb_tx_queue* txq, volatile struct ixgbe_adv_tx_context_desc *ctx_txd, - uint64_t ol_flags, uint32_t vlan_macip_lens) + uint64_t ol_flags, union ixgbe_tx_offload tx_offload) { uint32_t type_tucmd_mlhl; - uint32_t mss_l4len_idx; + uint32_t mss_l4len_idx = 0; uint32_t ctx_idx; - uint32_t cmp_mask; + uint32_t vlan_macip_lens; + union ixgbe_tx_offload tx_offload_mask; ctx_idx = txq->ctx_curr; - cmp_mask = 0; + tx_offload_mask.data = 0; type_tucmd_mlhl = 0; + /* Specify which HW CTX to upload. 
*/ + mss_l4len_idx |= (ctx_idx << IXGBE_ADVTXD_IDX_SHIFT); + if (ol_flags & PKT_TX_VLAN_PKT) { - cmp_mask |= TX_VLAN_CMP_MASK; + tx_offload_mask.vlan_tci = ~0; } - if (ol_flags & PKT_TX_IP_CKSUM) { - type_tucmd_mlhl = IXGBE_ADVTXD_TUCMD_IPV4; - cmp_mask |= TX_MACIP_LEN_CMP_MASK; - } + /* check if TCP segmentation required for this packet */ + if (ol_flags & PKT_TX_TCP_SEG) { + /* implies IP cksum and TCP cksum */ + type_tucmd_mlhl = IXGBE_ADVTXD_TUCMD_IPV4 | + IXGBE_ADVTXD_TUCMD_L4T_TCP | + IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT;; + + tx_offload_mask.l2_len = ~0; + tx_offload_mask.l3_len = ~0; + tx_offload_mask.l4_len = ~0; + tx_offload_mask.tso_segsz = ~0; + mss_l4len_idx |= tx_offload.tso_segsz << IXGBE_ADVTXD_MSS_SHIFT; + mss_l4len_idx |= tx_offload.l4_len << IXGBE_ADVTXD_L4LEN_SHIFT; + } else { /* no TSO, check if hardware checksum is needed */ + if (ol_flags & PKT_TX_IP_CKSUM) { + type_tucmd_mlhl = IXGBE_ADVTXD_TUCMD_IPV4; + tx_offload_mask.l2_len = ~0; + tx_offload_mask.l3_len = ~0; + } - /* Specify which HW CTX to upload. 
*/ - mss_l4len_idx = (ctx_idx << IXGBE_ADVTXD_IDX_SHIFT); - switch (ol_flags & PKT_TX_L4_MASK) { - case PKT_TX_UDP_CKSUM: - type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_UDP | + switch (ol_flags & PKT_TX_L4_MASK) { + case PKT_TX_UDP_CKSUM: + type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_UDP | IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT; - mss_l4len_idx |= sizeof(struct udp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT; - cmp_mask |= TX_MACIP_LEN_CMP_MASK; - break; - case PKT_TX_TCP_CKSUM: - type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP | + mss_l4len_idx |= sizeof(struct udp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT; + tx_offload_mask.l2_len = ~0; + tx_offload_mask.l3_len = ~0; + break; + case PKT_TX_TCP_CKSUM: + type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP | IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT; - mss_l4len_idx |= sizeof(struct tcp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT; - cmp_mask |= TX_MACIP_LEN_CMP_MASK; - break; - case PKT_TX_SCTP_CKSUM: - type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_SCTP | + mss_l4len_idx |= sizeof(struct tcp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT; + tx_offload_mask.l2_len = ~0; + tx_offload_mask.l3_len = ~0; + tx_offload_mask.l4_len = ~0; + break; + case PKT_TX_SCTP_CKSUM: + type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_SCTP | IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT; - mss_l4len_idx |= sizeof(struct sctp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT; - cmp_mask |= TX_MACIP_LEN_CMP_MASK; - break; - default: - type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_RSV | + mss_l4len_idx |= sizeof(struct sctp_hdr) << IXGBE_ADVTXD_L4LEN_SHIFT; + tx_offload_mask.l2_len = ~0; + tx_offload_mask.l3_len = ~0; + break; + default: + type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_RSV | IXGBE_ADVTXD_DTYP_CTXT | IXGBE_ADVTXD_DCMD_DEXT; - break; + break; + } } txq->ctx_cache[ctx_idx].flags = ol_flags; - txq->ctx_cache[ctx_idx].cmp_mask = cmp_mask; - txq->ctx_cache[ctx_idx].vlan_macip_lens.data = - vlan_macip_lens & cmp_mask; + txq->ctx_cache[ctx_idx].tx_offload.data = + tx_offload_mask.data & tx_offload.data; + 
txq->ctx_cache[ctx_idx].tx_offload_mask = tx_offload_mask; ctx_txd->type_tucmd_mlhl = rte_cpu_to_le_32(type_tucmd_mlhl); + vlan_macip_lens = tx_offload.l3_len; + vlan_macip_lens |= (tx_offload.l2_len << IXGBE_ADVTXD_MACLEN_SHIFT); + vlan_macip_lens |= ((uint32_t)tx_offload.vlan_tci << IXGBE_ADVTXD_VLAN_SHIFT); ctx_txd->vlan_macip_lens = rte_cpu_to_le_32(vlan_macip_lens); ctx_txd->mss_l4len_idx = rte_cpu_to_le_32(mss_l4len_idx); ctx_txd->seqnum_seed = 0; @@ -421,20 +491,20 @@ ixgbe_set_xmit_ctx(struct igb_tx_queue* txq, */ static inline uint32_t what_advctx_update(struct igb_tx_queue *txq, uint64_t flags, - uint32_t vlan_macip_lens) + union ixgbe_tx_offload tx_offload) { /* If match with the current used context */ if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) && - (txq->ctx_cache[txq->ctx_curr].vlan_macip_lens.data == - (txq->ctx_cache[txq->ctx_curr].cmp_mask & vlan_macip_lens)))) { + (txq->ctx_cache[txq->ctx_curr].tx_offload.data == + (txq->ctx_cache[txq->ctx_curr].tx_offload_mask.data & tx_offload.data)))) { return txq->ctx_curr; } /* What if match with the next context */ txq->ctx_curr ^= 1; if (likely((txq->ctx_cache[txq->ctx_curr].flags == flags) && - (txq->ctx_cache[txq->ctx_curr].vlan_macip_lens.data == - (txq->ctx_cache[txq->ctx_curr].cmp_mask & vlan_macip_lens)))) { + (txq->ctx_cache[txq->ctx_curr].tx_offload.data == + (txq->ctx_cache[txq->ctx_curr].tx_offload_mask.data & tx_offload.data)))) { return txq->ctx_curr; } @@ -445,20 +515,25 @@ what_advctx_update(struct igb_tx_queue *txq, uint64_t flags, static inline uint32_t tx_desc_cksum_flags_to_olinfo(uint64_t ol_flags) { - static const uint32_t l4_olinfo[2] = {0, IXGBE_ADVTXD_POPTS_TXSM}; - static const uint32_t l3_olinfo[2] = {0, IXGBE_ADVTXD_POPTS_IXSM}; - uint32_t tmp; - - tmp = l4_olinfo[(ol_flags & PKT_TX_L4_MASK) != PKT_TX_L4_NO_CKSUM]; - tmp |= l3_olinfo[(ol_flags & PKT_TX_IP_CKSUM) != 0]; + uint32_t tmp = 0; + if ((ol_flags & PKT_TX_L4_MASK) != PKT_TX_L4_NO_CKSUM) + tmp |= 
IXGBE_ADVTXD_POPTS_TXSM; + if (ol_flags & PKT_TX_IP_CKSUM) + tmp |= IXGBE_ADVTXD_POPTS_IXSM; + if (ol_flags & PKT_TX_TCP_SEG) + tmp |= IXGBE_ADVTXD_POPTS_TXSM; return tmp; } static inline uint32_t -tx_desc_vlan_flags_to_cmdtype(uint64_t ol_flags) +tx_desc_ol_flags_to_cmdtype(uint64_t ol_flags) { - static const uint32_t vlan_cmd[2] = {0, IXGBE_ADVTXD_DCMD_VLE}; - return vlan_cmd[(ol_flags & PKT_TX_VLAN_PKT) != 0]; + uint32_t cmdtype = 0; + if (ol_flags & PKT_TX_VLAN_PKT) + cmdtype |= IXGBE_ADVTXD_DCMD_VLE; + if (ol_flags & PKT_TX_TCP_SEG) + cmdtype |= IXGBE_ADVTXD_DCMD_TSE; + return cmdtype; } /* Default RS bit threshold values */ @@ -539,14 +614,6 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, volatile union ixgbe_adv_tx_desc *txd; struct rte_mbuf *tx_pkt; struct rte_mbuf *m_seg; - union ixgbe_vlan_macip vlan_macip_lens; - union { - uint16_t u16; - struct { - uint16_t l3_len:9; - uint16_t l2_len:7; - }; - } l2_l3_len; uint64_t buf_dma_addr; uint32_t olinfo_status; uint32_t cmd_type_len; @@ -560,6 +627,7 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint64_t tx_ol_req; uint32_t ctx = 0; uint32_t new_ctx; + union ixgbe_tx_offload tx_offload = { .data = 0 }; txq = tx_queue; sw_ring = txq->sw_ring; @@ -587,17 +655,19 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, ol_flags = tx_pkt->ol_flags; /* If hardware offload required */ - tx_ol_req = ol_flags & (PKT_TX_VLAN_PKT | PKT_TX_IP_CKSUM | - PKT_TX_L4_MASK); + tx_ol_req = ol_flags & + (PKT_TX_VLAN_PKT | PKT_TX_IP_CKSUM | PKT_TX_L4_MASK | + PKT_TX_TCP_SEG); if (tx_ol_req) { - l2_l3_len.l2_len = tx_pkt->l2_len; - l2_l3_len.l3_len = tx_pkt->l3_len; - vlan_macip_lens.f.vlan_tci = tx_pkt->vlan_tci; - vlan_macip_lens.f.l2_l3_len = l2_l3_len.u16; + tx_offload.l2_len = tx_pkt->l2_len; + tx_offload.l3_len = tx_pkt->l3_len; + tx_offload.l4_len = tx_pkt->l4_len; + tx_offload.vlan_tci = tx_pkt->vlan_tci; + tx_offload.tso_segsz = tx_pkt->tso_segsz; /* If new context need be built or reuse 
the exist ctx. */ ctx = what_advctx_update(txq, tx_ol_req, - vlan_macip_lens.data); + tx_offload); /* Only allocate context descriptor if required*/ new_ctx = (ctx == IXGBE_CTX_NUM); ctx = txq->ctx_curr; @@ -712,13 +782,26 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, */ cmd_type_len = IXGBE_ADVTXD_DTYP_DATA | IXGBE_ADVTXD_DCMD_IFCS | IXGBE_ADVTXD_DCMD_DEXT; - olinfo_status = (pkt_len << IXGBE_ADVTXD_PAYLEN_SHIFT); + #ifdef RTE_LIBRTE_IEEE1588 if (ol_flags & PKT_TX_IEEE1588_TMST) cmd_type_len |= IXGBE_ADVTXD_MAC_1588; #endif + olinfo_status = 0; if (tx_ol_req) { + + if (ol_flags & PKT_TX_TCP_SEG) { + /* when TSO is on, paylen in descriptor is the + * not the packet len but the tcp payload len */ + pkt_len -= (tx_offload.l2_len + + tx_offload.l3_len + tx_offload.l4_len); + + /* the pseudo header checksum must be modified: + * it should not include the ip_len */ + ixgbe_fix_tcp_phdr_cksum(tx_pkt); + } + /* * Setup the TX Advanced Context Descriptor if required */ @@ -739,7 +822,7 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, } ixgbe_set_xmit_ctx(txq, ctx_txd, tx_ol_req, - vlan_macip_lens.data); + tx_offload); txe->last_id = tx_last; tx_id = txe->next_id; @@ -751,11 +834,13 @@ ixgbe_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, * This path will go through * whatever new/reuse the context descriptor */ - cmd_type_len |= tx_desc_vlan_flags_to_cmdtype(ol_flags); + cmd_type_len |= tx_desc_ol_flags_to_cmdtype(ol_flags); olinfo_status |= tx_desc_cksum_flags_to_olinfo(ol_flags); olinfo_status |= ctx << IXGBE_ADVTXD_IDX_SHIFT; } + olinfo_status |= (pkt_len << IXGBE_ADVTXD_PAYLEN_SHIFT); + m_seg = tx_pkt; do { txd = &txr[tx_id]; @@ -3600,9 +3685,10 @@ ixgbe_dev_tx_init(struct rte_eth_dev *dev) PMD_INIT_FUNC_TRACE(); hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private); - /* Enable TX CRC (checksum offload requirement) */ + /* Enable TX CRC (checksum offload requirement) and hw padding + * (TSO requirement) */ hlreg0 = IXGBE_READ_REG(hw, 
IXGBE_HLREG0); - hlreg0 |= IXGBE_HLREG0_TXCRCEN; + hlreg0 |= (IXGBE_HLREG0_TXCRCEN | IXGBE_HLREG0_TXPADEN); IXGBE_WRITE_REG(hw, IXGBE_HLREG0, hlreg0); /* Setup the Base and Length of the Tx Descriptor Rings */ diff --git a/lib/librte_pmd_ixgbe/ixgbe_rxtx.h b/lib/librte_pmd_ixgbe/ixgbe_rxtx.h index eb89715..13099af 100644 --- a/lib/librte_pmd_ixgbe/ixgbe_rxtx.h +++ b/lib/librte_pmd_ixgbe/ixgbe_rxtx.h @@ -145,13 +145,16 @@ enum ixgbe_advctx_num { }; /** Offload features */ -union ixgbe_vlan_macip { - uint32_t data; +union ixgbe_tx_offload { + uint64_t data; struct { - uint16_t l2_l3_len; /**< combined 9-bit l3, 7-bit l2 lengths */ - uint16_t vlan_tci; + uint64_t l2_len:7; /**< L2 (MAC) Header Length. */ + uint64_t l3_len:9; /**< L3 (IP) Header Length. */ + uint64_t l4_len:8; /**< L4 (TCP/UDP) Header Length. */ + uint64_t tso_segsz:16; /**< TCP TSO segment size */ + uint64_t vlan_tci:16; /**< VLAN Tag Control Identifier (CPU order). */ - } f; + }; }; /* @@ -170,8 +173,10 @@ union ixgbe_vlan_macip { struct ixgbe_advctx_info { uint64_t flags; /**< ol_flags for context build. */ - uint32_t cmp_mask; /**< compare mask for vlan_macip_lens */ - union ixgbe_vlan_macip vlan_macip_lens; /**< vlan, mac ip length. */ + /**< tx offload: vlan, tso, l2-l3-l4 lengths. */ + union ixgbe_tx_offload tx_offload; + /** compare mask for tx offload. */ + union ixgbe_tx_offload tx_offload_mask; }; /** -- 2.1.0