On 11/24/22 06:30, Mike Pattrick wrote:
> From: Flavio Leitner <f...@sysclose.org>
> 
> The netdev receiving packets is supposed to provide the flags
> indicating if the IP checksum was verified and it is GOOD or BAD,
> otherwise the stack will check when appropriate by software.
> 
> If the packet comes with good checksum, then postpone the
> checksum calculation to the egress device if needed.
> 
> When encapsulate a packet with that flag, set the checksum
> of the inner IP header since that is not yet supported.
> 
> Calculate the IP checksum when the packet is going to be sent over
> a device that doesn't support the feature.
> 
> Linux devices don't support IP checksum offload alone, so the
> support is not enabled.
> 
> Signed-off-by: Flavio Leitner <f...@sysclose.org>
> Co-authored-by: Mike Pattrick <m...@redhat.com>
> Signed-off-by: Mike Pattrick <m...@redhat.com>
> ---

Hi.

I didn't test this patch.  Only a visual review.

See comments inline.

Best regards, Ilya Maximets.

>  lib/conntrack.c                     | 17 +++---
>  lib/dp-packet.c                     | 15 ++++++
>  lib/dp-packet.h                     | 60 +++++++++++++++++++--
>  lib/dpif-netdev.c                   |  4 ++
>  lib/flow.c                          | 15 ++++--
>  lib/ipf.c                           | 11 ++--
>  lib/netdev-dpdk.c                   | 81 ++++++++++++++++-------------
>  lib/netdev-dummy.c                  | 23 ++++++++
>  lib/netdev-native-tnl.c             | 21 +++++---
>  lib/netdev.c                        | 16 ++++++
>  lib/odp-execute.c                   | 21 ++++++--
>  lib/packets.c                       | 34 +++++++++---
>  tests/automake.mk                   |  1 +
>  tests/system-userspace-offload.at   | 79 ++++++++++++++++++++++++++++
>  tests/system-userspace-testsuite.at |  1 +
>  15 files changed, 328 insertions(+), 71 deletions(-)
>  create mode 100644 tests/system-userspace-offload.at
> 
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 550b2be9b..12194cce8 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -2101,16 +2101,15 @@ conn_key_extract(struct conntrack *ct, struct 
> dp_packet *pkt, ovs_be16 dl_type,
>      ctx->key.dl_type = dl_type;
>  
>      if (ctx->key.dl_type == htons(ETH_TYPE_IP)) {
> -        bool hwol_bad_l3_csum = dp_packet_ip_checksum_bad(pkt);
> -        if (hwol_bad_l3_csum) {
> +        if (dp_packet_ip_checksum_bad(pkt)) {
>              ok = false;
>              COVERAGE_INC(conntrack_l3csum_err);
>          } else {
> -            bool hwol_good_l3_csum = dp_packet_ip_checksum_valid(pkt)
> -                                     || dp_packet_hwol_is_ipv4(pkt);
> -            /* Validate the checksum only when hwol is not supported. */
> +            /* Validate the checksum only when hwol is not supported and the
> +             * packets checksum status is not known. */

s/packets/packet's/

>              ok = extract_l3_ipv4(&ctx->key, l3, dp_packet_l3_size(pkt), NULL,
> -                                 !hwol_good_l3_csum);
> +                                 !dp_packet_hwol_is_ipv4(pkt) &&
> +                                 !dp_packet_ip_checksum_good(pkt));
>          }
>      } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) {
>          ok = extract_l3_ipv6(&ctx->key, l3, dp_packet_l3_size(pkt), NULL);
> @@ -2121,7 +2120,7 @@ conn_key_extract(struct conntrack *ct, struct dp_packet 
> *pkt, ovs_be16 dl_type,
>      if (ok) {
>          bool hwol_bad_l4_csum = dp_packet_l4_checksum_bad(pkt);
>          if (!hwol_bad_l4_csum) {
> -            bool  hwol_good_l4_csum = dp_packet_l4_checksum_valid(pkt)
> +            bool  hwol_good_l4_csum = dp_packet_l4_checksum_good(pkt)

Might make sense to remove the extra space, since we're here.

>                                        || dp_packet_hwol_tx_l4_checksum(pkt);
>              /* Validate the checksum only when hwol is not supported. */
>              if (extract_l4(&ctx->key, l4, dp_packet_l4_size(pkt),
> @@ -3431,7 +3430,9 @@ handle_ftp_ctl(struct conntrack *ct, const struct 
> conn_lookup_ctx *ctx,
>                  }
>                  if (seq_skew) {
>                      ip_len = ntohs(l3_hdr->ip_tot_len) + seq_skew;
> -                    if (!dp_packet_hwol_is_ipv4(pkt)) {
> +                    if (dp_packet_hwol_tx_ip_csum(pkt)) {
> +                        dp_packet_ol_reset_ip_csum_good(pkt);
> +                    } else {
>                          l3_hdr->ip_csum = recalc_csum16(l3_hdr->ip_csum,
>                                                          l3_hdr->ip_tot_len,
>                                                          htons(ip_len));
> diff --git a/lib/dp-packet.c b/lib/dp-packet.c
> index 4538d2a61..90ef85de3 100644
> --- a/lib/dp-packet.c
> +++ b/lib/dp-packet.c
> @@ -21,6 +21,7 @@
>  #include "dp-packet.h"
>  #include "netdev-afxdp.h"
>  #include "netdev-dpdk.h"
> +#include "netdev-provider.h"
>  #include "openvswitch/dynamic-string.h"
>  #include "util.h"
>  
> @@ -530,3 +531,17 @@ dp_packet_compare_offsets(struct dp_packet *b1, struct 
> dp_packet *b2,
>      }
>      return true;
>  }
> +
> +/* Checks if the packet 'p' is compatible with netdev_ol_flags 'flags'
> + * and if not, update the packet with the software fall back. */

s/update/updates/

> +void
> +dp_packet_ol_send_prepare(struct dp_packet *p, const uint64_t flags)

s/const uint64_t/uint64_t/

> +{
> +    if (dp_packet_ip_checksum_good(p) || !dp_packet_hwol_tx_ip_csum(p)) {
> +        dp_packet_hwol_reset_tx_ip_csum(p);
> +    } else if (!(flags & NETDEV_TX_OFFLOAD_IPV4_CKSUM)) {
> +        dp_packet_ip_set_header_csum(p);
> +        dp_packet_ol_set_ip_csum_good(p);
> +        dp_packet_hwol_reset_tx_ip_csum(p);
> +    }
> +}
> diff --git a/lib/dp-packet.h b/lib/dp-packet.h
> index 55eeaab2c..f60618716 100644
> --- a/lib/dp-packet.h
> +++ b/lib/dp-packet.h
> @@ -25,6 +25,7 @@
>  #include <rte_mbuf.h>
>  #endif
>  
> +#include "csum.h"
>  #include "netdev-afxdp.h"
>  #include "netdev-dpdk.h"
>  #include "openvswitch/list.h"
> @@ -83,6 +84,8 @@ enum dp_packet_offload_mask {
>      DEF_OL_FLAG(DP_PACKET_OL_TX_UDP_CKSUM, RTE_MBUF_F_TX_UDP_CKSUM, 0x400),
>      /* Offload SCTP checksum. */
>      DEF_OL_FLAG(DP_PACKET_OL_TX_SCTP_CKSUM, RTE_MBUF_F_TX_SCTP_CKSUM, 0x800),
> +    /* Offload IP checksum. */
> +    DEF_OL_FLAG(DP_PACKET_OL_TX_IP_CKSUM, RTE_MBUF_F_TX_IP_CKSUM, 0x1000),
>      /* Adding new field requires adding to DP_PACKET_OL_SUPPORTED_MASK. */
>  };
>  
> @@ -97,7 +100,8 @@ enum dp_packet_offload_mask {
>                                       DP_PACKET_OL_TX_IPV6          | \
>                                       DP_PACKET_OL_TX_TCP_CKSUM     | \
>                                       DP_PACKET_OL_TX_UDP_CKSUM     | \
> -                                     DP_PACKET_OL_TX_SCTP_CKSUM)
> +                                     DP_PACKET_OL_TX_SCTP_CKSUM    | \
> +                                     DP_PACKET_OL_TX_IP_CKSUM)
>  
>  #define DP_PACKET_OL_TX_L4_MASK (DP_PACKET_OL_TX_TCP_CKSUM | \
>                                   DP_PACKET_OL_TX_UDP_CKSUM | \
> @@ -239,6 +243,7 @@ static inline bool dp_packet_equal(const struct dp_packet 
> *,
>  bool dp_packet_compare_offsets(struct dp_packet *good,
>                                 struct dp_packet *test,
>                                 struct ds *err_str);
> +void dp_packet_ol_send_prepare(struct dp_packet *, const uint64_t);

s/const//

>  
>  
>  /* Frees memory that 'b' points to, as well as 'b' itself. */
> @@ -1028,6 +1033,26 @@ dp_packet_hwol_set_tx_ipv6(struct dp_packet *b)
>      *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_IPV6;
>  }
>  
> +/* Returns 'true' if packet 'p' is marked for IPv4 checksum offloading. */
> +static inline bool
> +dp_packet_hwol_tx_ip_csum(const struct dp_packet *p)
> +{
> +    return !!(*dp_packet_ol_flags_ptr(p) & DP_PACKET_OL_TX_IP_CKSUM);
> +}
> +
> +/* Marks packet 'p' for IPv4 checksum offloading. */
> +static inline void
> +dp_packet_hwol_set_tx_ip_csum(struct dp_packet *p)
> +{
> +    *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_TX_IP_CKSUM;
> +}
> +
> +static inline void
> +dp_packet_hwol_reset_tx_ip_csum(struct dp_packet *p)
> +{
> +    *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_TX_IP_CKSUM;
> +}
> +
>  /* Mark packet 'b' for TCP checksum offloading.  It implies that either
>   * the packet 'b' is marked for IPv4 or IPv6 checksum offloading. */
>  static inline void
> @@ -1061,13 +1086,31 @@ dp_packet_hwol_set_tcp_seg(struct dp_packet *b)
>      *dp_packet_ol_flags_ptr(b) |= DP_PACKET_OL_TX_TCP_SEG;
>  }
>  
> +/* Returns 'true' is the IP has good integrity and the

s/is the IP has/if the IP header has a/

> + * checksum in it is complete. */
>  static inline bool
> -dp_packet_ip_checksum_valid(const struct dp_packet *p)
> +dp_packet_ip_checksum_good(const struct dp_packet *p)
>  {
>      return (*dp_packet_ol_flags_ptr(p) & DP_PACKET_OL_RX_IP_CKSUM_MASK) ==
>              DP_PACKET_OL_RX_IP_CKSUM_GOOD;
>  }
>  
> +/* Marks packet 'p' with good IPv4 checksum. */
> +static inline void
> +dp_packet_ol_set_ip_csum_good(const struct dp_packet *p)
> +{
> +    *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_RX_IP_CKSUM_BAD;
> +    *dp_packet_ol_flags_ptr(p) |= DP_PACKET_OL_RX_IP_CKSUM_GOOD;
> +}
> +
> +/* Resets IP good checksum flag in packet 'p'. */
> +static inline void
> +dp_packet_ol_reset_ip_csum_good(const struct dp_packet *p)

s/const//

> +{
> +    *dp_packet_ol_flags_ptr(p) &= ~DP_PACKET_OL_RX_IP_CKSUM_GOOD;
> +}
> +
> +/* Marks packet 'p' with bad IPv4 checksum. */
>  static inline bool
>  dp_packet_ip_checksum_bad(const struct dp_packet *p)
>  {
> @@ -1075,8 +1118,19 @@ dp_packet_ip_checksum_bad(const struct dp_packet *p)
>              DP_PACKET_OL_RX_IP_CKSUM_BAD;
>  }
>  
> +/* Calculate and set the IPv4 header checksum in packet 'p'. */
> +static inline void
> +dp_packet_ip_set_header_csum(struct dp_packet *p)
> +{
> +    struct ip_header *ip = dp_packet_l3(p);
> +
> +    ovs_assert(ip);
> +    ip->ip_csum = 0;
> +    ip->ip_csum = csum(ip, sizeof *ip);
> +}
> +
>  static inline bool
> -dp_packet_l4_checksum_valid(const struct dp_packet *p)
> +dp_packet_l4_checksum_good(const struct dp_packet *p)
>  {
>      return (*dp_packet_ol_flags_ptr(p) & DP_PACKET_OL_RX_L4_CKSUM_MASK) ==
>              DP_PACKET_OL_RX_L4_CKSUM_GOOD;
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index ef50e62b8..ca5f10e4f 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -7861,6 +7861,10 @@ dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, 
> struct dp_packet *packet_,
>          ds_destroy(&ds);
>      }
>  
> +    /* The packet is going to be encapsulated and sent to
> +     * the controller. */

This comment is not true.  We're just going to trigger an upcall.
It's not decided which actions are going to be executed.
Or we can be here because we are executing an explicit ACTION upcall,
in which case we maybe sending a packet to a controller, yes.


> +    dp_packet_ol_send_prepare(packet_, 0);

So, should this be moved to dp_execute_userspace_action() ?

> +
>      return dp->upcall_cb(packet_, flow, ufid, pmd->core_id, type, userdata,
>                           actions, wc, put_actions, dp->upcall_aux);
>  }
> diff --git a/lib/flow.c b/lib/flow.c
> index c3a3aa3ce..6c8bf7fc0 100644
> --- a/lib/flow.c
> +++ b/lib/flow.c
> @@ -907,6 +907,10 @@ miniflow_extract(struct dp_packet *packet, struct 
> miniflow *dst)
>          nw_proto = nh->ip_proto;
>          nw_frag = ipv4_get_nw_frag(nh);
>          data_pull(&data, &size, ip_len);
> +        dp_packet_hwol_set_tx_ipv4(packet);
> +        if (dp_packet_ip_checksum_good(packet)) {
> +            dp_packet_hwol_set_tx_ip_csum(packet);
> +        }

Don't we need the same changes for avx512 version?
Somewhere in the nighborhood of mfex_ipv4_set_l2_pad_size().


>      } else if (dl_type == htons(ETH_TYPE_IPV6)) {
>          const struct ovs_16aligned_ip6_hdr *nh = data;
>          ovs_be32 tc_flow;
> @@ -920,6 +924,7 @@ miniflow_extract(struct dp_packet *packet, struct 
> miniflow *dst)
>          }
>          data_pull(&data, &size, sizeof *nh);
>  
> +        dp_packet_hwol_set_tx_ipv6(packet);

Same here.

>          plen = ntohs(nh->ip6_plen);
>          dp_packet_set_l2_pad_size(packet, size - plen);
>          size = plen;   /* Never pull padding. */
> @@ -3221,9 +3226,12 @@ packet_expand(struct dp_packet *p, const struct flow 
> *flow, size_t size)
>              struct ip_header *ip = dp_packet_l3(p);
>  
>              ip->ip_tot_len = htons(p->l4_ofs - p->l3_ofs + l4_len);
> -            ip->ip_csum = 0;
> -            ip->ip_csum = csum(ip, sizeof *ip);
> -
> +            if (dp_packet_hwol_tx_ip_csum(p)) {
> +                dp_packet_ol_reset_ip_csum_good(p);
> +            } else {
> +                dp_packet_ip_set_header_csum(p);
> +                dp_packet_ol_set_ip_csum_good(p);
> +            }
>              pseudo_hdr_csum = packet_csum_pseudoheader(ip);
>          } else { /* ETH_TYPE_IPV6 */
>              struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(p);
> @@ -3313,6 +3321,7 @@ flow_compose(struct dp_packet *p, const struct flow 
> *flow,
>          /* Checksum has already been zeroed by put_zeros call. */
>          ip->ip_csum = csum(ip, sizeof *ip);
>  
> +        dp_packet_ol_set_ip_csum_good(p);
>          pseudo_hdr_csum = packet_csum_pseudoheader(ip);
>          flow_compose_l4_csum(p, flow, pseudo_hdr_csum);
>      } else if (flow->dl_type == htons(ETH_TYPE_IPV6)) {
> diff --git a/lib/ipf.c b/lib/ipf.c
> index d45266374..18c98576a 100644
> --- a/lib/ipf.c
> +++ b/lib/ipf.c
> @@ -433,7 +433,9 @@ ipf_reassemble_v4_frags(struct ipf_list *ipf_list)
>      len += rest_len;
>      l3 = dp_packet_l3(pkt);
>      ovs_be16 new_ip_frag_off = l3->ip_frag_off & ~htons(IP_MORE_FRAGMENTS);
> -    if (!dp_packet_hwol_is_ipv4(pkt)) {
> +    if (dp_packet_hwol_tx_ip_csum(pkt)) {
> +        dp_packet_ol_reset_ip_csum_good(pkt);
> +    } else {
>          l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_frag_off,
>                                      new_ip_frag_off);
>          l3->ip_csum = recalc_csum16(l3->ip_csum, l3->ip_tot_len, htons(len));
> @@ -608,8 +610,7 @@ ipf_is_valid_v4_frag(struct ipf *ipf, struct dp_packet 
> *pkt)
>          goto invalid_pkt;
>      }
>  
> -    if (OVS_UNLIKELY(!dp_packet_ip_checksum_valid(pkt)
> -                     && !dp_packet_hwol_is_ipv4(pkt)
> +    if (OVS_UNLIKELY(!dp_packet_ip_checksum_good(pkt)
>                       && csum(l3, ip_hdr_len) != 0)) {
>          COVERAGE_INC(ipf_l3csum_err);
>          goto invalid_pkt;
> @@ -1185,7 +1186,9 @@ ipf_post_execute_reass_pkts(struct ipf *ipf,
>                      } else {
>                          struct ip_header *l3_frag = 
> dp_packet_l3(frag_i->pkt);
>                          struct ip_header *l3_reass = dp_packet_l3(pkt);
> -                        if (!dp_packet_hwol_is_ipv4(frag_i->pkt)) {
> +                        if (dp_packet_hwol_tx_ip_csum(frag_i->pkt)) {
> +                            dp_packet_ol_reset_ip_csum_good(frag_i->pkt);
> +                        } else {
>                              ovs_be32 reass_ip =
>                                  get_16aligned_be32(&l3_reass->ip_src);
>                              ovs_be32 frag_ip =
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
> index e4b3465e0..4ccc56b0e 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -406,8 +406,9 @@ enum dpdk_hw_ol_features {
>      NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
>      NETDEV_RX_HW_CRC_STRIP = 1 << 1,
>      NETDEV_RX_HW_SCATTER = 1 << 2,
> -    NETDEV_TX_TSO_OFFLOAD = 1 << 3,
> -    NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 4,
> +    NETDEV_TX_IPV4_CKSUM_OFFLOAD = 1 << 3,
> +    NETDEV_TX_TSO_OFFLOAD = 1 << 4,
> +    NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 5,
>  };
>  
>  /*
> @@ -1035,6 +1036,10 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int 
> n_rxq, int n_txq)
>          conf.rxmode.offloads |= RTE_ETH_RX_OFFLOAD_KEEP_CRC;
>      }
>  
> +    if (dev->hw_ol_features & NETDEV_TX_IPV4_CKSUM_OFFLOAD) {
> +        conf.txmode.offloads |= RTE_ETH_TX_OFFLOAD_IPV4_CKSUM;
> +    }
> +
>      if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
>          conf.txmode.offloads |= DPDK_TX_TSO_OFFLOAD_FLAGS;
>          if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) {
> @@ -1175,6 +1180,12 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
>          dev->hw_ol_features &= ~NETDEV_RX_HW_SCATTER;
>      }
>  
> +    if (info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_IPV4_CKSUM) {
> +        dev->hw_ol_features |= NETDEV_TX_IPV4_CKSUM_OFFLOAD;
> +    } else {
> +        dev->hw_ol_features &= ~NETDEV_TX_IPV4_CKSUM_OFFLOAD;
> +    }
> +
>      dev->hw_ol_features &= ~NETDEV_TX_TSO_OFFLOAD;
>      if (userspace_tso_enabled()) {
>          if ((info.tx_offload_capa & tx_tso_offload_capa)
> @@ -1744,16 +1755,12 @@ netdev_dpdk_get_config(const struct netdev *netdev, 
> struct smap *args)
>                          dev->requested_txq_size);
>          smap_add_format(args, "configured_txq_descriptors", "%d",
>                          dev->txq_size);
> -        if (dev->hw_ol_features & NETDEV_RX_CHECKSUM_OFFLOAD) {
> -            smap_add(args, "rx_csum_offload", "true");
> -        } else {
> -            smap_add(args, "rx_csum_offload", "false");
> -        }
> -        if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
> -            smap_add(args, "tx_tso_offload", "true");
> -        } else {
> -            smap_add(args, "tx_tso_offload", "false");
> -        }
> +#define HWOL_SMAP_ADD(FIELD, FLAG) \
> +        smap_add(args, FIELD, dev->hw_ol_features & FLAG ? "true" : "false");
> +        HWOL_SMAP_ADD("rx_csum_offload", NETDEV_RX_CHECKSUM_OFFLOAD);
> +        HWOL_SMAP_ADD("tx_ip_csum_offload", NETDEV_TX_IPV4_CKSUM_OFFLOAD);
> +        HWOL_SMAP_ADD("tx_tso_offload", NETDEV_TX_TSO_OFFLOAD);
> +#undef HWOL_SMAP_ADD

This isn't a config, this is status.  If we're going to report the
status in the generic way, we don't need to report these here at all.

>          smap_add(args, "lsc_interrupt_mode",
>                   dev->lsc_interrupt_mode ? "true" : "false");
>  
> @@ -2196,13 +2203,16 @@ netdev_dpdk_prep_hwol_packet(struct netdev_dpdk *dev, 
> struct rte_mbuf *mbuf)
>  {
>      struct dp_packet *pkt = CONTAINER_OF(mbuf, struct dp_packet, mbuf);
>  
> -    if (mbuf->ol_flags & RTE_MBUF_F_TX_L4_MASK) {
> -        mbuf->l2_len = (char *)dp_packet_l3(pkt) - (char 
> *)dp_packet_eth(pkt);
> -        mbuf->l3_len = (char *)dp_packet_l4(pkt) - (char *)dp_packet_l3(pkt);
> -        mbuf->outer_l2_len = 0;
> -        mbuf->outer_l3_len = 0;
> +    if (!(mbuf->ol_flags & (RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_L4_MASK
> +                            | RTE_MBUF_F_TX_TCP_SEG))) {
> +        return true;
>      }
>  
> +    mbuf->l2_len = (char *) dp_packet_l3(pkt) - (char *) dp_packet_eth(pkt);
> +    mbuf->l3_len = (char *) dp_packet_l4(pkt) - (char *) dp_packet_l3(pkt);
> +    mbuf->outer_l2_len = 0;
> +    mbuf->outer_l3_len = 0;
> +
>      if (mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
>          struct tcp_header *th = dp_packet_l4(pkt);
>  
> @@ -2261,13 +2271,11 @@ netdev_dpdk_eth_tx_burst(struct netdev_dpdk *dev, int 
> qid,
>      uint32_t nb_tx = 0;
>      uint16_t nb_tx_prep = cnt;
>  
> -    if (userspace_tso_enabled()) {
> -        nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
> -        if (nb_tx_prep != cnt) {
> -            VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
> -                         "Only %u/%u are valid: %s", dev->up.name, 
> nb_tx_prep,
> -                         cnt, rte_strerror(rte_errno));
> -        }
> +    nb_tx_prep = rte_eth_tx_prepare(dev->port_id, qid, pkts, cnt);
> +    if (nb_tx_prep != cnt) {
> +        VLOG_WARN_RL(&rl, "%s: Output batch contains invalid packets. "
> +                     "Only %u/%u are valid: %s", dev->up.name, nb_tx_prep,

netdev_get_name(&dev->up)

> +                     cnt, rte_strerror(rte_errno));
>      }
>  
>      while (nb_tx != nb_tx_prep) {
> @@ -2707,12 +2715,10 @@ dpdk_copy_dp_packet_to_mbuf(struct rte_mempool *mp, 
> struct dp_packet *pkt_orig)
>      memcpy(&pkt_dest->l2_pad_size, &pkt_orig->l2_pad_size,
>             sizeof(struct dp_packet) - offsetof(struct dp_packet, 
> l2_pad_size));
>  
> -    if (mbuf_dest->ol_flags & RTE_MBUF_F_TX_L4_MASK) {
> -        mbuf_dest->l2_len = (char *)dp_packet_l3(pkt_dest)
> -                                - (char *)dp_packet_eth(pkt_dest);
> -        mbuf_dest->l3_len = (char *)dp_packet_l4(pkt_dest)
> -                                - (char *) dp_packet_l3(pkt_dest);
> -    }
> +    mbuf_dest->l2_len = (char *) dp_packet_l3(pkt_dest)
> +                            - (char *) dp_packet_eth(pkt_dest);
> +    mbuf_dest->l3_len = (char *) dp_packet_l4(pkt_dest)
> +                            - (char *) dp_packet_l3(pkt_dest);

Can some of these pointers be NULL here?

>  
>      return pkt_dest;
>  }
> @@ -2769,11 +2775,9 @@ netdev_dpdk_common_send(struct netdev *netdev, struct 
> dp_packet_batch *batch,
>      pkt_cnt = cnt;
>  
>      /* Prepare each mbuf for hardware offloading. */
> -    if (userspace_tso_enabled()) {
> -        cnt = netdev_dpdk_prep_hwol_batch(dev, pkts, pkt_cnt);
> -        stats->tx_invalid_hwol_drops += pkt_cnt - cnt;
> -        pkt_cnt = cnt;
> -    }
> +    cnt = netdev_dpdk_prep_hwol_batch(dev, pkts, pkt_cnt);
> +    stats->tx_invalid_hwol_drops += pkt_cnt - cnt;
> +    pkt_cnt = cnt;
>  
>      /* Apply Quality of Service policy. */
>      cnt = netdev_dpdk_qos_run(dev, pkts, pkt_cnt, true);
> @@ -4988,6 +4992,13 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
>      }
>  
>      err = dpdk_eth_dev_init(dev);
> +
> +    if (dev->hw_ol_features & NETDEV_TX_IPV4_CKSUM_OFFLOAD) {
> +        netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
> +    } else {
> +        netdev->ol_flags &= ~NETDEV_TX_OFFLOAD_IPV4_CKSUM;
> +    }
> +
>      if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
>          netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
>          netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM;
> diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
> index 72cb95471..db07ee46c 100644
> --- a/lib/netdev-dummy.c
> +++ b/lib/netdev-dummy.c
> @@ -148,6 +148,11 @@ struct netdev_dummy {
>      int requested_n_txq OVS_GUARDED;
>      int requested_n_rxq OVS_GUARDED;
>      int requested_numa_id OVS_GUARDED;
> +
> +    /* Enable netdev IP csum offload. */
> +    bool ol_ip_csum OVS_GUARDED;
> +    /* Flag RX packet with good csum. */
> +    bool ol_ip_csum_set_good OVS_GUARDED;
>  };
>  
>  /* Max 'recv_queue_len' in struct netdev_dummy. */
> @@ -910,6 +915,13 @@ netdev_dummy_set_config(struct netdev *netdev_, const 
> struct smap *args,
>          }
>      }
>  
> +    netdev->ol_ip_csum_set_good = smap_get_bool(args, "ol_ip_csum_set_good",
> +                                                false);
> +    netdev->ol_ip_csum = smap_get_bool(args, "ol_ip_csum", false);
> +    if (netdev->ol_ip_csum) {
> +        netdev_->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM;
> +    }
> +
>      netdev_change_seq_changed(netdev_);
>  
>      /* 'dummy-pmd' specific config. */
> @@ -1088,6 +1100,10 @@ netdev_dummy_rxq_recv(struct netdev_rxq *rxq_, struct 
> dp_packet_batch *batch,
>      netdev->rxq_stats[rxq_->queue_id].bytes += dp_packet_size(packet);
>      netdev->custom_stats[0].value++;
>      netdev->custom_stats[1].value++;
> +    if (netdev->ol_ip_csum_set_good) {
> +        /* The netdev hardware sets the flag when the packet has good csum. 
> */
> +        dp_packet_ol_set_ip_csum_good(packet);
> +    }
>      ovs_mutex_unlock(&netdev->mutex);
>  
>      dp_packet_batch_init_packet(batch, packet);
> @@ -1170,6 +1186,13 @@ netdev_dummy_send(struct netdev *netdev, int qid,
>          }
>  
>          ovs_mutex_lock(&dev->mutex);
> +        if (dp_packet_hwol_tx_ip_csum(packet)) {
> +            if (!dp_packet_ip_checksum_good(packet)) {
> +                dp_packet_ip_set_header_csum(packet);
> +                dp_packet_ol_set_ip_csum_good(packet);
> +            }
> +        }
> +
>          dev->stats.tx_packets++;
>          dev->txq_stats[qid].packets++;
>          dev->stats.tx_bytes += size;
> diff --git a/lib/netdev-native-tnl.c b/lib/netdev-native-tnl.c
> index b89dfdd52..754e2d78d 100644
> --- a/lib/netdev-native-tnl.c
> +++ b/lib/netdev-native-tnl.c
> @@ -88,7 +88,10 @@ netdev_tnl_ip_extract_tnl_md(struct dp_packet *packet, 
> struct flow_tnl *tnl,
>  
>          ovs_be32 ip_src, ip_dst;
>  
> -        if (OVS_UNLIKELY(!dp_packet_ip_checksum_valid(packet))) {
> +        /* A packet coming from a network device might have the
> +         * csum already checked. In this case, skip the check. */
> +        if (OVS_UNLIKELY(!dp_packet_ip_checksum_good(packet))
> +            && !dp_packet_hwol_tx_ip_csum(packet)) {
>              if (csum(ip, IP_IHL(ip->ip_ihl_ver) * 4)) {
>                  VLOG_WARN_RL(&err_rl, "ip packet has invalid checksum");
>                  return NULL;
> @@ -142,7 +145,8 @@ netdev_tnl_ip_extract_tnl_md(struct dp_packet *packet, 
> struct flow_tnl *tnl,
>   *
>   * This function sets the IP header's ip_tot_len field (which should be 
> zeroed
>   * as part of 'header') and puts its value into '*ip_tot_size' as well.  Also
> - * updates IP header checksum, as well as the l3 and l4 offsets in 'packet'.
> + * updates IP header checksum if not offloaded, as well as the l3 and l4
> + * offsets in 'packet'.

in the 'packet'.

>   *
>   * Return pointer to the L4 header added to 'packet'. */
>  void *
> @@ -167,11 +171,16 @@ netdev_tnl_push_ip_header(struct dp_packet *packet,
>          *ip_tot_size -= IPV6_HEADER_LEN;
>          ip6->ip6_plen = htons(*ip_tot_size);
>          packet->l4_ofs = dp_packet_size(packet) - *ip_tot_size;
> +        dp_packet_hwol_set_tx_ipv6(packet);
> +        dp_packet_ol_reset_ip_csum_good(packet);
>          return ip6 + 1;
>      } else {
>          ip = netdev_tnl_ip_hdr(eth);
>          ip->ip_tot_len = htons(*ip_tot_size);
> -        ip->ip_csum = recalc_csum16(ip->ip_csum, 0, ip->ip_tot_len);
> +        /* Postpone checksum to when the packet is pushed to the port. */
> +        dp_packet_hwol_set_tx_ipv4(packet);
> +        dp_packet_hwol_set_tx_ip_csum(packet);
> +        dp_packet_ol_reset_ip_csum_good(packet);
>          *ip_tot_size -= IP_HEADER_LEN;
>          packet->l4_ofs = dp_packet_size(packet) - *ip_tot_size;
>          return ip + 1;
> @@ -190,7 +199,7 @@ udp_extract_tnl_md(struct dp_packet *packet, struct 
> flow_tnl *tnl,
>      }
>  
>      if (udp->udp_csum) {
> -        if (OVS_UNLIKELY(!dp_packet_l4_checksum_valid(packet))) {
> +        if (OVS_UNLIKELY(!dp_packet_l4_checksum_good(packet))) {
>              uint32_t csum;
>              if (netdev_tnl_is_header_ipv6(dp_packet_data(packet))) {
>                  csum = packet_csum_pseudoheader6(dp_packet_l3(packet));
> @@ -297,8 +306,8 @@ netdev_tnl_ip_build_header(struct ovs_action_push_tnl 
> *data,
>          ip->ip_frag_off = (params->flow->tunnel.flags & 
> FLOW_TNL_F_DONT_FRAGMENT) ?
>                            htons(IP_DF) : 0;
>  
> -        /* Checksum has already been zeroed by eth_build_header. */
> -        ip->ip_csum = csum(ip, sizeof *ip);
> +        /* The checksum will be calculated when the headers are pushed
> +         * to the packet if offloading is not enabled. */
>  
>          data->header_len += IP_HEADER_LEN;
>          return ip + 1;
> diff --git a/lib/netdev.c b/lib/netdev.c
> index bd068507a..6d3f678f0 100644
> --- a/lib/netdev.c
> +++ b/lib/netdev.c
> @@ -807,6 +807,14 @@ netdev_send_prepare_packet(const uint64_t netdev_flags,
>              return false;
>      }
>  
> +    /* Packet with IP csum offloading enabled was received with verified 
> csum.

This is not always true here.  Should this comment be more conditional?

> +     * Leave the IP csum offloading enabled even with good checksum to the
> +     * netdev to decide what would be the best to do.
> +     * Provide a software fallback in case the device doesn't support IP csum
> +     * offloading. Note: Encapsulated packet must have the inner IP header
> +     * csum already calculated. */
> +    dp_packet_ol_send_prepare(packet, netdev_flags);
> +
>      l4_mask = dp_packet_hwol_l4_mask(packet);
>      if (l4_mask) {
>          if (dp_packet_hwol_l4_is_tcp(packet)) {
> @@ -974,7 +982,15 @@ netdev_push_header(const struct netdev *netdev,
>                           "not supported: packet dropped",
>                           netdev_get_name(netdev));
>          } else {
> +            /* The packet is going to be encapsulated and there is
> +             * no support yet for inner network header csum offloading. */
> +            if (dp_packet_hwol_tx_ip_csum(packet)
> +                && !dp_packet_ip_checksum_good(packet)) {
> +                dp_packet_ip_set_header_csum(packet);
> +            }
> +
>              netdev->netdev_class->push_header(netdev, packet, data);
> +
>              pkt_metadata_init(&packet->md, data->out_port);
>              dp_packet_batch_refill(batch, packet, i);
>          }
> diff --git a/lib/odp-execute.c b/lib/odp-execute.c
> index 5cf6fbec0..37f0f717a 100644
> --- a/lib/odp-execute.c
> +++ b/lib/odp-execute.c
> @@ -169,9 +169,14 @@ odp_set_ipv4(struct dp_packet *packet, const struct 
> ovs_key_ipv4 *key,
>          new_tos = key->ipv4_tos | (nh->ip_tos & ~mask->ipv4_tos);
>  
>          if (nh->ip_tos != new_tos) {
> -            nh->ip_csum = recalc_csum16(nh->ip_csum,
> -                                        htons((uint16_t) nh->ip_tos),
> -                                        htons((uint16_t) new_tos));
> +            if (dp_packet_hwol_tx_ip_csum(packet)) {
> +                dp_packet_ol_reset_ip_csum_good(packet);
> +            } else {
> +                nh->ip_csum = recalc_csum16(nh->ip_csum,
> +                                            htons((uint16_t) nh->ip_tos),
> +                                            htons((uint16_t) new_tos));
> +            }
> +
>              nh->ip_tos = new_tos;
>          }
>      }
> @@ -180,8 +185,14 @@ odp_set_ipv4(struct dp_packet *packet, const struct 
> ovs_key_ipv4 *key,
>          new_ttl = key->ipv4_ttl | (nh->ip_ttl & ~mask->ipv4_ttl);
>  
>          if (OVS_LIKELY(nh->ip_ttl != new_ttl)) {
> -            nh->ip_csum = recalc_csum16(nh->ip_csum, htons(nh->ip_ttl << 8),
> -                                        htons(new_ttl << 8));
> +            if (dp_packet_hwol_tx_ip_csum(packet)) {
> +                dp_packet_ol_reset_ip_csum_good(packet);
> +            } else {
> +                nh->ip_csum = recalc_csum16(nh->ip_csum,
> +                                            htons(nh->ip_ttl << 8),
> +                                            htons(new_ttl << 8));
> +            }
> +
>              nh->ip_ttl = new_ttl;
>          }
>      }
> diff --git a/lib/packets.c b/lib/packets.c
> index 1dcd4a6fc..a1d668190 100644
> --- a/lib/packets.c
> +++ b/lib/packets.c
> @@ -1144,7 +1144,12 @@ packet_set_ipv4_addr(struct dp_packet *packet,
>              }
>          }
>      }
> -    nh->ip_csum = recalc_csum32(nh->ip_csum, old_addr, new_addr);
> +
> +    if (dp_packet_hwol_tx_ip_csum(packet)) {
> +        dp_packet_ol_reset_ip_csum_good(packet);
> +    } else {
> +        nh->ip_csum = recalc_csum32(nh->ip_csum, old_addr, new_addr);
> +    }

I suppose, action_avx512_ipv4_set_addrs() needs some relevant changes as well.

>      put_16aligned_be32(addr, new_addr);
>  }
>  
> @@ -1311,16 +1316,26 @@ packet_set_ipv4(struct dp_packet *packet, ovs_be32 
> src, ovs_be32 dst,
>      if (nh->ip_tos != tos) {
>          uint8_t *field = &nh->ip_tos;
>  
> -        nh->ip_csum = recalc_csum16(nh->ip_csum, htons((uint16_t) *field),
> -                                    htons((uint16_t) tos));
> +        if (dp_packet_hwol_tx_ip_csum(packet)) {
> +            dp_packet_ol_reset_ip_csum_good(packet);
> +        } else {
> +            nh->ip_csum = recalc_csum16(nh->ip_csum, htons((uint16_t) 
> *field),
> +                                        htons((uint16_t) tos));
> +        }
> +
>          *field = tos;
>      }
>  
>      if (nh->ip_ttl != ttl) {
>          uint8_t *field = &nh->ip_ttl;
>  
> -        nh->ip_csum = recalc_csum16(nh->ip_csum, htons(*field << 8),
> -                                    htons(ttl << 8));
> +        if (dp_packet_hwol_tx_ip_csum(packet)) {
> +            dp_packet_ol_reset_ip_csum_good(packet);
> +        } else {
> +            nh->ip_csum = recalc_csum16(nh->ip_csum, htons(*field << 8),
> +                                        htons(ttl << 8));
> +        }
> +
>          *field = ttl;
>      }
>  }
> @@ -1931,8 +1946,13 @@ IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6)
>  
>          tos |= IP_ECN_CE;
>          if (nh->ip_tos != tos) {
> -            nh->ip_csum = recalc_csum16(nh->ip_csum, htons(nh->ip_tos),
> -                                        htons((uint16_t) tos));
> +            if (dp_packet_hwol_tx_ip_csum(pkt)) {
> +                dp_packet_ol_reset_ip_csum_good(pkt);
> +            } else {
> +                nh->ip_csum = recalc_csum16(nh->ip_csum, htons(nh->ip_tos),
> +                                            htons((uint16_t) tos));
> +            }
> +
>              nh->ip_tos = tos;
>          }
>      }
> diff --git a/tests/automake.mk b/tests/automake.mk
> index d509cf935..29f639bd8 100644
> --- a/tests/automake.mk
> +++ b/tests/automake.mk
> @@ -160,6 +160,7 @@ SYSTEM_KMOD_TESTSUITE_AT = \
>  SYSTEM_USERSPACE_TESTSUITE_AT = \
>       tests/system-userspace-testsuite.at \
>       tests/system-userspace-macros.at \
> +     tests/system-userspace-offload.at \
>       tests/system-userspace-packet-type-aware.at \
>       tests/system-route.at
>  
> diff --git a/tests/system-userspace-offload.at 
> b/tests/system-userspace-offload.at
> new file mode 100644
> index 000000000..4d7f3ef89
> --- /dev/null
> +++ b/tests/system-userspace-offload.at
> @@ -0,0 +1,79 @@
> +AT_BANNER([userspace offload])
> +
> +AT_SETUP([userspace offload - ip csum offload])
> +OVS_VSWITCHD_START(
> +  [add-br br1 -- set bridge br1 datapath-type=dummy -- \
> +   add-port br1 p1 -- \
> +       set Interface p1 type=dummy -- \
> +   add-port br1 p2 -- \
> +       set Interface p2 type=dummy --])

These tests are using dummy datapath and dummy ports, they should be
part of a normal 'make check' testsuite.

> +
> +# Modify the ip_dst addr to force changing the IP csum.
> +AT_CHECK([ovs-ofctl add-flow br1 
> in_port=p1,actions=mod_nw_dst:192.168.1.1,output:p2])
> +
> +# Check if no offload remains ok.
> +AT_CHECK([ovs-vsctl set Interface p2 options:tx_pcap=p2.pcap])
> +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum=false])
> +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum_set_good=false])
> +AT_CHECK([ovs-appctl netdev-dummy/receive p1 \
> +0a8f394fe0738abf7e2f058408004500003433e0400040068f8fc0a87b02c0a87b01d4781451a962ad5417ed297b801000e547fd00000101080a2524d2345c7fe1c4
> +])
> +
> +# Checksum should change to 0x990 with ip_dst changed to 192.168.1.1
> +# by the datapath while processing the packet.
> +AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1])
> +AT_CHECK([tail -n 1 p2.pcap.txt], [0], [dnl
> +0a8f394fe0738abf7e2f058408004500003433e0400040060990c0a87b02c0a80101d4781451a962ad5417ed297b801000e5c1fd00000101080a2524d2345c7fe1c4
> +])
> +
> +# Check if packets entering the datapath with csum offloading
> +# enabled gets the csum updated properly by egress handling
> +# in the datapath and not by the netdev.
> +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum=false])
> +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum_set_good=true])
> +AT_CHECK([ovs-appctl netdev-dummy/receive p1 \
> +0a8f394fe0738abf7e2f058408004500003433e0400040068f8fc0a87b02c0a87b01d4781451a962ad5417ed297b801000e547fd00000101080a2524d2345c7fe1c4
> +])
> +AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1])
> +AT_CHECK([tail -n 1 p2.pcap.txt], [0], [dnl
> +0a8f394fe0738abf7e2f058408004500003433e0400040060990c0a87b02c0a80101d4781451a962ad5417ed297b801000e5c1fd00000101080a2524d2345c7fe1c4
> +])
> +
> +# Check if packets entering the datapath with csum offloading
> +# enabled gets the csum updated properly by netdev and not
> +# by the datapath.
> +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum=true])
> +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum_set_good=true])
> +AT_CHECK([ovs-appctl netdev-dummy/receive p1 \
> +0a8f394fe0738abf7e2f058408004500003433e0400040068f8fc0a87b02c0a87b01d4781451a962ad5417ed297b801000e547fd00000101080a2524d2345c7fe1c4
> +])
> +AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1])
> +AT_CHECK([tail -n 1 p2.pcap.txt], [0], [dnl
> +0a8f394fe0738abf7e2f058408004500003433e0400040060990c0a87b02c0a80101d4781451a962ad5417ed297b801000e5c1fd00000101080a2524d2345c7fe1c4
> +])
> +
> +# Push a packet with bad csum and offloading disabled to check
> +# if the datapath updates the csum, but does not fix the issue.
> +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum=false])
> +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum_set_good=false])
> +AT_CHECK([ovs-appctl netdev-dummy/receive p1 \
> +0a8f394fe0738abf7e2f058408004500003433e0400040068f03c0a87b02c0a87b01d4781451a962ad5417ed297b801000e547fd00000101080a2524d2345c7fe1c4
> +])
> +AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1])
> +AT_CHECK([tail -n 1 p2.pcap.txt], [0], [dnl
> +0a8f394fe0738abf7e2f058408004500003433e0400040060904c0a87b02c0a80101d4781451a962ad5417ed297b801000e5c1fd00000101080a2524d2345c7fe1c4
> +])
> +
> +# Push a packet with bad csum and offloading enabled to check
> +# if the driver updates and fixes the csum.
> +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum=true])
> +AT_CHECK([ovs-vsctl set Interface p1 options:ol_ip_csum_set_good=true])
> +AT_CHECK([ovs-appctl netdev-dummy/receive p1 \
> +0a8f394fe0738abf7e2f058408004500003433e0400040068f03c0a87b02c0a87b01d4781451a962ad5417ed297b801000e547fd00000101080a2524d2345c7fe1c4
> +])
> +AT_CHECK([ovs-pcap p2.pcap > p2.pcap.txt 2>&1])
> +AT_CHECK([tail -n 1 p2.pcap.txt], [0], [dnl
> +0a8f394fe0738abf7e2f058408004500003433e0400040060990c0a87b02c0a80101d4781451a962ad5417ed297b801000e5c1fd00000101080a2524d2345c7fe1c4
> +])
> +OVS_VSWITCHD_STOP
> +AT_CLEANUP
> diff --git a/tests/system-userspace-testsuite.at 
> b/tests/system-userspace-testsuite.at
> index 2e9659a67..1021b4ad4 100644
> --- a/tests/system-userspace-testsuite.at
> +++ b/tests/system-userspace-testsuite.at
> @@ -25,5 +25,6 @@ m4_include([tests/system-common-macros.at])
>  m4_include([tests/system-traffic.at])
>  m4_include([tests/system-layer3-tunnels.at])
>  m4_include([tests/system-interface.at])
> +m4_include([tests/system-userspace-offload.at])
>  m4_include([tests/system-userspace-packet-type-aware.at])
>  m4_include([tests/system-route.at])


_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Reply via email to