Optimize tunneling performance in the userspace datapath by offloading rx
checksum validation of tunnel packets to the NIC when the NIC supports it.

This patch improves bidirectional VxLAN tunneling performance by 8% and
decapsulation performance by 24%. However, it introduces a 1% performance drop
in the PHY-PHY case due to the overhead of checking the invalid-checksum flags
reported by the NIC.

Signed-off-by: Sugesh Chandran <sugesh.chand...@intel.com>
---
 lib/dpif-netdev.c     | 40 ++++++++++++++++++++++++++++++++--------
 lib/netdev-dpdk.c     | 39 +++++++++++++++++++++++++--------------
 lib/netdev-provider.h |  3 +++
 lib/netdev-vport.c    | 25 +++++++++++++++----------
 lib/netdev.c          |  1 +
 lib/netdev.h          |  5 +++++
 lib/packets.h         |  1 +
 7 files changed, 82 insertions(+), 32 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 2870951..5de5c6a 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -70,6 +70,7 @@
 #include "util.h"
 
 #include "openvswitch/vlog.h"
+#include "netdev-provider.h"
 
 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
 
@@ -479,7 +480,8 @@ static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
                                       const struct nlattr *actions,
                                       size_t actions_len);
 static void dp_netdev_input(struct dp_netdev_pmd_thread *,
-                            struct dp_packet **, int cnt, odp_port_t port_no);
+                                      struct dp_packet **, int cnt,
+                                      struct dp_netdev_port *port);
 static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
                                   struct dp_packet **, int cnt);
 
@@ -2572,7 +2574,7 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
         *recirc_depth_get() = 0;
 
         cycles_count_start(pmd);
-        dp_netdev_input(pmd, packets, cnt, port->port_no);
+        dp_netdev_input(pmd, packets, cnt, port);
         cycles_count_end(pmd, PMD_CYCLES_PROCESSING);
     } else if (error != EAGAIN && error != EOPNOTSUPP) {
         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
@@ -3394,6 +3396,20 @@ dp_netdev_queue_batches(struct dp_packet *pkt,
     packet_batch_update(batch, pkt, mf);
 }
 
+/* Returns true and marks 'packet->md' if the NIC flagged no bad checksum. */
+static inline bool
+is_checksum_valid(struct dp_packet *packet) {
+#ifdef DPDK_NETDEV
+    if (packet->mbuf.ol_flags & (PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD)) {
+        return false;
+    }
+    packet->md.ol_flags = NETDEV_RX_CHECKSUM_OFFLOAD;
+    return true;
+#else
+    return false;
+#endif
+}
+
 /* Try to process all ('cnt') the 'packets' using only the exact match cache
  * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
  * miniflow is copied into 'keys' and the packet pointer is moved at the
@@ -3409,11 +3425,13 @@ static inline size_t
 emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet **packets,
                size_t cnt, struct netdev_flow_key *keys,
                struct packet_batch batches[], size_t *n_batches,
-               bool md_is_valid, odp_port_t port_no)
+               bool md_is_valid, struct dp_netdev_port *port)
 {
     struct emc_cache *flow_cache = &pmd->flow_cache;
     struct netdev_flow_key *key = &keys[0];
     size_t i, n_missed = 0, n_dropped = 0;
+    bool rx_cksm_ol_enable = port && (port->netdev->ol_flags &
+                                    NETDEV_RX_CHECKSUM_OFFLOAD);
 
     for (i = 0; i < cnt; i++) {
         struct dp_netdev_flow *flow;
@@ -3425,6 +3443,12 @@ emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet **packets,
             continue;
         }
 
+        if (OVS_UNLIKELY(rx_cksm_ol_enable && !is_checksum_valid(packet))) {
+            dp_packet_delete(packet);
+            n_dropped++;
+            continue;
+        }
+
         if (i != cnt - 1) {
             /* Prefetch next packet data and metadata. */
             OVS_PREFETCH(dp_packet_data(packets[i+1]));
@@ -3432,7 +3456,7 @@ emc_processing(struct dp_netdev_pmd_thread *pmd, struct dp_packet **packets,
         }
 
         if (!md_is_valid) {
-            pkt_metadata_init(&packet->md, port_no);
+            pkt_metadata_init(&packet->md, port->port_no);
         }
         miniflow_extract(packet, &key->mf);
         key->len = 0; /* Not computed yet. */
@@ -3606,7 +3630,7 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
 static void
 dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
                   struct dp_packet **packets, int cnt,
-                  bool md_is_valid, odp_port_t port_no)
+                  bool md_is_valid, struct dp_netdev_port *port)
 {
 #if !defined(__CHECKER__) && !defined(_WIN32)
     const size_t PKT_ARRAY_SIZE = cnt;
@@ -3621,7 +3645,7 @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
 
     n_batches = 0;
     newcnt = emc_processing(pmd, packets, cnt, keys, batches, &n_batches,
-                            md_is_valid, port_no);
+                            md_is_valid, port);
     if (OVS_UNLIKELY(newcnt)) {
         fast_path_processing(pmd, packets, newcnt, keys, batches, &n_batches);
     }
@@ -3638,9 +3662,9 @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
 static void
 dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
                 struct dp_packet **packets, int cnt,
-                odp_port_t port_no)
+                struct dp_netdev_port *port)
 {
-     dp_netdev_input__(pmd, packets, cnt, false, port_no);
+     dp_netdev_input__(pmd, packets, cnt, false, port);
 }
 
 static void
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index c7217ea..1c5cd56 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -527,7 +527,8 @@ dpdk_watchdog(void *dummy OVS_UNUSED)
 }
 
 static int
-dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
+dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq,
+                         const struct rte_eth_conf *new_port_conf)
 {
     int diag = 0;
     int i;
@@ -542,7 +543,8 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
             VLOG_INFO("Retrying setup with (rxq:%d txq:%d)", n_rxq, n_txq);
         }
 
-        diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq, &port_conf);
+        diag = rte_eth_dev_configure(dev->port_id, n_rxq, n_txq,
+                                     new_port_conf);
         if (diag) {
             break;
         }
@@ -589,16 +591,24 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq)
     return diag;
 }
 
+static struct netdev_dpdk *
+netdev_dpdk_cast(const struct netdev *netdev)
+{
+    return CONTAINER_OF(netdev, struct netdev_dpdk, up);
+}
 
 static int
-dpdk_eth_dev_init(struct netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex)
+dpdk_eth_dev_init(struct netdev *netdev) OVS_REQUIRES(dpdk_mutex)
 {
     struct rte_pktmbuf_pool_private *mbp_priv;
     struct rte_eth_dev_info info;
     struct ether_addr eth_addr;
+    struct rte_eth_conf new_port_conf;
     int diag;
     int n_rxq, n_txq;
+    struct netdev_dpdk *dev;
 
+    dev = netdev_dpdk_cast(netdev);
     if (dev->port_id < 0 || dev->port_id >= rte_eth_dev_count()) {
         return ENODEV;
     }
@@ -608,7 +618,14 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex)
     n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
     n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
 
-    diag = dpdk_eth_dev_queue_setup(dev, n_rxq, n_txq);
+    new_port_conf = port_conf;
+    /* Enable rx checksum offload if it is supported by the NIC. */
+    if (info.rx_offload_capa & DEV_RX_OFFLOAD_IPV4_CKSUM) {
+        netdev->ol_flags |= NETDEV_RX_CHECKSUM_OFFLOAD;
+        new_port_conf.rxmode.hw_ip_checksum = 1;
+    }
+
+    diag = dpdk_eth_dev_queue_setup(dev, n_rxq, n_txq, &new_port_conf);
     if (diag) {
         VLOG_ERR("Interface %s(rxq:%d txq:%d) configure error: %s",
                  dev->up.name, n_rxq, n_txq, rte_strerror(-diag));
@@ -640,12 +657,6 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) OVS_REQUIRES(dpdk_mutex)
     return 0;
 }
 
-static struct netdev_dpdk *
-netdev_dpdk_cast(const struct netdev *netdev)
-{
-    return CONTAINER_OF(netdev, struct netdev_dpdk, up);
-}
-
 static struct netdev *
 netdev_dpdk_alloc(void)
 {
@@ -728,7 +739,7 @@ netdev_dpdk_init(struct netdev *netdev, unsigned int port_no,
 
     if (type == DPDK_DEV_ETH) {
         netdev_dpdk_alloc_txq(dev, NR_QUEUE);
-        err = dpdk_eth_dev_init(dev);
+        err = dpdk_eth_dev_init(netdev);
         if (err) {
             goto unlock;
         }
@@ -966,7 +977,7 @@ netdev_dpdk_set_multiq(struct netdev *netdev, unsigned int n_txq,
     netdev->n_rxq = n_rxq;
 
     rte_free(dev->tx_q);
-    err = dpdk_eth_dev_init(dev);
+    err = dpdk_eth_dev_init(netdev);
     netdev_dpdk_alloc_txq(dev, dev->real_n_txq);
     if (err) {
         /* If there has been an error, it means that the requested queues
@@ -1612,13 +1623,13 @@ netdev_dpdk_set_mtu(const struct netdev *netdev, int mtu)
     dev->mtu = mtu;
     dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
 
-    err = dpdk_eth_dev_init(dev);
+    err = dpdk_eth_dev_init(netdev);
     if (err) {
         dpdk_mp_put(mp);
         dev->mtu = old_mtu;
         dev->dpdk_mp = old_mp;
         dev->max_packet_len = MTU_TO_FRAME_LEN(dev->mtu);
-        dpdk_eth_dev_init(dev);
+        dpdk_eth_dev_init(netdev);
         goto out;
     }
 
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
index cda25eb..4b87d17 100644
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -60,6 +60,9 @@ struct netdev {
     /* Number of rx queues requested by user. */
     int requested_n_rxq;
     int ref_cnt;                        /* Times this devices was opened. */
+    /* Offload features supported by this netdev: a bitwise OR of
+     * 'enum netdev_ofld_features' values. */
+    uint32_t ol_flags;
     struct shash_node *node;            /* Pointer to element in global map. */
     struct ovs_list saved_flags_list; /* Contains "struct netdev_saved_flags". */
 };
diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
index e398562..9f1f5f8 100644
--- a/lib/netdev-vport.c
+++ b/lib/netdev-vport.c
@@ -874,7 +874,7 @@ ipv6_hdr(void *eth)
 
 static void *
 ip_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
-                  unsigned int *hlen)
+                  unsigned int *hlen, uint32_t rx_cksm_ol_flag)
 {
     void *nh;
     struct ip_header *ip;
@@ -900,7 +900,7 @@ ip_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
 
         ovs_be32 ip_src, ip_dst;
 
-        if (csum(ip, IP_IHL(ip->ip_ihl_ver) * 4)) {
+        if (!rx_cksm_ol_flag && csum(ip, IP_IHL(ip->ip_ihl_ver) * 4)) {
             VLOG_WARN_RL(&err_rl, "ip packet has invalid checksum");
             return NULL;
         }
@@ -990,16 +990,16 @@ push_ip_header(struct dp_packet *packet,
 
 static void *
 udp_extract_tnl_md(struct dp_packet *packet, struct flow_tnl *tnl,
-                   unsigned int *hlen)
+                   unsigned int *hlen, uint32_t rx_cksm_ol_flag)
 {
     struct udp_header *udp;
 
-    udp = ip_extract_tnl_md(packet, tnl, hlen);
+    udp = ip_extract_tnl_md(packet, tnl, hlen, rx_cksm_ol_flag);
     if (!udp) {
         return NULL;
     }
 
-    if (udp->udp_csum) {
+    if (!rx_cksm_ol_flag && udp->udp_csum) {
         uint32_t csum;
         if (is_header_ipv6(dp_packet_data(packet))) {
             csum = packet_csum_pseudoheader6(dp_packet_l3(packet));
@@ -1121,14 +1121,14 @@ gre_header_len(ovs_be16 flags)
 
 static int
 parse_gre_header(struct dp_packet *packet,
-                 struct flow_tnl *tnl)
+                 struct flow_tnl *tnl, uint32_t rx_cksm_ol_flag)
 {
     const struct gre_base_hdr *greh;
     ovs_16aligned_be32 *options;
     int hlen;
     unsigned int ulen;
 
-    greh = ip_extract_tnl_md(packet, tnl, &ulen);
+    greh = ip_extract_tnl_md(packet, tnl, &ulen, rx_cksm_ol_flag);
     if (!greh) {
         return -EINVAL;
     }
@@ -1180,6 +1180,8 @@ pkt_metadata_init_tnl(struct pkt_metadata *md)
      * are before this and as long as they are empty, the options won't
      * be looked at. */
     memset(md, 0, offsetof(struct pkt_metadata, tunnel.metadata.opts));
+    /* Reset the offload flags for recirculation. */
+    md->ol_flags = 0;
 }
 
 static int
@@ -1187,6 +1189,7 @@ netdev_gre_pop_header(struct dp_packet *packet)
 {
     struct pkt_metadata *md = &packet->md;
     struct flow_tnl *tnl = &md->tunnel;
+    uint32_t rx_cksm_ol_flag = md->ol_flags & NETDEV_RX_CHECKSUM_OFFLOAD;
     int hlen = sizeof(struct eth_header) + 4;
 
     hlen += is_header_ipv6(dp_packet_data(packet)) ?
@@ -1197,7 +1200,7 @@ netdev_gre_pop_header(struct dp_packet *packet)
         return EINVAL;
     }
 
-    hlen = parse_gre_header(packet, tnl);
+    hlen = parse_gre_header(packet, tnl, rx_cksm_ol_flag);
     if (hlen < 0) {
         return -hlen;
     }
@@ -1284,6 +1287,7 @@ netdev_vxlan_pop_header(struct dp_packet *packet)
 {
     struct pkt_metadata *md = &packet->md;
     struct flow_tnl *tnl = &md->tunnel;
+    uint32_t rx_cksm_ol_flag = md->ol_flags & NETDEV_RX_CHECKSUM_OFFLOAD;
     struct vxlanhdr *vxh;
     unsigned int hlen;
 
@@ -1292,7 +1296,7 @@ netdev_vxlan_pop_header(struct dp_packet *packet)
         return EINVAL;
     }
 
-    vxh = udp_extract_tnl_md(packet, tnl, &hlen);
+    vxh = udp_extract_tnl_md(packet, tnl, &hlen, rx_cksm_ol_flag);
     if (!vxh) {
         return EINVAL;
     }
@@ -1342,6 +1346,7 @@ netdev_geneve_pop_header(struct dp_packet *packet)
 {
     struct pkt_metadata *md = &packet->md;
     struct flow_tnl *tnl = &md->tunnel;
+    uint32_t rx_cksm_ol_flag = md->ol_flags & NETDEV_RX_CHECKSUM_OFFLOAD;
     struct genevehdr *gnh;
     unsigned int hlen, opts_len, ulen;
 
@@ -1352,7 +1357,7 @@ netdev_geneve_pop_header(struct dp_packet *packet)
         return EINVAL;
     }
 
-    gnh = udp_extract_tnl_md(packet, tnl, &ulen);
+    gnh = udp_extract_tnl_md(packet, tnl, &ulen, rx_cksm_ol_flag);
     if (!gnh) {
         return EINVAL;
     }
diff --git a/lib/netdev.c b/lib/netdev.c
index 3e50694..e4578e0 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -383,6 +383,7 @@ netdev_open(const char *name, const char *type, struct netdev **netdevp)
                 netdev->name = xstrdup(name);
                 netdev->change_seq = 1;
                 netdev->node = shash_add(&netdev_shash, name, netdev);
+                netdev->ol_flags = 0;
 
                 /* By default enable one tx and rx queue per netdev. */
                 netdev->n_txq = netdev->netdev_class->send ? 1 : 0;
diff --git a/lib/netdev.h b/lib/netdev.h
index 05968b2..ece4d77 100644
--- a/lib/netdev.h
+++ b/lib/netdev.h
@@ -241,6 +241,11 @@ enum netdev_flags {
     NETDEV_LOOPBACK = 0x0004    /* This is a loopback device. */
 };
 
+/* Flags for supported netdev offload features (see netdev 'ol_flags'). */
+enum netdev_ofld_features {
+    NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0
+};
+
 int netdev_get_flags(const struct netdev *, enum netdev_flags *);
 int netdev_set_flags(struct netdev *, enum netdev_flags,
                      struct netdev_saved_flags **);
diff --git a/lib/packets.h b/lib/packets.h
index a8ea24b..4d32bfd 100644
--- a/lib/packets.h
+++ b/lib/packets.h
@@ -145,6 +145,7 @@ struct pkt_metadata {
     uint32_t ct_mark;           /* Connection mark. */
     ovs_u128 ct_label;          /* Connection label. */
     union flow_in_port in_port; /* Input port. */
+    uint32_t ol_flags;
     struct flow_tnl tunnel;     /* Encapsulating tunnel parameters. Note that
                                  * if 'ip_dst' == 0, the rest of the fields may
                                  * be uninitialized. */
-- 
2.5.0

_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

Reply via email to