Ingress scheduling configuration is given effect by way of Flow Director
filters. A small subset of the ingress scheduling possible is
implemented in this patch.

Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
---
 include/openvswitch/ofp-parse.h |   3 +
 lib/dpif-netdev.c               |   1 +
 lib/netdev-dpdk.c               | 181 ++++++++++++++++++++++++++++++++++++++--
 vswitchd/bridge.c               |   2 +
 4 files changed, 180 insertions(+), 7 deletions(-)

diff --git a/include/openvswitch/ofp-parse.h b/include/openvswitch/ofp-parse.h
index 013a8f3..1991694 100644
--- a/include/openvswitch/ofp-parse.h
+++ b/include/openvswitch/ofp-parse.h
@@ -41,6 +41,9 @@ struct ofputil_table_mod;
 struct ofputil_bundle_msg;
 struct ofputil_tlv_table_mod;
 struct simap;
+struct tun_table;
+struct flow_wildcards;
+struct ofputil_port_map;
 enum ofputil_protocol;
 
 char *parse_ofp_str(struct ofputil_flow_mod *, int command, const char *str_,
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index e2cd931..9ce3456 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -44,6 +44,7 @@
 #include "dp-packet.h"
 #include "dpif.h"
 #include "dpif-provider.h"
+#include "netdev-provider.h"
 #include "dummy.h"
 #include "fat-rwlock.h"
 #include "flow.h"
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 1ffedd4..d9aab4f 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -49,6 +49,8 @@
 #include "openvswitch/list.h"
 #include "openvswitch/ofp-print.h"
 #include "openvswitch/vlog.h"
+#include "openvswitch/ofp-parse.h"
+#include "openvswitch/ofp-util.h"
 #include "ovs-numa.h"
 #include "ovs-thread.h"
 #include "ovs-rcu.h"
@@ -175,6 +177,10 @@ static const struct rte_eth_conf port_conf = {
     .txmode = {
         .mq_mode = ETH_MQ_TX_NONE,
     },
+    .fdir_conf = {
+        .mode = RTE_FDIR_MODE_PERFECT,
+    },
+
 };
 
 /*
@@ -351,6 +357,11 @@ enum dpdk_hw_ol_features {
     NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
 };
 
+union ingress_filter {
+    struct rte_eth_ethertype_filter ethertype;
+    struct rte_eth_fdir_filter fdir;
+};
+
 struct netdev_dpdk {
     struct netdev up;
     dpdk_port_t port_id;
@@ -390,8 +401,11 @@ struct netdev_dpdk {
     /* If true, device was attached by rte_eth_dev_attach(). */
     bool attached;
 
-    /* Ingress Scheduling config */
+    /* Ingress Scheduling config & state. */
     char *ingress_sched_str;
+    bool ingress_sched_changed;
+    enum rte_filter_type ingress_filter_type;
+    union ingress_filter ingress_filter;
 
     /* In dpdk_list. */
     struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex);
@@ -674,6 +688,22 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int 
n_rxq, int n_txq)
     int i;
     struct rte_eth_conf conf = port_conf;
 
+    /* Ingress scheduling uses an extra RX queue reserved for prioritized
+       frames but using RSS will 'pollute' that queue by distributing
+       non-priority packets on to it.  Therefore RSS is not compatible with
+       ingress scheduling.  Also requesting anything other than two queues
+       with ingress scheduling is wasteful as without RSS only two queues are
+       required.  Rather than force n_rxq to two here and overriding the
+       configured value it's less surprising for the user to issue a warning
+       (see dpdk_apply_ingress_scheduling()) and not enable ingress shceduling.
+    */
+    if (dev->ingress_sched_str && n_rxq == 2) {
+        conf.rxmode.mq_mode = ETH_MQ_RX_NONE;
+    }
+    else {
+        conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
+    }
+
     /* For some NICs (e.g. Niantic), scatter_rx mode needs to be explicitly
      * enabled. */
     if (dev->mtu > ETHER_MTU) {
@@ -757,6 +787,128 @@ dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) 
OVS_REQUIRES(dev->mutex)
     }
 }
 
+static void
+dpdk_apply_ingress_scheduling(struct netdev_dpdk *dev, int n_rxq)
+{
+    if (!dev->ingress_sched_str) {
+        return;
+    }
+
+    /* See dpdk_eth_dev_queue_setup for n_rxq requirement */
+    if (n_rxq != 2) {
+        VLOG_ERR("Interface %s: Ingress scheduling config ignored; " \
+                 "Requires n_rxq==2.", dev->up.name);
+        return;
+    }
+
+    int priority_q_id = n_rxq-1;
+    char *key, *val, *str, *iter;
+
+    ovs_be32 ip_src, ip_dst;
+    ip_src = ip_dst = 0;
+
+    uint16_t eth_type, port_src, port_dst;
+    eth_type = port_src = port_dst = 0;
+    uint8_t ip_proto = 0;
+    int diag = 0;
+
+    /* delete any existing filter */
+    if (dev->ingress_filter_type == RTE_ETH_FILTER_FDIR) {
+        diag = rte_eth_dev_filter_ctrl(dev->port_id, RTE_ETH_FILTER_FDIR,
+            RTE_ETH_FILTER_DELETE, &dev->ingress_filter.fdir);
+    } else if (dev->ingress_filter_type == RTE_ETH_FILTER_ETHERTYPE) {
+        diag = rte_eth_dev_filter_ctrl(dev->port_id, RTE_ETH_FILTER_ETHERTYPE,
+            RTE_ETH_FILTER_DELETE, &dev->ingress_filter.ethertype);
+    }
+
+    /* str_to_x on error returns malloc'd str we'll need to free */
+    char *mallocd_str;
+    /* Parse the configuration into local vars */
+    iter = str = xstrdup(dev->ingress_sched_str);
+    while (ofputil_parse_key_value (&iter, &key, &val)) {
+        if (strcmp(key, "nw_src") == 0 || strcmp(key, "ip_src") == 0) {
+            mallocd_str = str_to_ip(val, &ip_src);
+        } else if (strcmp(key, "nw_dst") == 0 || strcmp(key, "ip_dst") == 0) {
+            mallocd_str = str_to_ip(val, &ip_dst);
+        } else if (strcmp(key, "dl_type") == 0 ||
+                   strcmp(key, "eth_type") == 0) {
+            mallocd_str = str_to_u16(val, "eth_type/dl_type", &eth_type);
+        } else if (strcmp(key, "tcp_src") == 0 ||
+                  strcmp(key, "tp_src") == 0 ||
+                  strcmp(key, "udp_src") == 0) {
+            mallocd_str = str_to_u16(val, "tcp/udp_src", &port_src);
+        } else if (strcmp(key, "tcp_dst") == 0 ||
+                   strcmp(key, "tp_dst") == 0 ||
+                   strcmp(key, "udp_dst") == 0) {
+            mallocd_str = str_to_u16(val, "tcp/udp_dst", &port_dst);
+        } else if (strcmp(key, "ip") == 0) {
+            eth_type = ETH_P_IP;
+        } else if (strcmp(key, "udp") == 0) {
+            eth_type = ETH_P_IP;
+            ip_proto = IPPROTO_UDP;
+        } else if (strcmp(key, "tcp") == 0) {
+            eth_type = ETH_P_IP;
+            ip_proto = IPPROTO_TCP;
+        } else {
+            VLOG_WARN("Ignoring unsupported ingress scheduling field '%s'", \
+                      key);
+        }
+        if (mallocd_str) {
+            VLOG_ERR ("%s", mallocd_str);
+            free(mallocd_str);
+            mallocd_str = NULL;
+        }
+    }
+    free (str);
+
+    /* Set the filters */
+    if (eth_type && ip_src && ip_dst && port_src && port_dst && ip_proto) {
+        struct rte_eth_fdir_filter entry = { 0 };
+        if (ip_proto == IPPROTO_TCP) {
+            entry.input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_TCP;
+        } else {
+            entry.input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_UDP;
+        }
+        entry.input.flow.udp4_flow.ip.src_ip = ip_src;
+        entry.input.flow.udp4_flow.ip.dst_ip = ip_dst;
+        entry.input.flow.udp4_flow.src_port = htons(port_src);
+        entry.input.flow.udp4_flow.dst_port = htons(port_dst);
+        entry.action.rx_queue = priority_q_id;
+        entry.action.behavior = RTE_ETH_FDIR_ACCEPT;
+        entry.action.report_status = RTE_ETH_FDIR_REPORT_ID;
+        diag = rte_eth_dev_filter_ctrl(dev->port_id,
+                RTE_ETH_FILTER_FDIR, RTE_ETH_FILTER_ADD, &entry);
+        dev->ingress_filter_type = RTE_ETH_FILTER_FDIR;
+        dev->ingress_filter.fdir = entry;
+    }
+    else if (eth_type && !ip_src && !ip_dst && !port_src
+             && !port_dst && !ip_proto) {
+        struct rte_eth_ethertype_filter entry = {0};
+        memset (&entry, 0, sizeof entry);
+        entry.ether_type = eth_type;
+        entry.flags = 0;
+        entry.queue = priority_q_id;
+        diag = rte_eth_dev_filter_ctrl(dev->port_id,
+                RTE_ETH_FILTER_ETHERTYPE, RTE_ETH_FILTER_ADD, &entry);
+        dev->ingress_filter.ethertype = entry;
+        dev->ingress_filter_type = RTE_ETH_FILTER_ETHERTYPE;
+    }
+    else {
+        VLOG_ERR("Unsupported ingress scheduling match-field combination.");
+        dev->ingress_filter_type = RTE_ETH_FILTER_NONE;
+        return;
+    }
+
+    if (diag) {
+        dev->ingress_filter_type = RTE_ETH_FILTER_NONE;
+        VLOG_ERR("Failed to add ingress scheduling filter.");
+    }
+    else {
+        /* Mark the appropriate q as prioritized */
+        dev->up.priority_rxq = priority_q_id;
+    }
+}
+
 static int
 dpdk_eth_dev_init(struct netdev_dpdk *dev)
     OVS_REQUIRES(dev->mutex)
@@ -791,6 +943,8 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
         return -diag;
     }
 
+    dpdk_apply_ingress_scheduling(dev, n_rxq);
+
     diag = rte_eth_dev_start(dev->port_id);
     if (diag) {
         VLOG_ERR("Interface %s start error: %s", dev->up.name,
@@ -897,6 +1051,9 @@ common_construct(struct netdev *netdev, dpdk_port_t 
port_no,
     dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE;
     dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE;
 
+    dev->ingress_sched_str = NULL;
+    dev->ingress_sched_changed = false;
+    dev->ingress_filter_type = RTE_ETH_FILTER_NONE;
     /* Initialize the flow control to NULL */
     memset(&dev->fc_conf, 0, sizeof dev->fc_conf);
 
@@ -2015,11 +2172,20 @@ netdev_dpdk_set_ingress_sched(struct netdev *netdev,
 {
     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
 
-    free(dev->ingress_sched_str);
-    if (ingress_sched_str) {
-        dev->ingress_sched_str = xstrdup(ingress_sched_str);
+    if ((ingress_sched_str && dev->ingress_sched_str &&
+        strcmp(ingress_sched_str, dev->ingress_sched_str) == 0) ||
+        (!ingress_sched_str && !dev->ingress_sched_str)) {
+        /* no-op; new cfg == old cfg or else both are NULL */
+        return 0;
+    } else {
+        /* free the old, copy in the new */
+        free(dev->ingress_sched_str);
+        if (ingress_sched_str) {
+            dev->ingress_sched_str = xstrdup(ingress_sched_str);
+        }
+        dev->ingress_sched_changed = true;
+        netdev_request_reconfigure(netdev);
     }
-
     return 0;
 }
 
@@ -3185,12 +3351,13 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
         && dev->mtu == dev->requested_mtu
         && dev->rxq_size == dev->requested_rxq_size
         && dev->txq_size == dev->requested_txq_size
-        && dev->socket_id == dev->requested_socket_id) {
+        && dev->socket_id == dev->requested_socket_id
+        && !dev->ingress_sched_changed) {
         /* Reconfiguration is unnecessary */
-
         goto out;
     }
 
+    dev->ingress_sched_changed = false;
     rte_eth_dev_stop(dev->port_id);
 
     if (dev->mtu != dev->requested_mtu
diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c
index 93c91f6..077a6d6 100644
--- a/vswitchd/bridge.c
+++ b/vswitchd/bridge.c
@@ -831,6 +831,8 @@ bridge_delete_or_reconfigure_ports(struct bridge *br)
         }
 
         iface_set_netdev_mtu(iface->cfg, iface->netdev);
+        netdev_set_ingress_sched(iface->netdev,
+            smap_get(&iface->cfg->other_config, "ingress_sched"));
 
         /* If the requested OpenFlow port for 'iface' changed, and it's not
          * already the correct port, then we might want to temporarily delete
-- 
2.7.4

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to