Hi Mark,

Thanks for the very useful review comments.

I'll send a rev'd patch set shortly.

/Billy

> -----Original Message-----
> From: Kavanagh, Mark B
> Sent: Friday, August 4, 2017 4:14 PM
> To: O Mahony, Billy <billy.o.mah...@intel.com>; d...@openvswitch.org
> Subject: RE: [ovs-dev] [PATCH 2/4] netdev-dpdk: Apply ingress_sched config
> to dpdk phy ports
> 
> >From: ovs-dev-boun...@openvswitch.org
> >[mailto:ovs-dev-boun...@openvswitch.org]
> >On Behalf Of Billy O'Mahony
> >Sent: Thursday, July 20, 2017 5:11 PM
> >To: d...@openvswitch.org
> >Subject: [ovs-dev] [PATCH 2/4] netdev-dpdk: Apply ingress_sched config
> >to dpdk phy ports
> >
> >Ingress scheduling configuration is given effect by way of Flow
> >Director filters. A small subset of the ingress scheduling possible is
> >implemented in this patch.
> >
> >Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com>
> 
> Hi Billy,
> 
> As a general comment, this patch doesn't apply to HEAD of master, so please
> rebase as part of rework.
> 
> Review comments inline.
> 
> Thanks,
> Mark
> 
> 
> >---
> > include/openvswitch/ofp-parse.h |   3 +
> > lib/dpif-netdev.c               |   1 +
> > lib/netdev-dpdk.c               | 167
> ++++++++++++++++++++++++++++++++++++++-
> >-
> > vswitchd/bridge.c               |   2 +
> > 4 files changed, 166 insertions(+), 7 deletions(-)
> >
> >diff --git a/include/openvswitch/ofp-parse.h
> >b/include/openvswitch/ofp-parse.h index fc5784e..08d6086 100644
> >--- a/include/openvswitch/ofp-parse.h
> >+++ b/include/openvswitch/ofp-parse.h
> >@@ -37,6 +37,9 @@ struct ofputil_table_mod;  struct ofputil_bundle_msg;
> >struct ofputil_tlv_table_mod;  struct simap;
> >+struct tun_table;
> >+struct flow_wildcards;
> >+struct ofputil_port_map;
> > enum ofputil_protocol;
> >
> > char *parse_ofp_str(struct ofputil_flow_mod *, int command, const char
> >*str_, diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index
> >47a9fa0..d35566f 100644
> >--- a/lib/dpif-netdev.c
> >+++ b/lib/dpif-netdev.c
> >@@ -44,6 +44,7 @@
> > #include "dp-packet.h"
> > #include "dpif.h"
> > #include "dpif-provider.h"
> >+#include "netdev-provider.h"
> 
> If a setter function for modifying netdev->ingress_sched_str is
> implemented, there is no need to include netdev-provider.h
> 
> > #include "dummy.h"
> > #include "fat-rwlock.h"
> > #include "flow.h"
> >diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
> >e74c50f..e393abf 100644
> >--- a/lib/netdev-dpdk.c
> >+++ b/lib/netdev-dpdk.c
> >@@ -33,6 +33,8 @@
> > #include <rte_meter.h>
> > #include <rte_virtio_net.h>
> >
> >+#include <openvswitch/ofp-parse.h>
> >+#include <openvswitch/ofp-util.h>
> 
> Move these include below, with the other openvswitch include file.
> 
> > #include "dirs.h"
> > #include "dp-packet.h"
> > #include "dpdk.h"
> >@@ -169,6 +171,10 @@ static const struct rte_eth_conf port_conf = {
> >     .txmode = {
> >         .mq_mode = ETH_MQ_TX_NONE,
> >     },
> >+    .fdir_conf = {
> >+        .mode = RTE_FDIR_MODE_PERFECT,
> 
> As you mentioned in your cover letter, you've only tested on a Fortville NIC.
> How widely supported are the Flow Director features across DPDK-
> supported NICs?
[[BO'M]] I'm not sure. Probably many NICs have the capabilities required - but 
I don't know if the dpdk drivers expose it.
> 
> >+    },
> >+
> > };
> >
> > enum { DPDK_RING_SIZE = 256 };
> >@@ -330,6 +336,11 @@ enum dpdk_hw_ol_features {
> >     NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,  };
> >
> >+union ingress_filter {
> >+    struct rte_eth_ethertype_filter ethertype;
> >+    struct rte_eth_fdir_filter fdir;
> >+};
> >+
> > struct netdev_dpdk {
> >     struct netdev up;
> >     dpdk_port_t port_id;
> >@@ -369,8 +380,11 @@ struct netdev_dpdk {
> >     /* If true, device was attached by rte_eth_dev_attach(). */
> >     bool attached;
> >
> >-    /* Ingress Scheduling config */
> >+    /* Ingress Scheduling config & state. */
> >     char *ingress_sched_str;
> >+    bool ingress_sched_changed;
> >+    enum rte_filter_type ingress_filter_type;
> >+    union ingress_filter ingress_filter;
> >
> >     /* In dpdk_list. */
> >     struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex); @@ -653,6
> >+667,15 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int
> n_rxq,
> >int n_txq)
> >     int i;
> >     struct rte_eth_conf conf = port_conf;
> >
> >+    /* Ingress scheduling requires ETH_MQ_RX_NONE so limit it to when
> exactly
> >+     * two rxqs are defined. Otherwise MQ will not work as expected.
> >+ */
> 
> So if a user later enables multiq (i.e. > 2 rxqs), that implicitly disables 
> ingress
> scheduling?
[[BO'M]] Yes. RSS and Ingress Scheduling are incompatible - at least I couldn't 
figure out how to make them compatible. It could be setup so that RSS would be 
disabled but that would be favoring the existing common use case of the less 
common new use case.
> 
> In any event, the comment here needs to be much clearer, as this is the first
> mention within the code that ingress scheduling is limited to just 2 rxqs. Is
> that limitation just for the PoC?
[[BO'M]] Unless I can figure out how to do stop RSS polluting the priority 
queue.
> 
> It may also be useful to briefly explain why RSS prevents ingress scheduling.
[[BO'M]] I've expanded the comments to include an explanation of the issue.
> 
> >+    if (dev->ingress_sched_str && n_rxq == 2) {
> >+        conf.rxmode.mq_mode = ETH_MQ_RX_NONE;
> >+    }
> >+    else {
> >+        conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
> >+    }
> >+
> >     if (dev->mtu > ETHER_MTU) {
> >         conf.rxmode.jumbo_frame = 1;
> >         conf.rxmode.max_rx_pkt_len = dev->max_packet_len; @@ -730,6
> >+753,121 @@ dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev)
> >OVS_REQUIRES(dev->mutex)
> >     }
> > }
> >
> >+static void
> 
> Why no return value?
[[BO'M]] The caller has nothing to do whether the config works or fails. Which 
makes sense as Ingress scheduling is best-effort. So if the config fails to be 
applied for any reason the rest of dpdk_eth_dev_init should just continue.
> 
> >+dpdk_apply_ingress_scheduling(struct netdev_dpdk *dev, int n_rxq) {
> >+    if (!dev->ingress_sched_str) {
> >+        return;
> >+    }
> >+
> >+    if (n_rxq != 2) {
> 
> If n_rxq is always 2, then remove that parameter from this function, and
> perform the check (i.e. n_rxq ==2) in dpdk_eth_dev_init instead.
> 
> >+        VLOG_ERR("Interface %s: Ingress scheduling config ignored; " \
> >+                 "Requires n_rxq==2.", dev->up.name);
> >+    }
> >+
> >+    int priority_q_id = n_rxq-1;
> 
> So, this is always 1, right?
[[BO'M]] Yes. Your right that it's pointless to carry around n_rxq if the value 
must always be two. But I'm going to leave it as is for now in case I figure 
out how to get past that limitation.
> 
> >+    char *key, *val, *str, *iter;
> >+
> >+    ovs_be32 ip_src, ip_dst;
> >+    ip_src = ip_dst = 0;
> >+
> >+    uint16_t eth_type, port_src, port_dst;
> >+    eth_type = port_src = port_dst = 0;
> >+    uint8_t ip_proto = 0;
> >+    int diag = 0;
> >+
> >+    /* delete any existing filter */
> >+    if (dev->ingress_filter_type == RTE_ETH_FILTER_FDIR) {
> >+        diag = rte_eth_dev_filter_ctrl(dev->port_id, RTE_ETH_FILTER_FDIR,
> >+            RTE_ETH_FILTER_DELETE, &dev->ingress_filter.fdir);
> >+    } else if (dev->ingress_filter_type == RTE_ETH_FILTER_ETHERTYPE) {
> >+        diag = rte_eth_dev_filter_ctrl(dev->port_id,
> >RTE_ETH_FILTER_ETHERTYPE,
> >+            RTE_ETH_FILTER_DELETE, &dev->ingress_filter.ethertype);
> >+    }
> >+
> >+    char *mallocd_str; /* str_to_x returns malloc'd str we'll need to
> >+ free */
> 
> Move inline comment above the line. Also, malloc'd string is only returned in
> the event of error (otherwise NULL), so you might want to clarify the
> comment.
[[BO'M]] Done.
> 
> >+    /* Parse the configuration into local vars */
> >+    iter = str = xstrdup(dev->ingress_sched_str);
> 
> Why are two strings (i.e. iter and str) needed here?
[[BO'M]] str remembers the malloc'd str for freeing. Iter is modified by each 
call in the loop to ofputil_parse_key_value.
> 
> >+    while (ofputil_parse_key_value (&iter, &key, &val)) {
> >+        if (strcmp(key, "nw_src") == 0 || strcmp(key, "ip_src") == 0) {
> >+            mallocd_str = str_to_ip(val, &ip_src);
> >+        } else if (strcmp(key, "nw_dst") == 0 || strcmp(key, "ip_dst")
> >+ == 0)
> >{
> >+            mallocd_str = str_to_ip(val, &ip_dst);
> >+        } else if (strcmp(key, "dl_type") == 0 ||
> >+                   strcmp(key, "eth_type") == 0) {
> >+            mallocd_str = str_to_u16(val, "eth_type/dl_type", &eth_type);
> >+        } else if (strcmp(key, "tcp_src") == 0 ||
> >+                  strcmp(key, "tp_src") == 0 ||
> >+                  strcmp(key, "udp_src") == 0) {
> >+            mallocd_str = str_to_u16(val, "tcp/udp_src", &port_src);
> >+        } else if (strcmp(key, "tcp_dst") == 0 ||
> >+                   strcmp(key, "tp_dst") == 0 ||
> >+                   strcmp(key, "udp_dst") == 0) {
> >+            mallocd_str = str_to_u16(val, "tcp/udp_dst", &port_dst);
> >+        } else if (strcmp(key, "ip") == 0) {
> 
> Use defined macros for well-known values - examples below.
> 
> >+            eth_type = 0x0800;
> 
> ETH_P_IP <linux/if_ether.h>
[[BO'M]] Thanks for #include tips
> 
> >+        } else if (strcmp(key, "udp") == 0) {
> >+            eth_type = 0x0800;
> >+            ip_proto = 17;
> 
> IPPROTO_UDP <netinet/in.h>
> 
> >+        } else if (strcmp(key, "tcp") == 0) {
> >+            eth_type = 0x0800;
> >+            ip_proto = 6;
> 
> IPPROTO_TCP <netinet/in.h>
> 
> >+        } else {
> >+            VLOG_WARN("Ignoring unsupported ingress scheduling field '%s'", 
> >\
> >+                      key);
> >+        }
> >+        if (mallocd_str) {
> >+            VLOG_ERR ("%s", mallocd_str);
> >+            free(mallocd_str);
> >+            mallocd_str = NULL;
> >+        }
> >+    }
> >+    free (str);
> 
> Breaking out the parsing into a separate function would make this function
> more readable.
[[BO'M]] I'm not sure.  It would make this function shorter sure. But a reader 
has to jump back and forth between fn bodies to follow what is a sequential 
procedure.
> 
> >+
> >+    /* Set the filters */
> >+    if (eth_type && ip_src && ip_dst && port_src && port_dst && ip_proto)
> {
> >+        struct rte_eth_fdir_filter entry = { 0 };
> 
> This causes a compiler error for me.
[[BO'M]] What is the error, compiler version and build/config line?
I'm using gcc version 5.4.0 20160609 and ./boot.sh && ./configure 
--enable-Werror --with-dpdk=...dpdk/x86_64-native-linuxapp-gcc && make -j
> 
> >+        entry.input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_UDP;
> 
> Is only UDP flow supported for the POC?
[[BO'M]] Good catch - that's a bug actually!
> 
> >+        entry.input.flow.udp4_flow.ip.src_ip = ip_src;
> >+        entry.input.flow.udp4_flow.ip.dst_ip = ip_dst;
> >+        entry.input.flow.udp4_flow.src_port = htons(port_src);
> >+        entry.input.flow.udp4_flow.dst_port = htons(port_dst);
> >+        entry.action.rx_queue = priority_q_id;
> >+        entry.action.behavior = RTE_ETH_FDIR_ACCEPT;
> >+        entry.action.report_status = RTE_ETH_FDIR_REPORT_ID;
> >+        diag = rte_eth_dev_filter_ctrl(dev->port_id,
> >+                RTE_ETH_FILTER_FDIR, RTE_ETH_FILTER_ADD, &entry);
> >+        dev->ingress_filter_type = RTE_ETH_FILTER_FDIR;
> >+        dev->ingress_filter.fdir = entry;
> >+    }
> >+    else if (eth_type && !ip_src && !ip_dst && !port_src
> >+             && !port_dst && !ip_proto) {
> >+        struct rte_eth_ethertype_filter entry = {0};
> >+        memset (&entry, 0, sizeof entry);
> >+        entry.ether_type = eth_type;
> >+        entry.flags = 0;
> >+        entry.queue = priority_q_id;
> >+        diag = rte_eth_dev_filter_ctrl(dev->port_id,
> >+                RTE_ETH_FILTER_ETHERTYPE, RTE_ETH_FILTER_ADD, &entry);
> >+        dev->ingress_filter.ethertype = entry;
> >+        dev->ingress_filter_type = RTE_ETH_FILTER_ETHERTYPE;
> >+    }
> >+    else {
> >+        VLOG_ERR("Unsupported ingress scheduling match-field
> combination.");
> >+        dev->ingress_filter_type = RTE_ETH_FILTER_NONE;
> >+        return;
> >+    }
> >+
> >+    if (diag) {
> >+        dev->ingress_filter_type = RTE_ETH_FILTER_NONE;
> >+        VLOG_ERR("Failed to add ingress scheduling filter.");
> >+    }
> >+    else {
> >+        /* Mark the appropriate q as prioritized */
> >+        dev->up.priority_rxq = priority_q_id;
> >+    }
> >+}
> >+
> > static int
> > dpdk_eth_dev_init(struct netdev_dpdk *dev)
> >     OVS_REQUIRES(dev->mutex)
> >@@ -764,6 +902,8 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
> >         return -diag;
> >     }
> >
> >+    dpdk_apply_ingress_scheduling(dev, n_rxq);
> >+
> >     diag = rte_eth_dev_start(dev->port_id);
> >     if (diag) {
> >         VLOG_ERR("Interface %s start error: %s", dev->up.name, @@
> >-870,6 +1010,9 @@ common_construct(struct netdev *netdev,
> dpdk_port_t
> >port_no,
> >     dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE;
> >     dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE;
> >
> >+    dev->ingress_sched_str = NULL;
> >+    dev->ingress_sched_changed = false;
> >+    dev->ingress_filter_type = RTE_ETH_FILTER_NONE;
> >     /* Initialize the flow control to NULL */
> >     memset(&dev->fc_conf, 0, sizeof dev->fc_conf);
> >
> >@@ -1950,11 +2093,20 @@ netdev_dpdk_set_ingress_sched(struct netdev
> >*netdev,  {
> >     struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> >
> >-    free(dev->ingress_sched_str);
> >-    if (ingress_sched_str) {
> >-        dev->ingress_sched_str = xstrdup(ingress_sched_str);
> >+    if ((ingress_sched_str && dev->ingress_sched_str &&
> >+        strcmp(ingress_sched_str, dev->ingress_sched_str) == 0) ||
> >+        (!ingress_sched_str && !dev->ingress_sched_str)) {
> >+        /* no-op; new cfg == old cfg or else both are NULL */
> 
> Just one may be NULL, not necessarily both.
[[BO'M]] If just one is null it's treated as an update being required ie the 
else section
> 
> >+        return 0;
> >+    } else {
> >+        /* free the old, copy in the new */
> >+        free(dev->ingress_sched_str);
> >+        if (ingress_sched_str) {
> >+            dev->ingress_sched_str = xstrdup(ingress_sched_str);
> >+        }
> >+        dev->ingress_sched_changed = true;
> >+        netdev_request_reconfigure(netdev);
> 
> Why trigger a netdev_request_reconfigure here? The ingress_sched_str has
> already been set, so it doesn't make sense.

[[BO'M]] Short answer is that the reconfiguration won't work without this. When 
updated configuration from ovsdb is percolated across the various objects 
(bridges, ports, interfaces...) they each look at the new configuration and 
decide if the change to the config requires them to be stopped reconfigured and 
restarted.  It's the same for the requested_txq_size and friends in 
netdev_dpdk_set_config which also ends up calling netdev_request_reconfigure if 
the configuration has changed.

The ingress scheduling config is held in 'other_config' section of Interface 
table - the request_txq_size and friends come from the main config section 
which is passed to netdev_dpdk_set_config but the other_config section is not 
passed down to that function. So it's not available there. 

It would've been easier if the ingress scheduling was placed in the main 
Interface configuration section but it was proposed to be put in other_config 
section initially and I stuck with that.

> A solution may be to add another netdev member
> 'requested_ingress_sched_str', which is assigned the value of
> 'ingress_sched_str' in this function.
> Then, in netdev_dpdk_reconfigure, dev->ingress_sched_str = dev-
> >requested_ingress_sched_str.
> 
> >     }
> >-
> >     return 0;
> > }
> >
> >@@ -3112,12 +3264,13 @@ netdev_dpdk_reconfigure(struct netdev
> *netdev)
> >         && dev->mtu == dev->requested_mtu
> >         && dev->rxq_size == dev->requested_rxq_size
> >         && dev->txq_size == dev->requested_txq_size
> >-        && dev->socket_id == dev->requested_socket_id) {
> >+        && dev->socket_id == dev->requested_socket_id
> >+        && !dev->ingress_sched_changed) {
> 
> If you make the changes I've suggested above, the last line above should
> read 'dev->requested_ingress_sched_str == dev->ingress_sched_str'.
> 
> 
> >         /* Reconfiguration is unnecessary */
> >-
> >         goto out;
> >     }
> >
> >+    dev->ingress_sched_changed = false;
> >     rte_eth_dev_stop(dev->port_id);
> >
> >     if (dev->mtu != dev->requested_mtu diff --git a/vswitchd/bridge.c
> >b/vswitchd/bridge.c index 9113195..2c5dfd3 100644
> >--- a/vswitchd/bridge.c
> >+++ b/vswitchd/bridge.c
> >@@ -831,6 +831,8 @@ bridge_delete_or_reconfigure_ports(struct bridge
> *br)
> >         }
> >
> >         iface_set_netdev_mtu(iface->cfg, iface->netdev);
> >+        netdev_set_ingress_sched(iface->netdev,
> >+            smap_get(&iface->cfg->other_config, "ingress_sched"));
> >
> >         /* If the requested OpenFlow port for 'iface' changed, and it's not
> >          * already the correct port, then we might want to temporarily
> >delete
> >--
> >2.7.4
> >
> >_______________________________________________
> >dev mailing list
> >d...@openvswitch.org
> >https://mail.openvswitch.org/mailman/listinfo/ovs-dev
_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to