Ingress scheduling configuration is given effect by way of Flow Director filters. Only a small subset of the possible ingress scheduling configurations is implemented in this patch.
Signed-off-by: Billy O'Mahony <billy.o.mah...@intel.com> --- include/openvswitch/ofp-parse.h | 3 + lib/dpif-netdev.c | 1 + lib/netdev-dpdk.c | 181 ++++++++++++++++++++++++++++++++++++++-- vswitchd/bridge.c | 2 + 4 files changed, 180 insertions(+), 7 deletions(-) diff --git a/include/openvswitch/ofp-parse.h b/include/openvswitch/ofp-parse.h index 013a8f3..1991694 100644 --- a/include/openvswitch/ofp-parse.h +++ b/include/openvswitch/ofp-parse.h @@ -41,6 +41,9 @@ struct ofputil_table_mod; struct ofputil_bundle_msg; struct ofputil_tlv_table_mod; struct simap; +struct tun_table; +struct flow_wildcards; +struct ofputil_port_map; enum ofputil_protocol; char *parse_ofp_str(struct ofputil_flow_mod *, int command, const char *str_, diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index e2cd931..9ce3456 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -44,6 +44,7 @@ #include "dp-packet.h" #include "dpif.h" #include "dpif-provider.h" +#include "netdev-provider.h" #include "dummy.h" #include "fat-rwlock.h" #include "flow.h" diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 1ffedd4..d9aab4f 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -49,6 +49,8 @@ #include "openvswitch/list.h" #include "openvswitch/ofp-print.h" #include "openvswitch/vlog.h" +#include "openvswitch/ofp-parse.h" +#include "openvswitch/ofp-util.h" #include "ovs-numa.h" #include "ovs-thread.h" #include "ovs-rcu.h" @@ -175,6 +177,10 @@ static const struct rte_eth_conf port_conf = { .txmode = { .mq_mode = ETH_MQ_TX_NONE, }, + .fdir_conf = { + .mode = RTE_FDIR_MODE_PERFECT, + }, + }; /* @@ -351,6 +357,11 @@ enum dpdk_hw_ol_features { NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0, }; +union ingress_filter { + struct rte_eth_ethertype_filter ethertype; + struct rte_eth_fdir_filter fdir; +}; + struct netdev_dpdk { struct netdev up; dpdk_port_t port_id; @@ -390,8 +401,11 @@ struct netdev_dpdk { /* If true, device was attached by rte_eth_dev_attach(). 
*/ bool attached; - /* Ingress Scheduling config */ + /* Ingress Scheduling config & state. */ char *ingress_sched_str; + bool ingress_sched_changed; + enum rte_filter_type ingress_filter_type; + union ingress_filter ingress_filter; /* In dpdk_list. */ struct ovs_list list_node OVS_GUARDED_BY(dpdk_mutex); @@ -674,6 +688,22 @@ dpdk_eth_dev_queue_setup(struct netdev_dpdk *dev, int n_rxq, int n_txq) int i; struct rte_eth_conf conf = port_conf; + /* Ingress scheduling uses an extra RX queue reserved for prioritized + frames but using RSS will 'pollute' that queue by distributing + non-priority packets onto it. Therefore RSS is not compatible with + ingress scheduling. Also, requesting anything other than two queues + with ingress scheduling is wasteful as without RSS only two queues are + required. Rather than force n_rxq to two here and override the + configured value, it is less surprising to warn the user + (see dpdk_apply_ingress_scheduling()) and not enable ingress scheduling. + */ + if (dev->ingress_sched_str && n_rxq == 2) { + conf.rxmode.mq_mode = ETH_MQ_RX_NONE; + } + else { + conf.rxmode.mq_mode = ETH_MQ_RX_RSS; + } + + /* For some NICs (e.g. Niantic), scatter_rx mode needs to be explicitly * enabled. 
*/ if (dev->mtu > ETHER_MTU) { @@ -757,6 +787,128 @@ dpdk_eth_flow_ctrl_setup(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex) } } +static void +dpdk_apply_ingress_scheduling(struct netdev_dpdk *dev, int n_rxq) +{ + if (!dev->ingress_sched_str) { + return; + } + + /* See dpdk_eth_dev_queue_setup for n_rxq requirement */ + if (n_rxq != 2) { + VLOG_ERR("Interface %s: Ingress scheduling config ignored; " \ + "Requires n_rxq==2.", dev->up.name); + return; + } + + int priority_q_id = n_rxq-1; + char *key, *val, *str, *iter; + + ovs_be32 ip_src, ip_dst; + ip_src = ip_dst = 0; + + uint16_t eth_type, port_src, port_dst; + eth_type = port_src = port_dst = 0; + uint8_t ip_proto = 0; + int diag = 0; + + /* delete any existing filter */ + if (dev->ingress_filter_type == RTE_ETH_FILTER_FDIR) { + diag = rte_eth_dev_filter_ctrl(dev->port_id, RTE_ETH_FILTER_FDIR, + RTE_ETH_FILTER_DELETE, &dev->ingress_filter.fdir); + } else if (dev->ingress_filter_type == RTE_ETH_FILTER_ETHERTYPE) { + diag = rte_eth_dev_filter_ctrl(dev->port_id, RTE_ETH_FILTER_ETHERTYPE, + RTE_ETH_FILTER_DELETE, &dev->ingress_filter.ethertype); + } + + /* str_to_x on error returns malloc'd str we'll need to free. + Initialize to NULL: the 'ip'/'udp'/'tcp' and unknown-key branches + below do not assign it, so the check after the if-chain would + otherwise read an uninitialized pointer. */ + char *mallocd_str = NULL; + /* Parse the configuration into local vars */ + iter = str = xstrdup(dev->ingress_sched_str); + while (ofputil_parse_key_value (&iter, &key, &val)) { + if (strcmp(key, "nw_src") == 0 || strcmp(key, "ip_src") == 0) { + mallocd_str = str_to_ip(val, &ip_src); + } else if (strcmp(key, "nw_dst") == 0 || strcmp(key, "ip_dst") == 0) { + mallocd_str = str_to_ip(val, &ip_dst); + } else if (strcmp(key, "dl_type") == 0 || + strcmp(key, "eth_type") == 0) { + mallocd_str = str_to_u16(val, "eth_type/dl_type", &eth_type); + } else if (strcmp(key, "tcp_src") == 0 || + strcmp(key, "tp_src") == 0 || + strcmp(key, "udp_src") == 0) { + mallocd_str = str_to_u16(val, "tcp/udp_src", &port_src); + } else if (strcmp(key, "tcp_dst") == 0 || + strcmp(key, "tp_dst") == 0 || + strcmp(key, "udp_dst") == 0) { + 
mallocd_str = str_to_u16(val, "tcp/udp_dst", &port_dst); + } else if (strcmp(key, "ip") == 0) { + eth_type = ETH_P_IP; + } else if (strcmp(key, "udp") == 0) { + eth_type = ETH_P_IP; + ip_proto = IPPROTO_UDP; + } else if (strcmp(key, "tcp") == 0) { + eth_type = ETH_P_IP; + ip_proto = IPPROTO_TCP; + } else { + VLOG_WARN("Ignoring unsupported ingress scheduling field '%s'", \ + key); + } + if (mallocd_str) { + VLOG_ERR ("%s", mallocd_str); + free(mallocd_str); + mallocd_str = NULL; + } + } + free (str); + + /* Set the filters */ + if (eth_type && ip_src && ip_dst && port_src && port_dst && ip_proto) { + struct rte_eth_fdir_filter entry = { 0 }; + if (ip_proto == IPPROTO_TCP) { + entry.input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_TCP; + } else { + entry.input.flow_type = RTE_ETH_FLOW_NONFRAG_IPV4_UDP; + } + entry.input.flow.udp4_flow.ip.src_ip = ip_src; + entry.input.flow.udp4_flow.ip.dst_ip = ip_dst; + entry.input.flow.udp4_flow.src_port = htons(port_src); + entry.input.flow.udp4_flow.dst_port = htons(port_dst); + entry.action.rx_queue = priority_q_id; + entry.action.behavior = RTE_ETH_FDIR_ACCEPT; + entry.action.report_status = RTE_ETH_FDIR_REPORT_ID; + diag = rte_eth_dev_filter_ctrl(dev->port_id, + RTE_ETH_FILTER_FDIR, RTE_ETH_FILTER_ADD, &entry); + dev->ingress_filter_type = RTE_ETH_FILTER_FDIR; + dev->ingress_filter.fdir = entry; + } + else if (eth_type && !ip_src && !ip_dst && !port_src + && !port_dst && !ip_proto) { + struct rte_eth_ethertype_filter entry = {0}; + memset (&entry, 0, sizeof entry); + entry.ether_type = eth_type; + entry.flags = 0; + entry.queue = priority_q_id; + diag = rte_eth_dev_filter_ctrl(dev->port_id, + RTE_ETH_FILTER_ETHERTYPE, RTE_ETH_FILTER_ADD, &entry); + dev->ingress_filter.ethertype = entry; + dev->ingress_filter_type = RTE_ETH_FILTER_ETHERTYPE; + } + else { + VLOG_ERR("Unsupported ingress scheduling match-field combination."); + dev->ingress_filter_type = RTE_ETH_FILTER_NONE; + return; + } + + if (diag) { + dev->ingress_filter_type = 
RTE_ETH_FILTER_NONE; + VLOG_ERR("Failed to add ingress scheduling filter."); + } + else { + /* Mark the appropriate q as prioritized */ + dev->up.priority_rxq = priority_q_id; + } +} + static int dpdk_eth_dev_init(struct netdev_dpdk *dev) OVS_REQUIRES(dev->mutex) @@ -791,6 +943,8 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) return -diag; } + dpdk_apply_ingress_scheduling(dev, n_rxq); + diag = rte_eth_dev_start(dev->port_id); if (diag) { VLOG_ERR("Interface %s start error: %s", dev->up.name, @@ -897,6 +1051,9 @@ common_construct(struct netdev *netdev, dpdk_port_t port_no, dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE; dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE; + dev->ingress_sched_str = NULL; + dev->ingress_sched_changed = false; + dev->ingress_filter_type = RTE_ETH_FILTER_NONE; /* Initialize the flow control to NULL */ memset(&dev->fc_conf, 0, sizeof dev->fc_conf); @@ -2015,11 +2172,22 @@ netdev_dpdk_set_ingress_sched(struct netdev *netdev, { struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); - free(dev->ingress_sched_str); - if (ingress_sched_str) { - dev->ingress_sched_str = xstrdup(ingress_sched_str); + if ((ingress_sched_str && dev->ingress_sched_str && + strcmp(ingress_sched_str, dev->ingress_sched_str) == 0) || + (!ingress_sched_str && !dev->ingress_sched_str)) { + /* no-op; new cfg == old cfg or else both are NULL */ + return 0; + } else { + /* free the old, copy in the new; NULL the pointer when there is no + new cfg so we never keep a dangling reference to freed memory */ + free(dev->ingress_sched_str); + if (ingress_sched_str) { + dev->ingress_sched_str = xstrdup(ingress_sched_str); + } else { + dev->ingress_sched_str = NULL; + } + dev->ingress_sched_changed = true; + netdev_request_reconfigure(netdev); } - return 0; } @@ -3185,12 +3353,13 @@ netdev_dpdk_reconfigure(struct netdev *netdev) && dev->mtu == dev->requested_mtu && dev->rxq_size == dev->requested_rxq_size && dev->txq_size == dev->requested_txq_size - && dev->socket_id == dev->requested_socket_id) { + && dev->socket_id == dev->requested_socket_id + && !dev->ingress_sched_changed) { /* Reconfiguration is unnecessary */ 
- goto out; } + dev->ingress_sched_changed = false; rte_eth_dev_stop(dev->port_id); if (dev->mtu != dev->requested_mtu diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c index 93c91f6..077a6d6 100644 --- a/vswitchd/bridge.c +++ b/vswitchd/bridge.c @@ -831,6 +831,8 @@ bridge_delete_or_reconfigure_ports(struct bridge *br) } iface_set_netdev_mtu(iface->cfg, iface->netdev); + netdev_set_ingress_sched(iface->netdev, + smap_get(&iface->cfg->other_config, "ingress_sched")); /* If the requested OpenFlow port for 'iface' changed, and it's not * already the correct port, then we might want to temporarily delete -- 2.7.4 _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev