Cc: Christophe Fontaine <cfont...@redhat.com>
Cc: Kevin Traynor <ktray...@redhat.com>
Signed-off-by: Robin Jarry <rja...@redhat.com>
---
v2 -> v3:
* Added dry_run validation that rte_flows are all supported by the NIC
before configuring anything.
* Added check to make cp-protection and hw-offload mutually exclusive.
* Removed the "match-all" RSS flow that dealt with redirecting all
non-control-plane traffic to all but the control-plane Rx queue. Very
few NICs actually support "match-all" flows without any mask. This was
replaced by reconfiguring the RSS redirection table. The
* Made sure to unconfigure everything and remove the extra Rx queue when
the hardware does not support one of the RTE flows.
* Updated vswitchd/vswitch.xml
* Added diagnostics info in netdev_dpdk_get_status
* Tested under load on the following NICs:
- Intel E810 (2x 25G)
- Mellanox ConnectX-5 (2x 25G)
* Basic functionality tested on the following NICs:
- Intel 82599ES (2x 10G)
- Intel X710 (4x 10G)
- Mellanox ConnectX-4 (2x 25G)
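* Example usage (same commands as in the documentation below):
$ ovs-vsctl set interface dpdk-p0 options:cp-protection=lacp
$ ovs-vsctl get interface dpdk-p0 status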
Documentation/topics/dpdk/phy.rst | 55 ++++++
lib/netdev-dpdk.c | 293 +++++++++++++++++++++++++++++-
vswitchd/vswitch.xml | 26 +++
3 files changed, 373 insertions(+), 1 deletion(-)
diff --git a/Documentation/topics/dpdk/phy.rst b/Documentation/topics/dpdk/phy.rst
index 937f4c40e5a8..86e69d79b104 100644
--- a/Documentation/topics/dpdk/phy.rst
+++ b/Documentation/topics/dpdk/phy.rst
@@ -131,6 +131,61 @@ possible with DPDK acceleration. It is possible to configure multiple Rx queues
for ``dpdk`` ports, thus ensuring this is not a bottleneck for performance. For
information on configuring PMD threads, refer to :doc:`pmd`.
+
+Control Plane Protection
+------------------------
+
+Some control protocols are used to maintain link status between forwarding
+engines. In SDN environments, these packets share the same physical network
+as the user data traffic.
+
+When the system is not sized properly, the PMD threads may not be able to
+process all incoming traffic from the configured Rx queues. When a signaling
+packet of such protocols is dropped, it can cause link flapping, worsening the
+situation.
+
+Some physical NICs can be programmed to put these protocols in a dedicated
+hardware Rx queue using the rte_flow__ API.
+
+__ https://doc.dpdk.org/guides-21.11/prog_guide/rte_flow.html#device-compatibility
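+
+For illustration, such a rule would look as follows in ``testpmd``'s flow
+syntax (matching LACP ether type ``0x8809`` and redirecting it to Rx
+queue 1)::
+
+ flow create 0 ingress pattern eth type is 0x8809 / end actions queue index 1 / end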
+
+The currently supported control plane protocols are:
+
+``lacp``
+ `Link Aggregation Control Protocol`__. Ether type ``0x8809``.
+
+ __ https://www.ieee802.org/3/ad/public/mar99/seaman_1_0399.pdf
+
+.. warning::
+
+ This feature is not compatible with all NICs. Refer to vendor documentation
+ for more information.
+
+Control plane protection must be enabled on specific protocols per port. The
+``cp-protection`` option requires a comma-separated list of protocol names::
+
+ $ ovs-vsctl add-port br0 dpdk-p0 -- set Interface dpdk-p0 type=dpdk \
+ options:dpdk-devargs=0000:01:00.0 options:cp-protection=lacp
+
+.. note::
+
+ If multiple Rx queues are already configured, regular RSS (Receive Side
+ Scaling) queue balancing is done on all but the extra control plane
+ protection queue.
+
+.. tip::
+
+ You can check if control plane protection is supported on a port with the
+ following command::
+
+ $ ovs-vsctl get interface dpdk-p0 status
+ {cp_protection_queue="2", driver_name=..., rss_queues="0-1"}
+
+ If the hardware does not support redirecting control plane traffic to
+ a dedicated queue, this will be reported explicitly::
+
+ $ ovs-vsctl get interface dpdk-p0 status
+ {cp_protection_queue=unsupported, driver_name=..., rss_queues="0-1"}
+
.. _dpdk-phy-flow-control:
Flow Control
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 0dd655507b50..94f04437a641 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -410,6 +410,11 @@ enum dpdk_hw_ol_features {
NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 4,
};
+enum dpdk_cp_prot_flags {
+ DPDK_CP_PROT_UNSUPPORTED = 1 << 0,
+ DPDK_CP_PROT_LACP = 1 << 1,
+};
+
/*
* In order to avoid confusion in variables names, following naming convention
* should be used, if possible:
@@ -453,6 +458,7 @@ struct netdev_dpdk {
};
struct dpdk_tx_queue *tx_q;
struct rte_eth_link link;
+ uint16_t reta_size;
);
PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,
@@ -529,6 +535,13 @@ struct netdev_dpdk {
/* VF configuration. */
struct eth_addr requested_hwaddr;
+
+ /* Requested control plane protection flags,
+ * from the enum set 'dpdk_cp_prot_flags'. */
+ uint64_t requested_cp_prot_flags;
+ uint64_t cp_prot_flags;
+ size_t cp_prot_flows_num;
+ struct rte_flow **cp_prot_flows;
);
PADDED_MEMBERS(CACHE_LINE_SIZE,
@@ -1192,6 +1205,7 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev)
netdev_get_name(&dev->up));
}
}
+ dev->reta_size = info.reta_size;
n_rxq = MIN(info.max_rx_queues, dev->up.n_rxq);
n_txq = MIN(info.max_tx_queues, dev->up.n_txq);
@@ -1309,6 +1323,10 @@ common_construct(struct netdev *netdev, dpdk_port_t port_no,
dev->requested_n_txq = NR_QUEUE;
dev->requested_rxq_size = NIC_PORT_DEFAULT_RXQ_SIZE;
dev->requested_txq_size = NIC_PORT_DEFAULT_TXQ_SIZE;
+ dev->requested_cp_prot_flags = 0;
+ dev->cp_prot_flags = 0;
+ dev->cp_prot_flows_num = 0;
+ dev->cp_prot_flows = NULL;
/* Initialize the flow control to NULL */
memset(&dev->fc_conf, 0, sizeof dev->fc_conf);
@@ -1904,6 +1922,9 @@ dpdk_set_rxq_config(struct netdev_dpdk *dev, const struct smap *args)
int new_n_rxq;
new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
+ if (dev->requested_cp_prot_flags) {
+ new_n_rxq += 1;
+ }
if (new_n_rxq != dev->requested_n_rxq) {
dev->requested_n_rxq = new_n_rxq;
netdev_request_reconfigure(&dev->up);
@@ -1927,6 +1948,53 @@ dpdk_process_queue_size(struct netdev *netdev, const struct smap *args,
}
}
+static int
+dpdk_cp_prot_set_config(struct netdev *netdev, struct netdev_dpdk *dev,
+ const struct smap *args, char **errp)
+{
+ const char *arg = smap_get_def(args, "cp-protection", "");
+ uint64_t flags = 0;
+ char buf[256];
+ char *token, *saveptr;
+
+ ovs_strzcpy(buf, arg, sizeof(buf));
+ buf[sizeof(buf) - 1] = '\0';
+
+ token = strtok_r(buf, ",", &saveptr);
+ while (token) {
+ if (strcmp(token, "lacp") == 0) {
+ flags |= DPDK_CP_PROT_LACP;
+ } else {
+ VLOG_WARN_BUF(
+ errp, "%s options:cp-protection unknown protocol '%s'",
+ netdev_get_name(netdev), token);
+ return -1;
+ }
+ token = strtok_r(NULL, ",", &saveptr);
+ }
+
+ if (flags && dev->type != DPDK_DEV_ETH) {
+ VLOG_WARN_BUF(errp,
+ "%s options:cp-protection is only supported on ethernet ports",
+ netdev_get_name(netdev));
+ return -1;
+ }
+
+ if (flags && netdev_is_flow_api_enabled()) {
+ VLOG_WARN_BUF(errp,
+ "%s options:cp-protection is incompatible with hw-offload",
+ netdev_get_name(netdev));
+ return -1;
+ }
+
+ if (flags != dev->requested_cp_prot_flags) {
+ dev->requested_cp_prot_flags = flags;
+ netdev_request_reconfigure(netdev);
+ }
+
+ return 0;
+}
+
static int
netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args,
char **errp)
@@ -1946,6 +2014,11 @@ netdev_dpdk_set_config(struct netdev *netdev, const struct smap *args,
ovs_mutex_lock(&dpdk_mutex);
ovs_mutex_lock(&dev->mutex);
+ if (dpdk_cp_prot_set_config(netdev, dev, args, errp) < 0) {
+ err = EINVAL;
+ goto out;
+ }
+
dpdk_set_rxq_config(dev, args);
dpdk_process_queue_size(netdev, args, "n_rxq_desc",
@@ -3639,8 +3712,10 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
struct rte_eth_dev_info dev_info;
+ uint64_t cp_prot_flags;
uint32_t link_speed;
uint32_t dev_flags;
+ int n_rxq;
if (!rte_eth_dev_is_valid_port(dev->port_id)) {
return ENODEV;
@@ -3651,6 +3726,8 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
rte_eth_dev_info_get(dev->port_id, &dev_info);
link_speed = dev->link.link_speed;
dev_flags = *dev_info.dev_flags;
+ cp_prot_flags = dev->cp_prot_flags;
+ n_rxq = netdev->n_rxq;
ovs_mutex_unlock(&dev->mutex);
const struct rte_bus *bus;
const struct rte_pci_device *pci_dev;
@@ -3703,6 +3780,24 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
ETH_ADDR_ARGS(dev->hwaddr));
}
+ if (cp_prot_flags) {
+ if (cp_prot_flags & DPDK_CP_PROT_UNSUPPORTED) {
+ smap_add(args, "cp_protection_queue", "unsupported");
+ if (n_rxq > 1) {
+ smap_add_format(args, "rss_queues", "0-%d", n_rxq - 1);
+ } else {
+ smap_add(args, "rss_queues", "0");
+ }
+ } else {
+ smap_add_format(args, "cp_protection_queue", "%d", n_rxq - 1);
+ if (n_rxq > 2) {
+ smap_add_format(args, "rss_queues", "0-%d", n_rxq - 2);
+ } else {
+ smap_add(args, "rss_queues", "0");
+ }
+ }
+ }
+
return 0;
}
@@ -4933,6 +5028,179 @@ static const struct dpdk_qos_ops trtcm_policer_ops = {
.qos_queue_dump_state_init = trtcm_policer_qos_queue_dump_state_init
};
+static int
+dpdk_cp_prot_add_flow(struct netdev_dpdk *dev,
+ const struct rte_flow_attr *attr,
+ const struct rte_flow_item items[],
+ const struct rte_flow_action actions[],
+ const char *desc, bool dry_run)
+{
+ struct rte_flow_error error;
+ struct rte_flow *flow;
+ size_t num;
+
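+ /* In dry run mode, only check that the device supports the flow: nothing
+ * is created and no device state is changed. */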
+ if (dry_run) {
+ int ret;
+
+ ret = rte_flow_validate(dev->port_id, attr, items, actions, &error);
+ if (ret) {
+ VLOG_WARN("%s: cp-protection: device does not support %s flow: %s",
+ netdev_get_name(&dev->up), desc, error.message);
+ }
+ return ret;
+ }
+
+ flow = rte_flow_create(dev->port_id, attr, items, actions, &error);
+ if (flow == NULL) {
+ VLOG_WARN("%s: cp-protection: failed to add %s flow: %s",
+ netdev_get_name(&dev->up), desc, error.message);
+ return rte_errno;
+ }
+
+ num = dev->cp_prot_flows_num + 1;
+ dev->cp_prot_flows = xrealloc(dev->cp_prot_flows, sizeof(flow) * num);
+ dev->cp_prot_flows[dev->cp_prot_flows_num] = flow;
+ dev->cp_prot_flows_num = num;
+
+ return 0;
+}
+
+static int
+dpdk_cp_prot_add_traffic_flow(struct netdev_dpdk *dev,
+ const struct rte_flow_item items[],
+ const char *desc, bool dry_run)
+{
+ const struct rte_flow_attr attr = { .ingress = 1 };
+ const struct rte_flow_action actions[] = {
+ {
+ .type = RTE_FLOW_ACTION_TYPE_QUEUE,
+ .conf = &(const struct rte_flow_action_queue) {
+ .index = dev->up.n_rxq - 1,
+ },
+ },
+ { .type = RTE_FLOW_ACTION_TYPE_END },
+ };
+
+ if (!dry_run) {
+ VLOG_INFO("%s: cp-protection: redirecting %s traffic to queue %d",
+ netdev_get_name(&dev->up), desc, dev->up.n_rxq - 1);
+ }
+ return dpdk_cp_prot_add_flow(dev, &attr, items, actions, desc, dry_run);
+}
+
+static int
+dpdk_cp_prot_rss_configure(struct netdev_dpdk *dev, int rss_n_rxq)
+{
+ struct rte_eth_rss_reta_entry64 *reta_conf;
+ size_t reta_conf_size;
+ int err;
+
+ if (rss_n_rxq == 1) {
+ VLOG_INFO("%s: cp-protection: redirecting other traffic to queue 0",
+ netdev_get_name(&dev->up));
+ } else {
+ VLOG_INFO("%s: cp-protection: applying rss on queues 0-%d",
+ netdev_get_name(&dev->up), rss_n_rxq - 1);
+ }
+
+ reta_conf_size = (dev->reta_size / RTE_ETH_RETA_GROUP_SIZE)
+ * sizeof(struct rte_eth_rss_reta_entry64);
+ reta_conf = xmalloc(reta_conf_size);
+ memset(reta_conf, 0, reta_conf_size);
+
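+ /* Fill the redirection table round-robin, spreading traffic evenly over
+ * the first 'rss_n_rxq' queues: e.g. with two queues, entries alternate
+ * 0, 1, 0, 1, ... */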
+ for (uint16_t i = 0; i < dev->reta_size; i++) {
+ uint16_t idx = i / RTE_ETH_RETA_GROUP_SIZE;
+ uint16_t shift = i % RTE_ETH_RETA_GROUP_SIZE;
+ reta_conf[idx].mask |= 1ULL << shift;
+ reta_conf[idx].reta[shift] = i % rss_n_rxq;
+ }
+ err = rte_eth_dev_rss_reta_update(dev->port_id, reta_conf, dev->reta_size);
+ if (err < 0) {
+ VLOG_DBG("%s: failed to configure RSS redirection table: err=%d",
+ netdev_get_name(&dev->up), err);
+ }
+
+ free(reta_conf);
+
+ return err;
+}
+
+static int
+dpdk_cp_prot_configure(struct netdev_dpdk *dev, bool dry_run)
+{
+ int err = 0;
+
+ if (dev->requested_cp_prot_flags & DPDK_CP_PROT_UNSUPPORTED) {
+ goto out;
+ }
+ if (dev->up.n_rxq < 2) {
+ err = ENOTSUP;
+ VLOG_DBG("%s: cp-protection: not enough available rx queues",
+ netdev_get_name(&dev->up));
+ goto out;
+ }
+
+ if (dev->requested_cp_prot_flags & DPDK_CP_PROT_LACP) {
+ err = dpdk_cp_prot_add_traffic_flow(
+ dev,
+ (const struct rte_flow_item []) {
+ {
+ .type = RTE_FLOW_ITEM_TYPE_ETH,
+ .spec = &(const struct rte_flow_item_eth){
+ .type = htons(ETH_TYPE_LACP),
+ },
+ .mask = &(const struct rte_flow_item_eth){
+ .type = htons(0xffff),
+ },
+ },
+ { .type = RTE_FLOW_ITEM_TYPE_END },
+ },
+ "lacp",
+ dry_run
+ );
+ if (err) {
+ goto out;
+ }
+ }
+
+ if (!dry_run && dev->cp_prot_flows_num) {
+ /* Reconfigure RSS reta on all but the cp protection queue. */
+ err = dpdk_cp_prot_rss_configure(dev, dev->up.n_rxq - 1);
+ }
+
+out:
+ if (!dry_run) {
+ dev->cp_prot_flags = dev->requested_cp_prot_flags;
+ }
+ if (err) {
+ dev->requested_cp_prot_flags |= DPDK_CP_PROT_UNSUPPORTED;
+ }
+ return err;
+}
+
+static void
+dpdk_cp_prot_unconfigure(struct netdev_dpdk *dev)
+{
+ struct rte_flow_error error;
+
+ if (dev->cp_prot_flows_num == 0) {
+ return;
+ }
+
+ VLOG_DBG("%s: cp-protection: reset flows", netdev_get_name(&dev->up));
+
+ for (size_t i = 0; i < dev->cp_prot_flows_num; i++) {
+ if (rte_flow_destroy(dev->port_id, dev->cp_prot_flows[i], &error)) {
+ VLOG_DBG("%s: cp-protection: failed to destroy flow: %s",
+ netdev_get_name(&dev->up), error.message);
+ }
+ }
+ free(dev->cp_prot_flows);
+ dev->cp_prot_flows_num = 0;
+ dev->cp_prot_flows = NULL;
+
+ (void) dpdk_cp_prot_rss_configure(dev, dev->up.n_rxq);
+}
+
static int
netdev_dpdk_reconfigure(struct netdev *netdev)
{
@@ -4943,6 +5211,7 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
if (netdev->n_txq == dev->requested_n_txq
&& netdev->n_rxq == dev->requested_n_rxq
+ && dev->cp_prot_flags == dev->requested_cp_prot_flags
&& dev->mtu == dev->requested_mtu
&& dev->lsc_interrupt_mode == dev->requested_lsc_interrupt_mode
&& dev->rxq_size == dev->requested_rxq_size
@@ -4987,6 +5256,8 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
}
}
+ dpdk_cp_prot_unconfigure(dev);
+
err = dpdk_eth_dev_init(dev);
if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) {
netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO;
@@ -5014,6 +5285,20 @@ netdev_dpdk_reconfigure(struct netdev *netdev)
if (!dev->tx_q) {
err = ENOMEM;
}
+ if (!err && dev->requested_cp_prot_flags) {
+ /* dry run first */
+ err = dpdk_cp_prot_configure(dev, true);
+ if (!err) {
+ /* if no error, apply configuration */
+ err = dpdk_cp_prot_configure(dev, false);
+ }
+ if (err) {
+ /* no hw support, remove the extra queue & recover gracefully */
+ err = 0;
+ dev->requested_n_rxq -= 1;
+ netdev_request_reconfigure(netdev);
+ }
+ }
netdev_change_seq_changed(netdev);
@@ -5215,7 +5500,13 @@ netdev_dpdk_flow_api_supported(struct netdev *netdev)
ovs_mutex_lock(&dev->mutex);
if (dev->type == DPDK_DEV_ETH) {
/* TODO: Check if we able to offload some minimal flow. */
- ret = true;
+ if (dev->requested_cp_prot_flags || dev->cp_prot_flags) {
+ VLOG_WARN(
+ "%s: hw-offload is mutually exclusive with cp-protection",
+ netdev_get_name(netdev));
+ } else {
+ ret = true;
+ }
}
ovs_mutex_unlock(&dev->mutex);
out:
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 36388e3c42d7..7e6ae3df7583 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -3430,6 +3430,32 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \
<p>This option may only be used with dpdk VF representors.</p>
</column>
+ <column name="options" key="cp-protection"
+ type='{"type": "string", "enum": ["set", ["lacp"]]}'>
+ <p>
+ Allocate an extra Rx queue for control plane packets of the specified
+ protocol(s).
+ </p>
+ <p>
+ If the user has already configured multiple Rx queues via
+ <code>options:n_rxq</code> on the port, an additional one will be
+ allocated for control plane packets. If the hardware cannot satisfy
+ the requested number of Rx queues, the last Rx queue will
+ be assigned to the control plane. If only one Rx queue is available or
+ if the hardware does not support the RTE flow matchers/actions
+ required to redirect the selected protocols,
+ <code>cp-protection</code> will be disabled.
+ </p>
+ <p>
+ This feature is mutually exclusive with
+ <code>other_config:hw-offload</code> as it may conflict with the
+ offloaded RTE flows.
+ </p>
+ <p>
+ Disabled by default.
+ </p>
+ </column>
+
<column name="other_config" key="tx-steering"
type='{"type": "string",
"enum": ["set", ["thread", "hash"]]}'>