From: Jan Scheurich <jan.scheur...@ericsson.com> Today dpif-netdev considers PMD threads on a non-local NUMA node for automatic assignment of the rxqs of a port only if there are no local, non-isolated PMDs.
On typical servers with both physical ports on one NUMA node, this often leaves the PMDs on the other NUMA node under-utilized, wasting CPU resources. The alternative, to manually pin the rxqs to PMDs on remote NUMA nodes, also has drawbacks as it limits OVS' ability to auto load-balance the rxqs. This patch introduces a new interface configuration option to allow ports to be automatically polled by PMDs on any NUMA node: ovs-vsctl set interface <Name> other_config:cross-numa-polling=true If this option is not present or set to false, legacy behaviour applies. Signed-off-by: Jan Scheurich <jan.scheur...@ericsson.com> Signed-off-by: Anurag Agarwal <anura...@gmail.com> --- Documentation/topics/dpdk/pmd.rst | 24 ++++++++++++++++++++++-- lib/dpif-netdev.c | 31 +++++++++++++++++++++++-------- tests/pmd.at | 30 ++++++++++++++++++++++++++++++ vswitchd/vswitch.xml | 20 ++++++++++++++++++++ 4 files changed, 95 insertions(+), 10 deletions(-) diff --git a/Documentation/topics/dpdk/pmd.rst b/Documentation/topics/dpdk/pmd.rst index b259cc8b3..f6c9671d7 100644 --- a/Documentation/topics/dpdk/pmd.rst +++ b/Documentation/topics/dpdk/pmd.rst @@ -132,8 +132,28 @@ or can be triggered by using:: Port/Rx Queue assignment to PMD threads by manual pinning ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Rx queues may be manually pinned to cores. This will change the default Rx -queue assignment to PMD threads:: + +Normally, Rx queues are assigned to PMD threads automatically. By default +OVS only assigns Rx queues to PMD threads executing on the same NUMA +node in order to avoid unnecessary latency for accessing packet buffers +across the NUMA boundary. Typically this overhead is higher for vhostuser +ports than for physical ports due to the packet copy that is done for all +rx packets. + +On NUMA servers with physical ports only on one NUMA node, the NUMA-local +polling policy can lead to an under-utilization of the PMD threads on the +remote NUMA node. 
For the overall OVS performance it may in such cases be +beneficial to utilize the spare capacity and allow polling of a physical +port's rxqs across NUMA nodes despite the overhead involved. +The policy can be set per port with the following configuration option:: + + $ ovs-vsctl set Interface <iface> \ + other_config:cross-numa-polling=true|false + +The default value is false. + +Rx queues may also be manually pinned to cores. This will change the default +Rx queue assignment to PMD threads:: $ ovs-vsctl set Interface <iface> \ other_config:pmd-rxq-affinity=<rxq-affinity-list> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 720818e30..4fda8d7a0 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -466,6 +466,7 @@ struct dp_netdev_port { char *type; /* Port type as requested by user. */ char *rxq_affinity_list; /* Requested affinity of rx queues. */ enum txq_req_mode txq_requested_mode; + bool cross_numa_polling; /* If true cross polling will be enabled */ }; static bool dp_netdev_flow_ref(struct dp_netdev_flow *); @@ -5018,6 +5019,7 @@ dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no, bool emc_enabled = smap_get_bool(cfg, "emc-enable", true); const char *tx_steering_mode = smap_get(cfg, "tx-steering"); enum txq_req_mode txq_mode; + bool cross_numa_polling = smap_get_bool(cfg, "cross-numa-polling", false); ovs_rwlock_wrlock(&dp->port_rwlock); error = get_port_by_number(dp, port_no, &port); @@ -5086,6 +5088,14 @@ dpif_netdev_port_set_config(struct dpif *dpif, odp_port_t port_no, dp_netdev_request_reconfigure(dp); } + if (cross_numa_polling != port->cross_numa_polling) { + port->cross_numa_polling = cross_numa_polling; + VLOG_INFO("%s:cross-numa-polling has been %s.", + netdev_get_name(port->netdev), + (cross_numa_polling)? 
"enabled" : "disabled"); + dp_netdev_request_reconfigure(dp); + } + unlock: ovs_rwlock_unlock(&dp->port_rwlock); return error; @@ -5885,7 +5895,7 @@ sched_numa_list_schedule(struct sched_numa_list *numa_list, { struct dp_netdev_port *port; struct dp_netdev_rxq **rxqs = NULL; - struct sched_numa *last_cross_numa; + struct sched_numa *next_numa = NULL; unsigned n_rxqs = 0; bool start_logged = false; size_t n_numa; @@ -5969,7 +5979,7 @@ sched_numa_list_schedule(struct sched_numa_list *numa_list, qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles); } - last_cross_numa = NULL; + next_numa = NULL; n_numa = sched_numa_list_count(numa_list); for (unsigned i = 0; i < n_rxqs; i++) { struct dp_netdev_rxq *rxq = rxqs[i]; @@ -5989,20 +5999,25 @@ sched_numa_list_schedule(struct sched_numa_list *numa_list, proc_cycles = dp_netdev_rxq_get_cycles(rxq, RXQ_CYCLES_PROC_HIST); /* Select the numa that should be used for this rxq. */ numa_id = netdev_get_numa_id(rxq->port->netdev); - numa = sched_numa_list_lookup(numa_list, numa_id); + + if (!(rxqs[i]->port->cross_numa_polling)) { + /* Try to find a local pmd. */ + numa = sched_numa_list_lookup(numa_list, numa_id); + } else { + /* Allow polling by any pmd. */ + numa = NULL; + } /* Check if numa has no PMDs or no non-isolated PMDs. */ if (!numa || !sched_numa_noniso_pmd_count(numa)) { /* Unable to use this numa to find a PMD. */ - numa = NULL; /* Find any numa with available PMDs. 
*/ for (int j = 0; j < n_numa; j++) { - numa = sched_numa_list_next(numa_list, last_cross_numa); - if (sched_numa_noniso_pmd_count(numa)) { + next_numa = sched_numa_list_next(numa_list, next_numa); + if (sched_numa_noniso_pmd_count(next_numa)) { + numa = next_numa; break; } - last_cross_numa = numa; - numa = NULL; } } diff --git a/tests/pmd.at b/tests/pmd.at index a2f9d34a2..81a9a0eca 100644 --- a/tests/pmd.at +++ b/tests/pmd.at @@ -541,6 +541,36 @@ icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10 OVS_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([PMD - Enable cross numa polling]) +OVS_VSWITCHD_START( + [add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=4 -- \ + set Open_vSwitch . other_config:pmd-cpu-mask=3 +], [], [], [--dummy-numa 0,1]) + +AT_CHECK([ovs-ofctl add-flow br0 action=controller]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show | cut -f 3 -d ' ' | sort | uniq], [0], [dnl +0 +]) + +dnl Enable cross numa polling and check numa ids +AT_CHECK([ovs-vsctl set Interface p1 other_config:cross-numa-polling=true]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show | cut -f 3 -d ' ' | sort | uniq], [0], [dnl +0 +1 +]) + +dnl Disable cross numa polling and check numa ids +AT_CHECK([ovs-vsctl set Interface p1 other_config:cross-numa-polling=false]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show | cut -f 3 -d ' ' | sort | uniq], [0], [dnl +0 +]) + +OVS_VSWITCHD_STOP(["/|WARN|/d"]) +AT_CLEANUP + AT_SETUP([PMD - change numa node]) OVS_VSWITCHD_START( [add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=2 -- \ diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 0c6632617..4d7339885 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -3284,6 +3284,26 @@ ovs-vsctl add-port br0 p0 -- set Interface p0 type=patch options:peer=p1 \ </p> </column> + <column name="other_config" key="cross-numa-polling" + 
type='{"type": "boolean"}'>
        <p>
          Specifies if the RX queues of the port can be automatically assigned
          to PMD threads on any NUMA node or only on the local NUMA node of
          the port.
        </p>
        <p>
          Polling of physical ports from a non-local PMD thread incurs some
          performance penalty due to the access to packet data across the NUMA
          barrier. This option can still increase the overall performance if
          it allows better utilization of those non-local PMD threads.
          It is most useful together with the auto load-balancing of RX queues
          (see other_config:pmd-auto-lb in table Open_vSwitch).
        </p>
        <p>
          Defaults to false.
        </p>
      </column>

      <column name="options" key="xdp-mode"
              type='{"type": "string",
                     "enum": ["set", ["best-effort", "native-with-zerocopy",
-- 2.25.1 _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev