When assigning an rxq to a pmd, the rxq is only assigned to a pmd
on the numa node the rxq belongs to. The only exception is when
that numa node has no non-isolated pmds.

For example, suppose we have one vm port (vhu) with all of its rxqs
on numa 1 and one phy port (p0) with all of its rxqs on numa 0,
and four pmds spread across the two numa nodes.
See the tables below; entries are listed as (port, queue, numa, core):
 p0  0 0 20
 p0  1 0 21
 p0  2 0 20
 p0  3 0 21
 vhu 0 1 40
 vhu 1 1 41
 vhu 2 1 40
 vhu 3 1 41
In this scenario, ovs-dpdk underperforms a setup where all of p0's
queues are pinned across the four pmds using pmd-rxq-affinity.
With pmd-rxq-isolate=false, vhu still gets polled on numa 1
(an illustrative command sequence follows the table):
 p0  0 0 20
 p0  1 1 40
 p0  2 0 21
 p0  3 1 41
 vhu 0 1 40
 vhu 1 1 41
 vhu 2 1 40
 vhu 3 1 41
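
A rough sketch of that pinned setup, assuming the core ids from the
tables above (the exact queue:core mapping is illustrative):

  $ ovs-vsctl set Interface p0 \
        other_config:pmd-rxq-affinity="0:20,1:40,2:21,3:41"
  $ ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-isolate=false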
Then we found that, under really high throughput, pmds 40 and 41
reach 100% busy cycles while pmds 20 and 21 only reach about 70%.
The unbalanced rxqs on these pmds become the bottleneck.

pmd-rxq-numa=false ignores the rxq's numa attribute when assigning
rxqs to pmds. This makes full use of the cores on all numa nodes
when core resources for pmds are limited.
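
As a usage sketch with the option added by this patch, numa-aware
assignment can be disabled and the result inspected with:

  $ ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-numa=false
  $ ovs-appctl dpif-netdev/pmd-rxq-show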

Signed-off-by: Wan Junjie <wanjun...@bytedance.com>
---
 lib/dpif-netdev.c    |  30 +++++++++---
 tests/pmd.at         | 106 +++++++++++++++++++++++++++++++++++++++++++
 vswitchd/vswitch.xml |  19 +++++++-
 3 files changed, 148 insertions(+), 7 deletions(-)

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index e28e0b554..6b54a3a4c 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -297,6 +297,7 @@ struct dp_netdev {
     struct ovs_mutex tx_qid_pool_mutex;
     /* Rxq to pmd assignment type. */
     enum sched_assignment_type pmd_rxq_assign_type;
+    bool pmd_rxq_assign_numa;
     bool pmd_iso;
 
     /* Protects the access of the 'struct dp_netdev_pmd_thread'
@@ -1844,6 +1845,7 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
 
     cmap_init(&dp->poll_threads);
     dp->pmd_rxq_assign_type = SCHED_CYCLES;
+    dp->pmd_rxq_assign_numa = true;
 
     ovs_mutex_init(&dp->tx_qid_pool_mutex);
     /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
@@ -4867,6 +4869,14 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
         dp_netdev_request_reconfigure(dp);
     }
 
+    bool rxq_assign_numa = smap_get_bool(other_config, "pmd-rxq-numa", true);
+    if (dp->pmd_rxq_assign_numa != rxq_assign_numa) {
+        dp->pmd_rxq_assign_numa = rxq_assign_numa;
+        VLOG_INFO("Rxq to PMD numa assignment changed to: \'%s\'.",
+                  rxq_assign_numa ? "numa enabled" : "numa disabled");
+        dp_netdev_request_reconfigure(dp);
+    }
+
     struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
 
     rebalance_intvl = smap_get_int(other_config, "pmd-auto-lb-rebal-interval",
@@ -5869,6 +5879,7 @@ static void
 sched_numa_list_schedule(struct sched_numa_list *numa_list,
                          struct dp_netdev *dp,
                          enum sched_assignment_type algo,
+                         bool assign_numa,
                          enum vlog_level level)
     OVS_REQ_RDLOCK(dp->port_rwlock)
 {
@@ -5980,14 +5991,19 @@ sched_numa_list_schedule(struct sched_numa_list *numa_list,
         numa_id = netdev_get_numa_id(rxq->port->netdev);
         numa = sched_numa_list_lookup(numa_list, numa_id);
 
-        /* Check if numa has no PMDs or no non-isolated PMDs. */
-        if (!numa || !sched_numa_noniso_pmd_count(numa)) {
+        /* Check if the numa has no PMDs, no non-isolated PMDs, or if the
+         * rxq's numa attribute should be ignored. */
+        if (!numa || !sched_numa_noniso_pmd_count(numa) || !assign_numa) {
             /* Unable to use this numa to find a PMD. */
             numa = NULL;
-            /* Find any numa with available PMDs. */
+            /* If the rxq's numa attribute is ignored, round-robin across
+             * the numas; otherwise find any numa with available PMDs. */
             for (int j = 0; j < n_numa; j++) {
                 numa = sched_numa_list_next(numa_list, last_cross_numa);
                 if (sched_numa_noniso_pmd_count(numa)) {
+                    if (!assign_numa) {
+                        last_cross_numa = numa;
+                    }
                     break;
                 }
                 last_cross_numa = numa;
@@ -5996,7 +6012,7 @@ sched_numa_list_schedule(struct sched_numa_list *numa_list,
         }
 
         if (numa) {
-            if (numa->numa_id != numa_id) {
+            if (numa->numa_id != numa_id && assign_numa) {
                 VLOG(level, "There's no available (non-isolated) pmd thread "
                             "on numa node %d. Port \'%s\' rx queue %d will "
                             "be assigned to a pmd on numa node %d. "
@@ -6036,9 +6052,10 @@ rxq_scheduling(struct dp_netdev *dp)
 {
     struct sched_numa_list numa_list;
     enum sched_assignment_type algo = dp->pmd_rxq_assign_type;
+    bool assign_numa = dp->pmd_rxq_assign_numa;
 
     sched_numa_list_populate(&numa_list, dp);
-    sched_numa_list_schedule(&numa_list, dp, algo, VLL_INFO);
+    sched_numa_list_schedule(&numa_list, dp, algo, assign_numa, VLL_INFO);
     sched_numa_list_put_in_place(&numa_list);
 
     sched_numa_list_free_entries(&numa_list);
@@ -6164,7 +6181,8 @@ pmd_rebalance_dry_run(struct dp_netdev *dp)
     /* Populate estimated assignments. */
     sched_numa_list_populate(&numa_list_est, dp);
     sched_numa_list_schedule(&numa_list_est, dp,
-                             dp->pmd_rxq_assign_type, VLL_DBG);
+                             dp->pmd_rxq_assign_type,
+                             dp->pmd_rxq_assign_numa, VLL_DBG);
 
     /* Check if cross-numa polling, there is only one numa with PMDs. */
     if (!sched_numa_list_cross_numa_polling(&numa_list_est) ||
diff --git a/tests/pmd.at b/tests/pmd.at
index a2f9d34a2..4ef5b6886 100644
--- a/tests/pmd.at
+++ b/tests/pmd.at
@@ -317,6 +317,48 @@ pmd thread numa_id 0 core_id 2:
   isolated : false
 ])
 
+# disable pmd-rxq-numa for roundrobin
+TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
+AT_CHECK([ovs-vsctl set open . other_config:pmd-rxq-numa=false])
+OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Rxq to PMD numa assignment changed to: 'numa disabled'"])
+AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show], [0], [dnl
+pmd thread numa_id 1 core_id 1:
+  isolated : false
+  port: p0                queue-id:  1 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  3 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  5 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  7 (enabled)   pmd usage: NOT AVAIL
+  overhead: NOT AVAIL
+pmd thread numa_id 0 core_id 2:
+  isolated : false
+  port: p0                queue-id:  0 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  2 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  4 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  6 (enabled)   pmd usage: NOT AVAIL
+  overhead: NOT AVAIL
+])
+
+# enable pmd-rxq-numa again for roundrobin
+TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
+AT_CHECK([ovs-vsctl set open . other_config:pmd-rxq-numa=true])
+OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Rxq to PMD numa assignment changed to: 'numa enabled'"])
+
+AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show], [0], [dnl
+pmd thread numa_id 1 core_id 1:
+  isolated : false
+  port: p0                queue-id:  0 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  1 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  2 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  3 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  4 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  5 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  6 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  7 (enabled)   pmd usage: NOT AVAIL
+  overhead: NOT AVAIL
+pmd thread numa_id 0 core_id 2:
+  isolated : false
+])
+
 TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
 AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-rxq-assign=cycles])
OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Performing pmd to rx queue assignment using cycles algorithm"])
@@ -337,6 +379,50 @@ pmd thread numa_id 0 core_id 2:
   isolated : false
 ])
 
+
+# disable pmd-rxq-numa for cycles
+TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
+AT_CHECK([ovs-vsctl set open . other_config:pmd-rxq-numa=false])
+OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Rxq to PMD numa assignment changed to: 'numa disabled'"])
+AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show], [0], [dnl
+pmd thread numa_id 1 core_id 1:
+  isolated : false
+  port: p0                queue-id:  1 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  3 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  5 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  7 (enabled)   pmd usage: NOT AVAIL
+  overhead: NOT AVAIL
+pmd thread numa_id 0 core_id 2:
+  isolated : false
+  port: p0                queue-id:  0 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  2 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  4 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  6 (enabled)   pmd usage: NOT AVAIL
+  overhead: NOT AVAIL
+])
+
+# enable pmd-rxq-numa again for cycles
+TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
+AT_CHECK([ovs-vsctl set open . other_config:pmd-rxq-numa=true])
+OVS_WAIT_UNTIL([tail -n +$TMP ovs-vswitchd.log | grep "Rxq to PMD numa assignment changed to: 'numa enabled'"])
+
+AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show], [0], [dnl
+pmd thread numa_id 1 core_id 1:
+  isolated : false
+  port: p0                queue-id:  0 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  1 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  2 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  3 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  4 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  5 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  6 (enabled)   pmd usage: NOT AVAIL
+  port: p0                queue-id:  7 (enabled)   pmd usage: NOT AVAIL
+  overhead: NOT AVAIL
+pmd thread numa_id 0 core_id 2:
+  isolated : false
+])
+
+
 # Switch back from mixed numa to single numa
 TMP=$(($(cat ovs-vswitchd.log | wc -l | tr -d [[:blank:]])+1))
 AT_CHECK([ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x1])
@@ -925,6 +1011,19 @@ p1 0 0 1
 p1 1 0 2
 ])
 
+dnl Disabling numa assignment should not affect rxqs pinned with rxq affinity.
+AT_CHECK([ovs-vsctl set open . other_config:pmd-rxq-numa=false])
+AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
+p1 0 0 1
+p1 1 0 2
+])
+
+AT_CHECK([ovs-vsctl set open . other_config:pmd-rxq-numa=true])
+AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
+p1 0 0 1
+p1 1 0 2
+])
+
 AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:4"])
 
 dnl We moved the queues to different contiguous numa node. Expecting threads on
@@ -934,6 +1033,13 @@ p1 0 1 3
 p1 1 1 4
 ])
 
+dnl Disabling numa assignment should not affect rxqs pinned with rxq affinity.
+AT_CHECK([ovs-vsctl set open . other_config:pmd-rxq-numa=false])
+AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
+p1 0 1 3
+p1 1 1 4
+])
+
 AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:5,1:6"])
 
dnl We moved the queues to different non-contiguous numa node. Expecting threads on
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 064e0facf..1e7c00746 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -576,7 +576,24 @@
           is set to <code>group</code>.
         </p>
       </column>
-
+      <column name="other_config" key="pmd-rxq-numa"
+              type='{"type": "boolean"}'>
+        <p>
+          Specifies whether the NUMA locality of an Rx queue is taken into
+          account when the queue is assigned to a PMD thread.
+        </p>
+        <p>
+          Set this value to <code>false</code> to allow Rx queues that are
+          not statically pinned with <code>pmd-rxq-affinity</code> to be
+          assigned to non-isolated PMD threads on any NUMA node, using
+          the configured <code>pmd-rxq-assign</code> algorithm.
+        </p>
+        <p>
+          The default value is <code>true</code>, i.e. an Rx queue is
+          only assigned to a PMD thread on a remote NUMA node when its
+          local NUMA node has no non-isolated PMD threads.
+        </p>
+      </column>
       <column name="other_config" key="n-handler-threads"
               type='{"type": "integer", "minInteger": 1}'>
         <p>
-- 
2.33.0
