From: Anurag Agarwal
The current algorithm for balancing unpinned rxqs over non-isolated pmds
assigns the rxqs in descending load order to pmds in a "zig-zag" fashion
(e.g. A,B,C,C,B,A,A,B,...). This simple heuristic relies on the pmds
being initially "empty" and produces optimal results only if the rxq loads
are not too disparate (e.g. 100,10,10,10,10,10,10).
The first prerequisite will no longer be fulfilled when the isolation
of pmds with pinned rxqs is made optional in a subsequent patch.
To prepare for this change we introduce a least-loaded scheduling
algorithm. During rxq scheduling, we keep track of the number of assigned
rxqs and their processing cycles per non-isolated pmd and maintain the
array of pmds per numa node sorted according to their load. We still
assign the rxqs in descending load order, but always assign to the least
loaded pmd. This deals with the case of pmds with a priori load due to
pinned rxqs and, as an additional benefit, handles disparate rxq loads better.
If rxq processing cycles are not used for rxq scheduling, the estimated
pmd load is based on the number of assigned rxqs. In this case, the
least-loaded scheduling algorithm effectively results in a round-robin
assignment of rxqs to pmds, so that the legacy code for round-robin
assignment can be removed.
Signed-off-by: Anurag Agarwal
Signed-off-by: Jan Scheurich
Signed-off-by: Rudra Surya Bhaskara Rao
---
lib/dpif-netdev.c | 267 +-
tests/pmd.at | 4 +-
2 files changed, 166 insertions(+), 105 deletions(-)
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index fd192db..1c458b2 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -800,6 +800,10 @@ struct dp_netdev_pmd_thread {
/* Next time when PMD should try RCU quiescing. */
long long next_rcu_quiesce;
+
+/* Number and cycles of assigned rxqs during rxq scheduling. */
+int ll_n_rxq;
+uint64_t ll_cycles;
};
/* Interface to netdev-based datapath. */
@@ -4905,6 +4909,62 @@ port_reconfigure(struct dp_netdev_port *port)
return 0;
}
+/* Sort PMDs in ascending order of load. If processing cycles are not used for
+ * rxq scheduling, the ll_cycles are zero. In that case the count of
+ * assigned rxqs is taken as PMD load, resulting in round-robin scheduling. */
+static int
+compare_pmd_load(const void *a, const void *b)
+{
+struct dp_netdev_pmd_thread *pmda;
+struct dp_netdev_pmd_thread *pmdb;
+
+pmda = *(struct dp_netdev_pmd_thread **) a;
+pmdb = *(struct dp_netdev_pmd_thread **) b;
+
+if (pmda->ll_cycles != pmdb->ll_cycles) {
+return (pmda->ll_cycles > pmdb->ll_cycles) ? 1 : -1;
+} else if (pmda->ll_n_rxq != pmdb->ll_n_rxq) {
+return (pmda->ll_n_rxq > pmdb->ll_n_rxq) ? 1 : -1;
+} else {
+/* Cycles and rxqs are the same so tiebreak on core ID.
+ * Tiebreaking (as opposed to return 0) ensures consistent
+ * sort results across multiple OS's. */
+return (pmda->core_id > pmdb->core_id) ? 1 : -1;
+}
+}
+
+/* Sort Rx Queues in descending order of processing cycles they
+ * are consuming. */
+static int
+compare_rxq_cycles(const void *a, const void *b)
+{
+struct dp_netdev_rxq *qa;
+struct dp_netdev_rxq *qb;
+uint64_t cycles_qa, cycles_qb;
+
+qa = *(struct dp_netdev_rxq **) a;
+qb = *(struct dp_netdev_rxq **) b;
+
+cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
+cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
+
+if (cycles_qa != cycles_qb) {
+return (cycles_qa < cycles_qb) ? 1 : -1;
+} else {
+/* Cycles are the same so tiebreak on port/queue id.
+ * Tiebreaking (as opposed to return 0) ensures consistent
+ * sort results across multiple OS's. */
+uint32_t port_qa = odp_to_u32(qa->port->port_no);
+uint32_t port_qb = odp_to_u32(qb->port->port_no);
+if (port_qa != port_qb) {
+return (port_qa > port_qb) ? 1 : -1;
+} else {
+return (netdev_rxq_get_queue_id(qa->rx)
+- netdev_rxq_get_queue_id(qb->rx));
+}
+}
+}
+
struct rr_numa_list {
struct hmap numas; /* Contains 'struct rr_numa' */
};
@@ -4917,9 +4977,6 @@ struct rr_numa {
/* Non isolated pmds on numa node 'numa_id' */
struct dp_netdev_pmd_thread **pmds;
int n_pmds;
-
-int cur_index;
-bool idx_inc;
};
static struct rr_numa *
@@ -4976,49 +5033,12 @@ rr_numa_list_populate(struct dp_netdev *dp, struct
rr_numa_list *rr)
numa->n_pmds++;
numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
numa->pmds[numa->n_pmds - 1] = pmd;
-/* At least one pmd so initialise curr_idx and idx_inc. */
-numa->cur_index = 0;
-numa->idx_inc = true;
}
-}
-
-/*
- * Returns the next pmd from the numa node.
- *
- * If 'updown' is 'true' it will alternate between selecting the next pmd in
- *