Use separate priority queues for different groups. Sharing
the same priority queue over multiple groups caused multiple
issues:
* latency and ordering issues when threads push back
  events (from wrong groups) to the tail of the priority queue
* unnecessary contention (scaling issues) when threads belong
  to different groups

Lowered the maximum number of groups from 256 to 32 (in the default
configuration) to limit memory usage of the priority queues. This should
be enough for most users.

Signed-off-by: Petri Savolainen <petri.savolai...@linaro.org>
---
 platform/linux-generic/odp_schedule.c | 284 +++++++++++++++++++++++-----------
 1 file changed, 195 insertions(+), 89 deletions(-)

diff --git a/platform/linux-generic/odp_schedule.c 
b/platform/linux-generic/odp_schedule.c
index e7079b9..f366e7e 100644
--- a/platform/linux-generic/odp_schedule.c
+++ b/platform/linux-generic/odp_schedule.c
@@ -34,7 +34,7 @@ ODP_STATIC_ASSERT((ODP_SCHED_PRIO_NORMAL > 0) &&
                  "normal_prio_is_not_between_highest_and_lowest");
 
 /* Number of scheduling groups */
-#define NUM_SCHED_GRPS 256
+#define NUM_SCHED_GRPS 32
 
 /* Priority queues per priority */
 #define QUEUES_PER_PRIO  4
@@ -163,7 +163,11 @@ typedef struct {
                ordered_stash_t stash[MAX_ORDERED_STASH];
        } ordered;
 
+       uint32_t grp_epoch;
+       int num_grp;
+       uint8_t grp[NUM_SCHED_GRPS];
        uint8_t weight_tbl[WEIGHT_TBL_SIZE];
+       uint8_t grp_weight[WEIGHT_TBL_SIZE];
 
 } sched_local_t;
 
@@ -199,7 +203,7 @@ typedef struct {
        pri_mask_t     pri_mask[NUM_PRIO];
        odp_spinlock_t mask_lock;
 
-       prio_queue_t   prio_q[NUM_PRIO][QUEUES_PER_PRIO];
+       prio_queue_t   prio_q[NUM_SCHED_GRPS][NUM_PRIO][QUEUES_PER_PRIO];
 
        odp_spinlock_t poll_cmd_lock;
        /* Number of commands in a command queue */
@@ -214,8 +218,10 @@ typedef struct {
        odp_shm_t      shm;
        uint32_t       pri_count[NUM_PRIO][QUEUES_PER_PRIO];
 
-       odp_spinlock_t grp_lock;
-       odp_thrmask_t mask_all;
+       odp_thrmask_t    mask_all;
+       odp_spinlock_t   grp_lock;
+       odp_atomic_u32_t grp_epoch;
+
        struct {
                char           name[ODP_SCHED_GROUP_NAME_LEN];
                odp_thrmask_t  mask;
@@ -223,6 +229,7 @@ typedef struct {
        } sched_grp[NUM_SCHED_GRPS];
 
        struct {
+               int         grp;
                int         prio;
                int         queue_per_prio;
        } queue[ODP_CONFIG_QUEUES];
@@ -273,7 +280,7 @@ static void sched_local_init(void)
 static int schedule_init_global(void)
 {
        odp_shm_t shm;
-       int i, j;
+       int i, j, grp;
 
        ODP_DBG("Schedule init ... ");
 
@@ -293,15 +300,20 @@ static int schedule_init_global(void)
        sched->shm  = shm;
        odp_spinlock_init(&sched->mask_lock);
 
-       for (i = 0; i < NUM_PRIO; i++) {
-               for (j = 0; j < QUEUES_PER_PRIO; j++) {
-                       int k;
+       for (grp = 0; grp < NUM_SCHED_GRPS; grp++) {
+               for (i = 0; i < NUM_PRIO; i++) {
+                       for (j = 0; j < QUEUES_PER_PRIO; j++) {
+                               prio_queue_t *prio_q;
+                               int k;
 
-                       ring_init(&sched->prio_q[i][j].ring);
+                               prio_q = &sched->prio_q[grp][i][j];
+                               ring_init(&prio_q->ring);
 
-                       for (k = 0; k < PRIO_QUEUE_RING_SIZE; k++)
-                               sched->prio_q[i][j].queue_index[k] =
-                               PRIO_QUEUE_EMPTY;
+                               for (k = 0; k < PRIO_QUEUE_RING_SIZE; k++) {
+                                       prio_q->queue_index[k] =
+                                       PRIO_QUEUE_EMPTY;
+                               }
+                       }
                }
        }
 
@@ -317,12 +329,17 @@ static int schedule_init_global(void)
                sched->pktio_cmd[i].cmd_index = PKTIO_CMD_FREE;
 
        odp_spinlock_init(&sched->grp_lock);
+       odp_atomic_init_u32(&sched->grp_epoch, 0);
 
        for (i = 0; i < NUM_SCHED_GRPS; i++) {
                memset(sched->sched_grp[i].name, 0, ODP_SCHED_GROUP_NAME_LEN);
                odp_thrmask_zero(&sched->sched_grp[i].mask);
        }
 
+       sched->sched_grp[ODP_SCHED_GROUP_ALL].allocated = 1;
+       sched->sched_grp[ODP_SCHED_GROUP_WORKER].allocated = 1;
+       sched->sched_grp[ODP_SCHED_GROUP_CONTROL].allocated = 1;
+
        odp_thrmask_setall(&sched->mask_all);
 
        ODP_DBG("done\n");
@@ -330,29 +347,38 @@ static int schedule_init_global(void)
        return 0;
 }
 
+static inline void queue_destroy_finalize(uint32_t qi)
+{
+       sched_cb_queue_destroy_finalize(qi);
+}
+
 static int schedule_term_global(void)
 {
        int ret = 0;
        int rc = 0;
-       int i, j;
+       int i, j, grp;
 
-       for (i = 0; i < NUM_PRIO; i++) {
-               for (j = 0; j < QUEUES_PER_PRIO; j++) {
-                       ring_t *ring = &sched->prio_q[i][j].ring;
-                       uint32_t qi;
+       for (grp = 0; grp < NUM_SCHED_GRPS; grp++) {
+               for (i = 0; i < NUM_PRIO; i++) {
+                       for (j = 0; j < QUEUES_PER_PRIO; j++) {
+                               ring_t *ring = &sched->prio_q[grp][i][j].ring;
+                               uint32_t qi;
 
-                       while ((qi = ring_deq(ring, PRIO_QUEUE_MASK)) !=
-                              RING_EMPTY) {
-                               odp_event_t events[1];
-                               int num;
+                               while ((qi = ring_deq(ring, PRIO_QUEUE_MASK)) !=
+                                      RING_EMPTY) {
+                                       odp_event_t events[1];
+                                       int num;
 
-                               num = sched_cb_queue_deq_multi(qi, events, 1);
+                                       num = sched_cb_queue_deq_multi(qi,
+                                                                      events,
+                                                                      1);
 
-                               if (num < 0)
-                                       sched_cb_queue_destroy_finalize(qi);
+                                       if (num < 0)
+                                               queue_destroy_finalize(qi);
 
-                               if (num > 0)
-                                       ODP_ERR("Queue not empty\n");
+                                       if (num > 0)
+                                               ODP_ERR("Queue not empty\n");
+                               }
                        }
                }
        }
@@ -383,6 +409,40 @@ static int schedule_term_local(void)
        return 0;
 }
 
+static inline void grp_update_mask(int grp, const odp_thrmask_t *new_mask)
+{
+       odp_thrmask_copy(&sched->sched_grp[grp].mask, new_mask);
+       odp_atomic_add_rel_u32(&sched->grp_epoch, 1);
+}
+
+static inline int grp_update_tbl(void)
+{
+       int i;
+       int num = 0;
+       int thr = sched_local.thr;
+
+       odp_spinlock_lock(&sched->grp_lock);
+
+       for (i = 0; i < NUM_SCHED_GRPS; i++) {
+               if (sched->sched_grp[i].allocated == 0)
+                       continue;
+
+               if (odp_thrmask_isset(&sched->sched_grp[i].mask, thr)) {
+                       sched_local.grp[num] = i;
+                       num++;
+               }
+       }
+
+       odp_spinlock_unlock(&sched->grp_lock);
+
+       /* Update group weights. Round robin over all thread's groups. */
+       for (i = 0; i < WEIGHT_TBL_SIZE; i++)
+               sched_local.grp_weight[i] = i % num;
+
+       sched_local.num_grp = num;
+       return num;
+}
+
 static unsigned schedule_max_ordered_locks(void)
 {
        return MAX_ORDERED_LOCKS_PER_QUEUE;
@@ -433,6 +493,7 @@ static int schedule_init_queue(uint32_t queue_index,
        int prio = sched_param->prio;
 
        pri_set_queue(queue_index, prio);
+       sched->queue[queue_index].grp  = sched_param->group;
        sched->queue[queue_index].prio = prio;
        sched->queue[queue_index].queue_per_prio = queue_per_prio(queue_index);
 
@@ -444,6 +505,7 @@ static void schedule_destroy_queue(uint32_t queue_index)
        int prio = sched->queue[queue_index].prio;
 
        pri_clr_queue(queue_index, prio);
+       sched->queue[queue_index].grp = 0;
        sched->queue[queue_index].prio = 0;
        sched->queue[queue_index].queue_per_prio = 0;
 }
@@ -535,9 +597,10 @@ static void schedule_release_atomic(void)
        uint32_t qi = sched_local.queue_index;
 
        if (qi != PRIO_QUEUE_EMPTY && sched_local.num  == 0) {
-               int prio           = sched->queue[qi].prio;
+               int grp = sched->queue[qi].grp;
+               int prio = sched->queue[qi].prio;
                int queue_per_prio = sched->queue[qi].queue_per_prio;
-               ring_t *ring       = &sched->prio_q[prio][queue_per_prio].ring;
+               ring_t *ring = &sched->prio_q[grp][prio][queue_per_prio].ring;
 
                /* Release current atomic queue */
                ring_enq(ring, PRIO_QUEUE_MASK, qi);
@@ -688,42 +751,14 @@ static int schedule_ord_enq_multi(uint32_t queue_index, 
void *buf_hdr[],
        return 1;
 }
 
-/*
- * Schedule queues
- */
-static int do_schedule(odp_queue_t *out_queue, odp_event_t out_ev[],
-                      unsigned int max_num)
+static inline int do_schedule_grp(odp_queue_t *out_queue, odp_event_t out_ev[],
+                                 unsigned int max_num, int grp, int first)
 {
        int prio, i;
        int ret;
-       int id, first;
+       int id;
        unsigned int max_deq = MAX_DEQ;
        uint32_t qi;
-       uint16_t round;
-
-       if (sched_local.num) {
-               ret = copy_events(out_ev, max_num);
-
-               if (out_queue)
-                       *out_queue = sched_local.queue;
-
-               return ret;
-       }
-
-       schedule_release_context();
-
-       if (odp_unlikely(sched_local.pause))
-               return 0;
-
-       /* Each thread prefers a priority queue. Poll weight table avoids
-        * starvation of other priority queues on low thread counts. */
-       round = sched_local.round + 1;
-
-       if (odp_unlikely(round == WEIGHT_TBL_SIZE))
-               round = 0;
-
-       sched_local.round = round;
-       first = sched_local.weight_tbl[round];
 
        /* Schedule events */
        for (prio = 0; prio < NUM_PRIO; prio++) {
@@ -736,7 +771,6 @@ static int do_schedule(odp_queue_t *out_queue, odp_event_t 
out_ev[],
 
                for (i = 0; i < QUEUES_PER_PRIO;) {
                        int num;
-                       int grp;
                        int ordered;
                        odp_queue_t handle;
                        ring_t *ring;
@@ -753,7 +787,7 @@ static int do_schedule(odp_queue_t *out_queue, odp_event_t 
out_ev[],
                        }
 
                        /* Get queue index from the priority queue */
-                       ring = &sched->prio_q[prio][id].ring;
+                       ring = &sched->prio_q[grp][prio][id].ring;
                        qi   = ring_deq(ring, PRIO_QUEUE_MASK);
 
                        /* Priority queue empty */
@@ -763,21 +797,6 @@ static int do_schedule(odp_queue_t *out_queue, odp_event_t 
out_ev[],
                                continue;
                        }
 
-                       grp = sched_cb_queue_grp(qi);
-
-                       if (grp > ODP_SCHED_GROUP_ALL &&
-                           !odp_thrmask_isset(&sched->sched_grp[grp].mask,
-                                              sched_local.thr)) {
-                               /* This thread is not eligible for work from
-                                * this queue, so continue scheduling it.
-                                */
-                               ring_enq(ring, PRIO_QUEUE_MASK, qi);
-
-                               i++;
-                               id++;
-                               continue;
-                       }
-
                        /* Low priorities have smaller batch size to limit
                         * head of line blocking latency. */
                        if (odp_unlikely(prio > ODP_SCHED_PRIO_DEFAULT))
@@ -845,6 +864,70 @@ static int do_schedule(odp_queue_t *out_queue, odp_event_t 
out_ev[],
                }
        }
 
+       return 0;
+}
+
+/*
+ * Schedule queues
+ */
+static inline int do_schedule(odp_queue_t *out_queue, odp_event_t out_ev[],
+                             unsigned int max_num)
+{
+       int i, num_grp;
+       int ret;
+       int id, first, grp_id;
+       uint16_t round;
+       uint32_t epoch;
+
+       if (sched_local.num) {
+               ret = copy_events(out_ev, max_num);
+
+               if (out_queue)
+                       *out_queue = sched_local.queue;
+
+               return ret;
+       }
+
+       schedule_release_context();
+
+       if (odp_unlikely(sched_local.pause))
+               return 0;
+
+       /* Each thread prefers a priority queue. Poll weight table avoids
+        * starvation of other priority queues on low thread counts. */
+       round = sched_local.round + 1;
+
+       if (odp_unlikely(round == WEIGHT_TBL_SIZE))
+               round = 0;
+
+       sched_local.round = round;
+       first = sched_local.weight_tbl[round];
+
+       epoch = odp_atomic_load_acq_u32(&sched->grp_epoch);
+       num_grp = sched_local.num_grp;
+
+       if (odp_unlikely(sched_local.grp_epoch != epoch)) {
+               num_grp = grp_update_tbl();
+               sched_local.grp_epoch = epoch;
+       }
+
+       grp_id = sched_local.grp_weight[round];
+
+       /* Schedule queues per group and priority */
+       for (i = 0; i < num_grp; i++) {
+               int grp;
+
+               grp = sched_local.grp[grp_id];
+               ret = do_schedule_grp(out_queue, out_ev, max_num, grp, first);
+
+               if (odp_likely(ret))
+                       return ret;
+
+               grp_id++;
+               if (odp_unlikely(grp_id >= num_grp))
+                       grp_id = 0;
+       }
+
        /*
         * Poll packet input when there are no events
         *   * Each thread starts the search for a poll command from its
@@ -1050,7 +1133,8 @@ static odp_schedule_group_t schedule_group_create(const 
char *name,
                                        ODP_SCHED_GROUP_NAME_LEN - 1);
                                grp_name[ODP_SCHED_GROUP_NAME_LEN - 1] = 0;
                        }
-                       odp_thrmask_copy(&sched->sched_grp[i].mask, mask);
+
+                       grp_update_mask(i, mask);
                        group = (odp_schedule_group_t)i;
                        sched->sched_grp[i].allocated = 1;
                        break;
@@ -1063,13 +1147,16 @@ static odp_schedule_group_t schedule_group_create(const 
char *name,
 
 static int schedule_group_destroy(odp_schedule_group_t group)
 {
+       odp_thrmask_t zero;
        int ret;
 
+       odp_thrmask_zero(&zero);
+
        odp_spinlock_lock(&sched->grp_lock);
 
        if (group < NUM_SCHED_GRPS && group >= SCHED_GROUP_NAMED &&
            sched->sched_grp[group].allocated) {
-               odp_thrmask_zero(&sched->sched_grp[group].mask);
+               grp_update_mask(group, &zero);
                memset(sched->sched_grp[group].name, 0,
                       ODP_SCHED_GROUP_NAME_LEN);
                sched->sched_grp[group].allocated = 0;
@@ -1109,9 +1196,11 @@ static int schedule_group_join(odp_schedule_group_t 
group,
 
        if (group < NUM_SCHED_GRPS && group >= SCHED_GROUP_NAMED &&
            sched->sched_grp[group].allocated) {
-               odp_thrmask_or(&sched->sched_grp[group].mask,
-                              &sched->sched_grp[group].mask,
-                              mask);
+               odp_thrmask_t new_mask;
+
+               odp_thrmask_or(&new_mask, &sched->sched_grp[group].mask, mask);
+               grp_update_mask(group, &new_mask);
+
                ret = 0;
        } else {
                ret = -1;
@@ -1124,18 +1213,19 @@ static int schedule_group_join(odp_schedule_group_t 
group,
 static int schedule_group_leave(odp_schedule_group_t group,
                                const odp_thrmask_t *mask)
 {
+       odp_thrmask_t new_mask;
        int ret;
 
+       odp_thrmask_xor(&new_mask, mask, &sched->mask_all);
+
        odp_spinlock_lock(&sched->grp_lock);
 
        if (group < NUM_SCHED_GRPS && group >= SCHED_GROUP_NAMED &&
            sched->sched_grp[group].allocated) {
-               odp_thrmask_t leavemask;
+               odp_thrmask_and(&new_mask, &sched->sched_grp[group].mask,
+                               &new_mask);
+               grp_update_mask(group, &new_mask);
 
-               odp_thrmask_xor(&leavemask, mask, &sched->mask_all);
-               odp_thrmask_and(&sched->sched_grp[group].mask,
-                               &sched->sched_grp[group].mask,
-                               &leavemask);
                ret = 0;
        } else {
                ret = -1;
@@ -1186,12 +1276,19 @@ static int schedule_group_info(odp_schedule_group_t 
group,
 
 static int schedule_thr_add(odp_schedule_group_t group, int thr)
 {
+       odp_thrmask_t mask;
+       odp_thrmask_t new_mask;
+
        if (group < 0 || group >= SCHED_GROUP_NAMED)
                return -1;
 
+       odp_thrmask_zero(&mask);
+       odp_thrmask_set(&mask, thr);
+
        odp_spinlock_lock(&sched->grp_lock);
 
-       odp_thrmask_set(&sched->sched_grp[group].mask, thr);
+       odp_thrmask_or(&new_mask, &sched->sched_grp[group].mask, &mask);
+       grp_update_mask(group, &new_mask);
 
        odp_spinlock_unlock(&sched->grp_lock);
 
@@ -1200,12 +1297,20 @@ static int schedule_thr_add(odp_schedule_group_t group, 
int thr)
 
 static int schedule_thr_rem(odp_schedule_group_t group, int thr)
 {
+       odp_thrmask_t mask;
+       odp_thrmask_t new_mask;
+
        if (group < 0 || group >= SCHED_GROUP_NAMED)
                return -1;
 
+       odp_thrmask_zero(&mask);
+       odp_thrmask_set(&mask, thr);
+       odp_thrmask_xor(&new_mask, &mask, &sched->mask_all);
+
        odp_spinlock_lock(&sched->grp_lock);
 
-       odp_thrmask_clr(&sched->sched_grp[group].mask, thr);
+       odp_thrmask_and(&new_mask, &sched->sched_grp[group].mask, &new_mask);
+       grp_update_mask(group, &new_mask);
 
        odp_spinlock_unlock(&sched->grp_lock);
 
@@ -1219,9 +1324,10 @@ static void schedule_prefetch(int num ODP_UNUSED)
 
 static int schedule_sched_queue(uint32_t queue_index)
 {
+       int grp            = sched->queue[queue_index].grp;
        int prio           = sched->queue[queue_index].prio;
        int queue_per_prio = sched->queue[queue_index].queue_per_prio;
-       ring_t *ring       = &sched->prio_q[prio][queue_per_prio].ring;
+       ring_t *ring       = &sched->prio_q[grp][prio][queue_per_prio].ring;
 
        ring_enq(ring, PRIO_QUEUE_MASK, queue_index);
        return 0;
-- 
2.8.1

Reply via email to