Ping. This patch set removes the non-deterministic latency, lowered QoS and potential queue starvation that are caused by this code ...
- if (grp > ODP_SCHED_GROUP_ALL && - !odp_thrmask_isset(&sched->sched_grp[grp].mask, - sched_local.thr)) { - /* This thread is not eligible for work from - * this queue, so continue scheduling it. - */ - ring_enq(ring, PRIO_QUEUE_MASK, qi); - - i++; - id++; - continue; - } ... which sends queues of "wrong" group back to the end of the priority queue. If e.g. tens of threads are sending it back and only one thread would accept it, it's actually very likely that queue service level is much lower than it should be. Improved latency can be seen already with the new l2fwd -g option. -Petri > -----Original Message----- > From: lng-odp [mailto:lng-odp-boun...@lists.linaro.org] On Behalf Of Petri > Savolainen > Sent: Thursday, April 06, 2017 2:59 PM > To: lng-odp@lists.linaro.org > Subject: [lng-odp] [PATCH 3/3] linux-gen: sched: optimize group scheduling > > Use separate priority queues for different groups. Sharing > the same priority queue over multiple groups caused multiple > issues: > * latency and ordering issues when threads push back > events (from wrong groups) to the tail of the priority queue > * unnecessary contention (scaling issues) when threads belong > to different groups > > Lowered the maximum number of groups from 256 to 32 (in the default > configuration) to limit memory usage of priority queues. This should > be enough for the most users. 
> > Signed-off-by: Petri Savolainen <petri.savolai...@linaro.org> > --- > platform/linux-generic/odp_schedule.c | 284 +++++++++++++++++++++++------ > ----- > 1 file changed, 195 insertions(+), 89 deletions(-) > > diff --git a/platform/linux-generic/odp_schedule.c b/platform/linux- > generic/odp_schedule.c > index e7079b9..f366e7e 100644 > --- a/platform/linux-generic/odp_schedule.c > +++ b/platform/linux-generic/odp_schedule.c > @@ -34,7 +34,7 @@ ODP_STATIC_ASSERT((ODP_SCHED_PRIO_NORMAL > 0) && > "normal_prio_is_not_between_highest_and_lowest"); > > /* Number of scheduling groups */ > -#define NUM_SCHED_GRPS 256 > +#define NUM_SCHED_GRPS 32 > > /* Priority queues per priority */ > #define QUEUES_PER_PRIO 4 > @@ -163,7 +163,11 @@ typedef struct { > ordered_stash_t stash[MAX_ORDERED_STASH]; > } ordered; > > + uint32_t grp_epoch; > + int num_grp; > + uint8_t grp[NUM_SCHED_GRPS]; > uint8_t weight_tbl[WEIGHT_TBL_SIZE]; > + uint8_t grp_weight[WEIGHT_TBL_SIZE]; > > } sched_local_t; > > @@ -199,7 +203,7 @@ typedef struct { > pri_mask_t pri_mask[NUM_PRIO]; > odp_spinlock_t mask_lock; > > - prio_queue_t prio_q[NUM_PRIO][QUEUES_PER_PRIO]; > + prio_queue_t > prio_q[NUM_SCHED_GRPS][NUM_PRIO][QUEUES_PER_PRIO]; > > odp_spinlock_t poll_cmd_lock; > /* Number of commands in a command queue */ > @@ -214,8 +218,10 @@ typedef struct { > odp_shm_t shm; > uint32_t pri_count[NUM_PRIO][QUEUES_PER_PRIO]; > > - odp_spinlock_t grp_lock; > - odp_thrmask_t mask_all; > + odp_thrmask_t mask_all; > + odp_spinlock_t grp_lock; > + odp_atomic_u32_t grp_epoch; > + > struct { > char name[ODP_SCHED_GROUP_NAME_LEN]; > odp_thrmask_t mask; > @@ -223,6 +229,7 @@ typedef struct { > } sched_grp[NUM_SCHED_GRPS]; > > struct { > + int grp; > int prio; > int queue_per_prio; > } queue[ODP_CONFIG_QUEUES]; > @@ -273,7 +280,7 @@ static void sched_local_init(void) > static int schedule_init_global(void) > { > odp_shm_t shm; > - int i, j; > + int i, j, grp; > > ODP_DBG("Schedule init ... 
"); > > @@ -293,15 +300,20 @@ static int schedule_init_global(void) > sched->shm = shm; > odp_spinlock_init(&sched->mask_lock); > > - for (i = 0; i < NUM_PRIO; i++) { > - for (j = 0; j < QUEUES_PER_PRIO; j++) { > - int k; > + for (grp = 0; grp < NUM_SCHED_GRPS; grp++) { > + for (i = 0; i < NUM_PRIO; i++) { > + for (j = 0; j < QUEUES_PER_PRIO; j++) { > + prio_queue_t *prio_q; > + int k; > > - ring_init(&sched->prio_q[i][j].ring); > + prio_q = &sched- > >prio_q[grp][i][j]; > + ring_init(&prio_q->ring); > > - for (k = 0; k < PRIO_QUEUE_RING_SIZE; k++) > - sched- > >prio_q[i][j].queue_index[k] = > - PRIO_QUEUE_EMPTY; > + for (k = 0; k < > PRIO_QUEUE_RING_SIZE; k++) { > + prio_q- > >queue_index[k] = > + PRIO_QUEUE_EMPTY; > + } > + } > } > } > > @@ -317,12 +329,17 @@ static int schedule_init_global(void) > sched->pktio_cmd[i].cmd_index = PKTIO_CMD_FREE; > > odp_spinlock_init(&sched->grp_lock); > + odp_atomic_init_u32(&sched->grp_epoch, 0); > > for (i = 0; i < NUM_SCHED_GRPS; i++) { > memset(sched->sched_grp[i].name, 0, > ODP_SCHED_GROUP_NAME_LEN); > odp_thrmask_zero(&sched->sched_grp[i].mask); > } > > + sched->sched_grp[ODP_SCHED_GROUP_ALL].allocated = 1; > + sched->sched_grp[ODP_SCHED_GROUP_WORKER].allocated = 1; > + sched->sched_grp[ODP_SCHED_GROUP_CONTROL].allocated = 1; > + > odp_thrmask_setall(&sched->mask_all); > > ODP_DBG("done\n"); > @@ -330,29 +347,38 @@ static int schedule_init_global(void) > return 0; > } > > +static inline void queue_destroy_finalize(uint32_t qi) > +{ > + sched_cb_queue_destroy_finalize(qi); > +} > + > static int schedule_term_global(void) > { > int ret = 0; > int rc = 0; > - int i, j; > + int i, j, grp; > > - for (i = 0; i < NUM_PRIO; i++) { > - for (j = 0; j < QUEUES_PER_PRIO; j++) { > - ring_t *ring = &sched->prio_q[i][j].ring; > - uint32_t qi; > + for (grp = 0; grp < NUM_SCHED_GRPS; grp++) { > + for (i = 0; i < NUM_PRIO; i++) { > + for (j = 0; j < QUEUES_PER_PRIO; j++) { > + ring_t *ring = &sched- > >prio_q[grp][i][j].ring; > + uint32_t 
qi; > > - while ((qi = ring_deq(ring, > PRIO_QUEUE_MASK)) != > - RING_EMPTY) { > - odp_event_t events[1]; > - int num; > + while ((qi = ring_deq(ring, > PRIO_QUEUE_MASK)) != > + RING_EMPTY) { > + odp_event_t > events[1]; > + int num; > > - num = > sched_cb_queue_deq_multi(qi, events, 1); > + num = > sched_cb_queue_deq_multi(qi, > + > events, > + > 1); > > - if (num < 0) > - > sched_cb_queue_destroy_finalize(qi); > + if (num < 0) > + > queue_destroy_finalize(qi); > > - if (num > 0) > - ODP_ERR("Queue not > empty\n"); > + if (num > 0) > + > ODP_ERR("Queue not empty\n"); > + } > } > } > } > @@ -383,6 +409,40 @@ static int schedule_term_local(void) > return 0; > } > > +static inline void grp_update_mask(int grp, const odp_thrmask_t > *new_mask) > +{ > + odp_thrmask_copy(&sched->sched_grp[grp].mask, new_mask); > + odp_atomic_add_rel_u32(&sched->grp_epoch, 1); > +} > + > +static inline int grp_update_tbl(void) > +{ > + int i; > + int num = 0; > + int thr = sched_local.thr; > + > + odp_spinlock_lock(&sched->grp_lock); > + > + for (i = 0; i < NUM_SCHED_GRPS; i++) { > + if (sched->sched_grp[i].allocated == 0) > + continue; > + > + if (odp_thrmask_isset(&sched->sched_grp[i].mask, > thr)) { > + sched_local.grp[num] = i; > + num++; > + } > + } > + > + odp_spinlock_unlock(&sched->grp_lock); > + > + /* Update group weights. Round robin over all thread's groups. 
> */ > + for (i = 0; i < WEIGHT_TBL_SIZE; i++) > + sched_local.grp_weight[i] = i % num; > + > + sched_local.num_grp = num; > + return num; > +} > + > static unsigned schedule_max_ordered_locks(void) > { > return MAX_ORDERED_LOCKS_PER_QUEUE; > @@ -433,6 +493,7 @@ static int schedule_init_queue(uint32_t queue_index, > int prio = sched_param->prio; > > pri_set_queue(queue_index, prio); > + sched->queue[queue_index].grp = sched_param->group; > sched->queue[queue_index].prio = prio; > sched->queue[queue_index].queue_per_prio = > queue_per_prio(queue_index); > > @@ -444,6 +505,7 @@ static void schedule_destroy_queue(uint32_t > queue_index) > int prio = sched->queue[queue_index].prio; > > pri_clr_queue(queue_index, prio); > + sched->queue[queue_index].grp = 0; > sched->queue[queue_index].prio = 0; > sched->queue[queue_index].queue_per_prio = 0; > } > @@ -535,9 +597,10 @@ static void schedule_release_atomic(void) > uint32_t qi = sched_local.queue_index; > > if (qi != PRIO_QUEUE_EMPTY && sched_local.num == 0) { > - int prio = sched->queue[qi].prio; > + int grp = sched->queue[qi].grp; > + int prio = sched->queue[qi].prio; > int queue_per_prio = sched- > >queue[qi].queue_per_prio; > - ring_t *ring = &sched- > >prio_q[prio][queue_per_prio].ring; > + ring_t *ring = &sched- > >prio_q[grp][prio][queue_per_prio].ring; > > /* Release current atomic queue */ > ring_enq(ring, PRIO_QUEUE_MASK, qi); > @@ -688,42 +751,14 @@ static int schedule_ord_enq_multi(uint32_t > queue_index, void *buf_hdr[], > return 1; > } > > -/* > - * Schedule queues > - */ > -static int do_schedule(odp_queue_t *out_queue, odp_event_t out_ev[], > - unsigned int max_num) > +static inline int do_schedule_grp(odp_queue_t *out_queue, odp_event_t > out_ev[], > + unsigned int max_num, int > grp, int first) > { > int prio, i; > int ret; > - int id, first; > + int id; > unsigned int max_deq = MAX_DEQ; > uint32_t qi; > - uint16_t round; > - > - if (sched_local.num) { > - ret = copy_events(out_ev, max_num); > - > - if 
(out_queue) > - *out_queue = sched_local.queue; > - > - return ret; > - } > - > - schedule_release_context(); > - > - if (odp_unlikely(sched_local.pause)) > - return 0; > - > - /* Each thread prefers a priority queue. Poll weight table > avoids > - * starvation of other priority queues on low thread counts. */ > - round = sched_local.round + 1; > - > - if (odp_unlikely(round == WEIGHT_TBL_SIZE)) > - round = 0; > - > - sched_local.round = round; > - first = sched_local.weight_tbl[round]; > > /* Schedule events */ > for (prio = 0; prio < NUM_PRIO; prio++) { > @@ -736,7 +771,6 @@ static int do_schedule(odp_queue_t *out_queue, > odp_event_t out_ev[], > > for (i = 0; i < QUEUES_PER_PRIO;) { > int num; > - int grp; > int ordered; > odp_queue_t handle; > ring_t *ring; > @@ -753,7 +787,7 @@ static int do_schedule(odp_queue_t *out_queue, > odp_event_t out_ev[], > } > > /* Get queue index from the priority queue > */ > - ring = &sched->prio_q[prio][id].ring; > + ring = &sched->prio_q[grp][prio][id].ring; > qi = ring_deq(ring, PRIO_QUEUE_MASK); > > /* Priority queue empty */ > @@ -763,21 +797,6 @@ static int do_schedule(odp_queue_t *out_queue, > odp_event_t out_ev[], > continue; > } > > - grp = sched_cb_queue_grp(qi); > - > - if (grp > ODP_SCHED_GROUP_ALL && > - !odp_thrmask_isset(&sched- > >sched_grp[grp].mask, > - > sched_local.thr)) { > - /* This thread is not eligible > for work from > - * this queue, so continue > scheduling it. > - */ > - ring_enq(ring, PRIO_QUEUE_MASK, > qi); > - > - i++; > - id++; > - continue; > - } > - > /* Low priorities have smaller batch size > to limit > * head of line blocking latency. 
*/ > if (odp_unlikely(prio > > ODP_SCHED_PRIO_DEFAULT)) > @@ -845,6 +864,70 @@ static int do_schedule(odp_queue_t *out_queue, > odp_event_t out_ev[], > } > } > > + return 0; > +} > + > +/* > + * Schedule queues > + */ > +static inline int do_schedule(odp_queue_t *out_queue, odp_event_t > out_ev[], > + unsigned int max_num) > +{ > + int i, num_grp; > + int ret; > + int id, first, grp_id; > + uint16_t round; > + uint32_t epoch; > + > + if (sched_local.num) { > + ret = copy_events(out_ev, max_num); > + > + if (out_queue) > + *out_queue = sched_local.queue; > + > + return ret; > + } > + > + schedule_release_context(); > + > + if (odp_unlikely(sched_local.pause)) > + return 0; > + > + /* Each thread prefers a priority queue. Poll weight table > avoids > + * starvation of other priority queues on low thread counts. */ > + round = sched_local.round + 1; > + > + if (odp_unlikely(round == WEIGHT_TBL_SIZE)) > + round = 0; > + > + sched_local.round = round; > + first = sched_local.weight_tbl[round]; > + > + epoch = odp_atomic_load_acq_u32(&sched->grp_epoch); > + num_grp = sched_local.num_grp; > + > + if (odp_unlikely(sched_local.grp_epoch != epoch)) { > + num_grp = grp_update_tbl(); > + sched_local.grp_epoch = epoch; > + } > + > + grp_id = sched_local.grp_weight[round]; > + > + /* Schedule queues per group and priority */ > + for (i = 0; i < num_grp; i++) { > + int grp; > + > + grp = sched_local.grp[grp_id]; > + ret = do_schedule_grp(out_queue, out_ev, max_num, > grp, first); > + > + if (odp_likely(ret)) > + return ret; > + > + grp_id++; > + if (odp_unlikely(grp_id >= num_grp)) > + grp_id = 0; > + } > + > /* > * Poll packet input when there are no events > * * Each thread starts the search for a poll command from > its > @@ -1050,7 +1133,8 @@ static odp_schedule_group_t > schedule_group_create(const char *name, > > ODP_SCHED_GROUP_NAME_LEN - 1); > > grp_name[ODP_SCHED_GROUP_NAME_LEN - 1] = 0; > } > - odp_thrmask_copy(&sched- > >sched_grp[i].mask, mask); > + > + 
grp_update_mask(i, mask); > group = (odp_schedule_group_t)i; > sched->sched_grp[i].allocated = 1; > break; > @@ -1063,13 +1147,16 @@ static odp_schedule_group_t > schedule_group_create(const char *name, > > static int schedule_group_destroy(odp_schedule_group_t group) > { > + odp_thrmask_t zero; > int ret; > > + odp_thrmask_zero(&zero); > + > odp_spinlock_lock(&sched->grp_lock); > > if (group < NUM_SCHED_GRPS && group >= SCHED_GROUP_NAMED && > sched->sched_grp[group].allocated) { > - odp_thrmask_zero(&sched->sched_grp[group].mask); > + grp_update_mask(group, &zero); > memset(sched->sched_grp[group].name, 0, > ODP_SCHED_GROUP_NAME_LEN); > sched->sched_grp[group].allocated = 0; > @@ -1109,9 +1196,11 @@ static int schedule_group_join(odp_schedule_group_t > group, > > if (group < NUM_SCHED_GRPS && group >= SCHED_GROUP_NAMED && > sched->sched_grp[group].allocated) { > - odp_thrmask_or(&sched->sched_grp[group].mask, > - &sched->sched_grp[group].mask, > - mask); > + odp_thrmask_t new_mask; > + > + odp_thrmask_or(&new_mask, &sched- > >sched_grp[group].mask, mask); > + grp_update_mask(group, &new_mask); > + > ret = 0; > } else { > ret = -1; > @@ -1124,18 +1213,19 @@ static int > schedule_group_join(odp_schedule_group_t group, > static int schedule_group_leave(odp_schedule_group_t group, > const odp_thrmask_t *mask) > { > + odp_thrmask_t new_mask; > int ret; > > + odp_thrmask_xor(&new_mask, mask, &sched->mask_all); > + > odp_spinlock_lock(&sched->grp_lock); > > if (group < NUM_SCHED_GRPS && group >= SCHED_GROUP_NAMED && > sched->sched_grp[group].allocated) { > - odp_thrmask_t leavemask; > + odp_thrmask_and(&new_mask, &sched- > >sched_grp[group].mask, > + &new_mask); > + grp_update_mask(group, &new_mask); > > - odp_thrmask_xor(&leavemask, mask, &sched->mask_all); > - odp_thrmask_and(&sched->sched_grp[group].mask, > - &sched->sched_grp[group].mask, > - &leavemask); > ret = 0; > } else { > ret = -1; > @@ -1186,12 +1276,19 @@ static int > schedule_group_info(odp_schedule_group_t 
group, > > static int schedule_thr_add(odp_schedule_group_t group, int thr) > { > + odp_thrmask_t mask; > + odp_thrmask_t new_mask; > + > if (group < 0 || group >= SCHED_GROUP_NAMED) > return -1; > > + odp_thrmask_zero(&mask); > + odp_thrmask_set(&mask, thr); > + > odp_spinlock_lock(&sched->grp_lock); > > - odp_thrmask_set(&sched->sched_grp[group].mask, thr); > + odp_thrmask_or(&new_mask, &sched->sched_grp[group].mask, > &mask); > + grp_update_mask(group, &new_mask); > > odp_spinlock_unlock(&sched->grp_lock); > > @@ -1200,12 +1297,20 @@ static int schedule_thr_add(odp_schedule_group_t > group, int thr) > > static int schedule_thr_rem(odp_schedule_group_t group, int thr) > { > + odp_thrmask_t mask; > + odp_thrmask_t new_mask; > + > if (group < 0 || group >= SCHED_GROUP_NAMED) > return -1; > > + odp_thrmask_zero(&mask); > + odp_thrmask_set(&mask, thr); > + odp_thrmask_xor(&new_mask, &mask, &sched->mask_all); > + > odp_spinlock_lock(&sched->grp_lock); > > - odp_thrmask_clr(&sched->sched_grp[group].mask, thr); > + odp_thrmask_and(&new_mask, &sched->sched_grp[group].mask, > &new_mask); > + grp_update_mask(group, &new_mask); > > odp_spinlock_unlock(&sched->grp_lock); > > @@ -1219,9 +1324,10 @@ static void schedule_prefetch(int num ODP_UNUSED) > > static int schedule_sched_queue(uint32_t queue_index) > { > + int grp = sched->queue[queue_index].grp; > int prio = sched->queue[queue_index].prio; > int queue_per_prio = sched->queue[queue_index].queue_per_prio; > - ring_t *ring = &sched->prio_q[prio][queue_per_prio].ring; > + ring_t *ring = &sched- > >prio_q[grp][prio][queue_per_prio].ring; > > ring_enq(ring, PRIO_QUEUE_MASK, queue_index); > return 0; > -- > 2.8.1