<snip> > > +version = 3 > > sources = files('rte_sched.c', 'rte_red.c', 'rte_approx.c') headers > > = files('rte_sched.h', 'rte_sched_common.h', > > 'rte_red.h', 'rte_approx.h') > > diff --git a/lib/librte_sched/rte_sched.c > > b/lib/librte_sched/rte_sched.c index bc06bc3f4..b1f521794 100644 > > --- a/lib/librte_sched/rte_sched.c > > +++ b/lib/librte_sched/rte_sched.c > > @@ -37,6 +37,8 @@ > > > > #define RTE_SCHED_TB_RATE_CONFIG_ERR (1e-7) > > #define RTE_SCHED_WRR_SHIFT 3 > > +#define RTE_SCHED_TRAFFIC_CLASS_BE > > (RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE - 1) > > +#define RTE_SCHED_MAX_QUEUES_PER_TC > > RTE_SCHED_BE_QUEUES_PER_PIPE > > #define RTE_SCHED_GRINDER_PCACHE_SIZE (64 / > > RTE_SCHED_QUEUES_PER_PIPE) > > #define RTE_SCHED_PIPE_INVALID UINT32_MAX > > #define RTE_SCHED_BMP_POS_INVALID UINT32_MAX > > @@ -84,8 +86,9 @@ struct rte_sched_pipe_profile { > > uint32_t > > tc_credits_per_period[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; > > uint8_t tc_ov_weight; > > > > - /* Pipe queues */ > > - uint8_t wrr_cost[RTE_SCHED_QUEUES_PER_PIPE]; > > + /* Pipe best-effort traffic class queues */ > > + uint8_t n_be_queues; > > The n_be_queues is the same for all pipes within the same port, so it does not > make sense to save this per-port value in each pipe profile. At the very > least, > let's move it to the port data structure, please. > > In fact, a better solution (that also simplifies the implementation) is to > enforce > the same queue size for all BE queues, as it does not make sense to have > queues within the same traffic class of different size (see my comment in the > other patch where you update the API). So n_be_queues should always be 4, > therefore no need for this variable. > Thanks for your time and comments. I have removed n_be_queues in v5. 
> > + uint8_t wrr_cost[RTE_SCHED_BE_QUEUES_PER_PIPE]; > > }; > > > > struct rte_sched_pipe { > > @@ -100,8 +103,10 @@ struct rte_sched_pipe { > > uint64_t tc_time; /* time of next update */ > > uint32_t tc_credits[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; > > > > + uint8_t n_be_queues; /* Best effort traffic class queues */ > > Same comment here, even more important, as we need to strive to reduce the > size of this struct for performance reasons. > > > + > > /* Weighted Round Robin (WRR) */ > > - uint8_t wrr_tokens[RTE_SCHED_QUEUES_PER_PIPE]; > > + uint8_t wrr_tokens[RTE_SCHED_BE_QUEUES_PER_PIPE]; > > > > /* TC oversubscription */ > > uint32_t tc_ov_credits; > > @@ -153,16 +158,16 @@ struct rte_sched_grinder { > > uint32_t tc_index; > > struct rte_sched_queue > > *queue[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; > > struct rte_mbuf **qbase[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; > > - uint32_t qindex[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE]; > > - uint16_t qsize; > > + uint32_t qindex[RTE_SCHED_MAX_QUEUES_PER_TC]; > > + uint16_t qsize[RTE_SCHED_MAX_QUEUES_PER_TC]; > > uint32_t qmask; > > uint32_t qpos; > > struct rte_mbuf *pkt; > > > > /* WRR */ > > - uint16_t wrr_tokens[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > > - uint16_t wrr_mask[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > > - uint8_t wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > > + uint16_t wrr_tokens[RTE_SCHED_BE_QUEUES_PER_PIPE]; > > + uint16_t wrr_mask[RTE_SCHED_BE_QUEUES_PER_PIPE]; > > + uint8_t wrr_cost[RTE_SCHED_BE_QUEUES_PER_PIPE]; > > }; > > > > struct rte_sched_port { > > @@ -301,7 +306,6 @@ pipe_profile_check(struct rte_sched_pipe_params > > *params, > > if (params->wrr_weights[i] == 0) > > return -16; > > } > > - > > return 0; > > } > > > > @@ -483,7 +487,7 @@ rte_sched_port_log_pipe_profile(struct > > rte_sched_port *port, uint32_t i) > > " Token bucket: period = %u, credits per period = %u, size = > > %u\n" > > " Traffic classes: period = %u, credits per period = [%u, %u, > > %u, %u]\n" > > " Traffic class 3 
oversubscription: weight = %hhu\n" > > - " WRR cost: [%hhu, %hhu, %hhu, %hhu], [%hhu, %hhu, > > %hhu, %hhu], [%hhu, %hhu, %hhu, %hhu], [%hhu, %hhu, %hhu, %hhu]\n", > > + " WRR cost: [%hhu, %hhu, %hhu, %hhu]\n", > > i, > > > > /* Token bucket */ > > @@ -502,10 +506,7 @@ rte_sched_port_log_pipe_profile(struct > > rte_sched_port *port, uint32_t i) > > p->tc_ov_weight, > > > > /* WRR */ > > - p->wrr_cost[ 0], p->wrr_cost[ 1], p->wrr_cost[ 2], p- > > >wrr_cost[ 3], > > - p->wrr_cost[ 4], p->wrr_cost[ 5], p->wrr_cost[ 6], p- > > >wrr_cost[ 7], > > - p->wrr_cost[ 8], p->wrr_cost[ 9], p->wrr_cost[10], p- > > >wrr_cost[11], > > - p->wrr_cost[12], p->wrr_cost[13], p->wrr_cost[14], p- > > >wrr_cost[15]); > > + p->wrr_cost[0], p->wrr_cost[1], p->wrr_cost[2], p- > > >wrr_cost[3]); > > } > > > > static inline uint64_t > > @@ -519,10 +520,12 @@ rte_sched_time_ms_to_bytes(uint32_t time_ms, > > uint32_t rate) } > > > > static void > > -rte_sched_pipe_profile_convert(struct rte_sched_pipe_params *src, > > +rte_sched_pipe_profile_convert(struct rte_sched_port *port, > > + struct rte_sched_pipe_params *src, > > struct rte_sched_pipe_profile *dst, > > uint32_t rate) > > { > > + uint32_t wrr_cost[RTE_SCHED_BE_QUEUES_PER_PIPE]; > > uint32_t i; > > > > /* Token Bucket */ > > @@ -553,18 +556,36 @@ rte_sched_pipe_profile_convert(struct > > rte_sched_pipe_params *src, > > dst->tc_ov_weight = src->tc_ov_weight; #endif > > > > - /* WRR */ > > - for (i = 0; i < RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE; i++) { > > - uint32_t > > wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS]; > > - uint32_t lcd, lcd1, lcd2; > > - uint32_t qindex; > > + /* WRR queues */ > > + for (i = 0; i < RTE_SCHED_BE_QUEUES_PER_PIPE; i++) > > + if (port->qsize[i]) > > + dst->n_be_queues++; > > + > > + if (dst->n_be_queues == 1) > > + dst->wrr_cost[0] = src->wrr_weights[0]; > > + > > + if (dst->n_be_queues == 2) { > > + uint32_t lcd; > > + > > + wrr_cost[0] = src->wrr_weights[0]; > > + wrr_cost[1] = src->wrr_weights[1]; > > + > > + 
lcd = rte_get_lcd(wrr_cost[0], wrr_cost[1]); > > + > > + wrr_cost[0] = lcd / wrr_cost[0]; > > + wrr_cost[1] = lcd / wrr_cost[1]; > > > > - qindex = i * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS; > > + dst->wrr_cost[0] = (uint8_t) wrr_cost[0]; > > + dst->wrr_cost[1] = (uint8_t) wrr_cost[1]; > > + } > > > > - wrr_cost[0] = src->wrr_weights[qindex]; > > - wrr_cost[1] = src->wrr_weights[qindex + 1]; > > - wrr_cost[2] = src->wrr_weights[qindex + 2]; > > - wrr_cost[3] = src->wrr_weights[qindex + 3]; > > + if (dst->n_be_queues == 4) { > > See the above comment, it is better and simpler to enforce n_be_queues == 4, > which simplifies this code a lot, as it keeps only this branch and removes > the > need for the above two. > Fixed in v5.
> > + uint32_t lcd1, lcd2, lcd; > > + > > + wrr_cost[0] = src->wrr_weights[0]; > > + wrr_cost[1] = src->wrr_weights[1]; > > + wrr_cost[2] = src->wrr_weights[2]; > > + wrr_cost[3] = src->wrr_weights[3]; > > > > lcd1 = rte_get_lcd(wrr_cost[0], wrr_cost[1]); > > lcd2 = rte_get_lcd(wrr_cost[2], wrr_cost[3]); @@ -575,10 > +596,10 @@ > > rte_sched_pipe_profile_convert(struct > > rte_sched_pipe_params *src, > > wrr_cost[2] = lcd / wrr_cost[2]; > > wrr_cost[3] = lcd / wrr_cost[3]; > > > > - dst->wrr_cost[qindex] = (uint8_t) wrr_cost[0]; > > - dst->wrr_cost[qindex + 1] = (uint8_t) wrr_cost[1]; > > - dst->wrr_cost[qindex + 2] = (uint8_t) wrr_cost[2]; > > - dst->wrr_cost[qindex + 3] = (uint8_t) wrr_cost[3]; > > + dst->wrr_cost[0] = (uint8_t) wrr_cost[0]; > > + dst->wrr_cost[1] = (uint8_t) wrr_cost[1]; > > + dst->wrr_cost[2] = (uint8_t) wrr_cost[2]; > > + dst->wrr_cost[3] = (uint8_t) wrr_cost[3]; > > } > > } > > > > @@ -592,7 +613,7 @@ rte_sched_port_config_pipe_profile_table(struct > > rte_sched_port *port, > > struct rte_sched_pipe_params *src = params->pipe_profiles > > + i; > > struct rte_sched_pipe_profile *dst = port->pipe_profiles + i; > > > > - rte_sched_pipe_profile_convert(src, dst, params->rate); > > + rte_sched_pipe_profile_convert(port, src, dst, params- > > >rate); > > rte_sched_port_log_pipe_profile(port, i); > > } > > > > @@ -976,7 +997,7 @@ rte_sched_port_pipe_profile_add(struct > > rte_sched_port *port, > > return status; > > > > pp = &port->pipe_profiles[port->n_pipe_profiles]; > > - rte_sched_pipe_profile_convert(params, pp, port->rate); > > + rte_sched_pipe_profile_convert(port, params, pp, port->rate); > > > > /* Pipe profile not exists */ > > for (i = 0; i < port->n_pipe_profiles; i++) @@ -1715,6 +1736,7 @@ > > grinder_schedule(struct rte_sched_port *port, uint32_t pos) > > struct rte_sched_queue *queue = grinder->queue[grinder->qpos]; > > struct rte_mbuf *pkt = grinder->pkt; > > uint32_t pkt_len = pkt->pkt_len + port->frame_overhead; > > + int 
be_tc_active; > > > > if (!grinder_credits_check(port, pos)) > > return 0; > > @@ -1725,13 +1747,18 @@ grinder_schedule(struct rte_sched_port *port, > > uint32_t pos) > > /* Send packet */ > > port->pkts_out[port->n_pkts_out++] = pkt; > > queue->qr++; > > - grinder->wrr_tokens[grinder->qpos] += pkt_len * grinder- > > >wrr_cost[grinder->qpos]; > > + > > + be_tc_active = (grinder->tc_index == > > RTE_SCHED_TRAFFIC_CLASS_BE); > > + grinder->wrr_tokens[grinder->qpos] += > > + pkt_len * grinder->wrr_cost[grinder->qpos] * be_tc_active; > > + > > Integer multiplication is very expensive, you can easily avoid it by doing > bitwise-and with a mask whose values are either 0 or all-ones. > Replace multiplication with bitwise & operation in v5. > > if (queue->qr == queue->qw) { > > uint32_t qindex = grinder->qindex[grinder->qpos]; > > > > rte_bitmap_clear(port->bmp, qindex); > > grinder->qmask &= ~(1 << grinder->qpos); > > - grinder->wrr_mask[grinder->qpos] = 0; > > + if (be_tc_active) > > + grinder->wrr_mask[grinder->qpos] = 0; > > rte_sched_port_set_queue_empty_timestamp(port, > > qindex); > > } > > > > @@ -1877,7 +1904,7 @@ grinder_next_tc(struct rte_sched_port *port, > > uint32_t pos) > > > > grinder->tc_index = (qindex >> 2) & 0x3; > > grinder->qmask = grinder->tccache_qmask[grinder->tccache_r]; > > - grinder->qsize = qsize; > > + grinder->qsize[grinder->tc_index] = qsize; > > > > grinder->qindex[0] = qindex; > > grinder->qindex[1] = qindex + 1; > > @@ -1962,26 +1989,15 @@ grinder_wrr_load(struct rte_sched_port *port, > > uint32_t pos) > > struct rte_sched_grinder *grinder = port->grinder + pos; > > struct rte_sched_pipe *pipe = grinder->pipe; > > struct rte_sched_pipe_profile *pipe_params = grinder- > > >pipe_params; > > - uint32_t tc_index = grinder->tc_index; > > uint32_t qmask = grinder->qmask; > > - uint32_t qindex; > > - > > - qindex = tc_index * 4; > > - > > - grinder->wrr_tokens[0] = ((uint16_t) pipe->wrr_tokens[qindex]) << > > RTE_SCHED_WRR_SHIFT; > > - 
grinder->wrr_tokens[1] = ((uint16_t) pipe->wrr_tokens[qindex + 1]) > > << RTE_SCHED_WRR_SHIFT; > > - grinder->wrr_tokens[2] = ((uint16_t) pipe->wrr_tokens[qindex + 2]) > > << RTE_SCHED_WRR_SHIFT; > > - grinder->wrr_tokens[3] = ((uint16_t) pipe->wrr_tokens[qindex + 3]) > > << RTE_SCHED_WRR_SHIFT; > > - > > - grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFF; > > - grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFF; > > - grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFF; > > - grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFF; > > + uint32_t i; > > > > - grinder->wrr_cost[0] = pipe_params->wrr_cost[qindex]; > > - grinder->wrr_cost[1] = pipe_params->wrr_cost[qindex + 1]; > > - grinder->wrr_cost[2] = pipe_params->wrr_cost[qindex + 2]; > > - grinder->wrr_cost[3] = pipe_params->wrr_cost[qindex + 3]; > > + for (i = 0; i < pipe->n_be_queues; i++) { > > + grinder->wrr_tokens[i] = > > + ((uint16_t) pipe->wrr_tokens[i]) << > > RTE_SCHED_WRR_SHIFT; > > + grinder->wrr_mask[i] = ((qmask >> i) & 0x1) * 0xFFFF; > > + grinder->wrr_cost[i] = pipe_params->wrr_cost[i]; > > + } > > } > > > > static inline void > > @@ -1989,19 +2005,12 @@ grinder_wrr_store(struct rte_sched_port *port, > > uint32_t pos) { > > struct rte_sched_grinder *grinder = port->grinder + pos; > > struct rte_sched_pipe *pipe = grinder->pipe; > > - uint32_t tc_index = grinder->tc_index; > > - uint32_t qindex; > > - > > - qindex = tc_index * 4; > > + uint32_t i; > > > > - pipe->wrr_tokens[qindex] = (grinder->wrr_tokens[0] & grinder- > > >wrr_mask[0]) > > - >> RTE_SCHED_WRR_SHIFT; > > - pipe->wrr_tokens[qindex + 1] = (grinder->wrr_tokens[1] & grinder- > > >wrr_mask[1]) > > - >> RTE_SCHED_WRR_SHIFT; > > - pipe->wrr_tokens[qindex + 2] = (grinder->wrr_tokens[2] & grinder- > > >wrr_mask[2]) > > - >> RTE_SCHED_WRR_SHIFT; > > - pipe->wrr_tokens[qindex + 3] = (grinder->wrr_tokens[3] & grinder- > > >wrr_mask[3]) > > - >> RTE_SCHED_WRR_SHIFT; > > + for (i = 0; i < pipe->n_be_queues; i++) > > + pipe->wrr_tokens[i] = > > + 
(grinder->wrr_tokens[i] & grinder->wrr_mask[i]) >> > > + RTE_SCHED_WRR_SHIFT; > > } > > > > static inline void > > @@ -2040,22 +2049,31 @@ static inline void > > grinder_prefetch_tc_queue_arrays(struct rte_sched_port *port, uint32_t > > pos) > > { > > struct rte_sched_grinder *grinder = port->grinder + pos; > > - uint16_t qsize, qr[4]; > > + struct rte_sched_pipe *pipe = grinder->pipe; > > + struct rte_sched_queue *queue; > > + uint32_t i; > > + uint16_t qsize, qr[RTE_SCHED_MAX_QUEUES_PER_TC]; > > > > - qsize = grinder->qsize; > > - qr[0] = grinder->queue[0]->qr & (qsize - 1); > > - qr[1] = grinder->queue[1]->qr & (qsize - 1); > > - qr[2] = grinder->queue[2]->qr & (qsize - 1); > > - qr[3] = grinder->queue[3]->qr & (qsize - 1); > > + grinder->qpos = 0; > > + if (grinder->tc_index < RTE_SCHED_TRAFFIC_CLASS_BE) { > > + queue = grinder->queue[0]; > > + qsize = grinder->qsize[0]; > > + qr[0] = queue->qr & (qsize - 1); > > > > - rte_prefetch0(grinder->qbase[0] + qr[0]); > > - rte_prefetch0(grinder->qbase[1] + qr[1]); > > + rte_prefetch0(grinder->qbase[0] + qr[0]); > > + return; > > + } > > + > > + for (i = 0; i < pipe->n_be_queues; i++) { > > + queue = grinder->queue[i]; > > + qsize = grinder->qsize[i]; > > + qr[i] = queue->qr & (qsize - 1); > > + > > + rte_prefetch0(grinder->qbase[i] + qr[i]); > > + } > > > > grinder_wrr_load(port, pos); > > grinder_wrr(port, pos); > > - > > - rte_prefetch0(grinder->qbase[2] + qr[2]); > > - rte_prefetch0(grinder->qbase[3] + qr[3]); > > } > > > > static inline void > > @@ -2064,7 +2082,7 @@ grinder_prefetch_mbuf(struct rte_sched_port > > *port, uint32_t pos) > > struct rte_sched_grinder *grinder = port->grinder + pos; > > uint32_t qpos = grinder->qpos; > > struct rte_mbuf **qbase = grinder->qbase[qpos]; > > - uint16_t qsize = grinder->qsize; > > + uint16_t qsize = grinder->qsize[qpos]; > > uint16_t qr = grinder->queue[qpos]->qr & (qsize - 1); > > > > grinder->pkt = qbase[qr]; > > @@ -2118,18 +2136,24 @@ grinder_handle(struct 
rte_sched_port *port, > > uint32_t pos) > > > > case e_GRINDER_READ_MBUF: > > { > > - uint32_t result = 0; > > + uint32_t wrr_active, result = 0; > > > > result = grinder_schedule(port, pos); > > > > + wrr_active = (grinder->tc_index == > > RTE_SCHED_TRAFFIC_CLASS_BE); > > + > > /* Look for next packet within the same TC */ > > if (result && grinder->qmask) { > > - grinder_wrr(port, pos); > > + if (wrr_active) > > + grinder_wrr(port, pos); > > + > > grinder_prefetch_mbuf(port, pos); > > > > return 1; > > } > > - grinder_wrr_store(port, pos); > > + > > + if (wrr_active) > > + grinder_wrr_store(port, pos); > > > > /* Look for another active TC within same pipe */ > > if (grinder_next_tc(port, pos)) { > > diff --git a/lib/librte_sched/rte_sched.h > > b/lib/librte_sched/rte_sched.h index d61dda9f5..2a935998a 100644 > > --- a/lib/librte_sched/rte_sched.h > > +++ b/lib/librte_sched/rte_sched.h > > @@ -66,6 +66,22 @@ extern "C" { > > #include "rte_red.h" > > #endif > > > > +/** Maximum number of queues per pipe. > > + * Note that the multiple queues (power of 2) can only be assigned to > > + * lowest priority (best-effort) traffic class. Other higher priority > > +traffic > > + * classes can only have one queue. > > + * Can not change. > > + * > > + * @see struct rte_sched_port_params > > + */ > > +#define RTE_SCHED_QUEUES_PER_PIPE 16 > > + > > +/** Number of WRR queues for best-effort traffic class per pipe. > > + * > > + * @see struct rte_sched_pipe_params > > + */ > > +#define RTE_SCHED_BE_QUEUES_PER_PIPE 4 > > + > > /** Number of traffic classes per pipe (as well as subport). > > * Cannot be changed. > > */ > > @@ -74,11 +90,6 @@ extern "C" { > > /** Number of queues per pipe traffic class. Cannot be changed. */ > > #define RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS 4 > > > > -/** Number of queues per pipe. 
*/ > > -#define RTE_SCHED_QUEUES_PER_PIPE \ > > - (RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE * \ > > - RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS) > > - > > /** Maximum number of pipe profiles that can be defined per port. > > * Compile-time configurable. > > */ > > @@ -165,7 +176,7 @@ struct rte_sched_pipe_params { #endif > > > > /* Pipe queues */ > > - uint8_t wrr_weights[RTE_SCHED_QUEUES_PER_PIPE]; /**< WRR > > weights */ > > + uint8_t wrr_weights[RTE_SCHED_BE_QUEUES_PER_PIPE]; /**< > > WRR weights */ > > }; > > > > /** Queue statistics */ > > -- > > 2.21.0