Add an API to support multiple hardware queues on an Ethernet device. Also add a round-robin scheduler (sch_rr), a qdisc that applies no scheduling policy of its own, for hardware with multiple queues.
Signed-off-by: Peter P Waskiewicz Jr <[EMAIL PROTECTED]> --- include/linux/etherdevice.h | 3 include/linux/netdevice.h | 62 +++++ include/linux/pkt_sched.h | 11 + include/linux/skbuff.h | 2 net/core/dev.c | 27 ++ net/core/skbuff.c | 3 net/ethernet/eth.c | 9 - net/sched/Kconfig | 22 ++ net/sched/Makefile | 1 net/sched/sch_generic.c | 4 net/sched/sch_prio.c | 66 +++++- net/sched/sch_rr.c | 516 +++++++++++++++++++++++++++++++++++++++++++ 12 files changed, 706 insertions(+), 20 deletions(-) diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 071c67a..283e687 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -39,7 +39,8 @@ extern void eth_header_cache_update(struct hh_cache *hh, struct net_device *dev extern int eth_header_cache(struct neighbour *neigh, struct hh_cache *hh); -extern struct net_device *alloc_etherdev(int sizeof_priv); +extern struct net_device *alloc_etherdev_mq(int sizeof_priv, int queue_count); +#define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1) static inline void eth_copy_and_sum (struct sk_buff *dest, const unsigned char *src, int len, int base) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f671cd2..376a0d2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -108,6 +108,14 @@ struct wireless_dev; #define MAX_HEADER (LL_MAX_HEADER + 48) #endif +struct net_device_subqueue +{ + /* Give a control state for each queue. This struct may contain + * per-queue locks in the future. + */ + unsigned long state; +}; + /* * Network device statistics. Akin to the 2.0 ether stats but * with byte counters. @@ -325,6 +333,7 @@ struct net_device #define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */ #define NETIF_F_GSO 2048 /* Enable software GSO. 
*/ #define NETIF_F_LLTX 4096 /* LockLess TX */ +#define NETIF_F_MULTI_QUEUE 16384 /* Has multiple TX/RX queues */ /* Segmentation offload features */ #define NETIF_F_GSO_SHIFT 16 @@ -540,6 +549,10 @@ struct net_device struct device dev; /* space for optional statistics and wireless sysfs groups */ struct attribute_group *sysfs_groups[3]; + + /* The TX queue control structures */ + struct net_device_subqueue *egress_subqueue; + int egress_subqueue_count; }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -702,6 +715,48 @@ static inline int netif_running(const struct net_device *dev) return test_bit(__LINK_STATE_START, &dev->state); } +/* + * Routines to manage the subqueues on a device. We only need start + * stop, and a check if it's stopped. All other device management is + * done at the overall netdevice level. + * Also test the device if we're multiqueue. + */ +static inline void netif_start_subqueue(struct net_device *dev, u16 queue_index) +{ + clear_bit(__LINK_STATE_XOFF, &dev->egress_subqueue[queue_index].state); +} + +static inline void netif_stop_subqueue(struct net_device *dev, u16 queue_index) +{ +#ifdef CONFIG_NETPOLL_TRAP + if (netpoll_trap()) + return; +#endif + set_bit(__LINK_STATE_XOFF, &dev->egress_subqueue[queue_index].state); +} + +static inline int netif_subqueue_stopped(const struct net_device *dev, + u16 queue_index) +{ + return test_bit(__LINK_STATE_XOFF, + &dev->egress_subqueue[queue_index].state); +} + +static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index) +{ +#ifdef CONFIG_NETPOLL_TRAP + if (netpoll_trap()) + return; +#endif + if (test_and_clear_bit(__LINK_STATE_XOFF, + &dev->egress_subqueue[queue_index].state)) + __netif_schedule(dev); +} + +static inline int netif_is_multiqueue(const struct net_device *dev) +{ + return (!!(NETIF_F_MULTI_QUEUE & dev->features)); +} /* Use this variant when it is known for sure that it * is executing from interrupt context. 
@@ -995,8 +1050,11 @@ static inline void netif_tx_disable(struct net_device *dev) extern void ether_setup(struct net_device *dev); /* Support for loadable net-drivers */ -extern struct net_device *alloc_netdev(int sizeof_priv, const char *name, - void (*setup)(struct net_device *)); +extern struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, + void (*setup)(struct net_device *), + int queue_count); +#define alloc_netdev(sizeof_priv, name, setup) \ + alloc_netdev_mq(sizeof_priv, name, setup, 1) extern int register_netdev(struct net_device *dev); extern void unregister_netdev(struct net_device *dev); /* Functions used for multicast support */ diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index d10f353..0d1adaf 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -22,6 +22,7 @@ #define TC_PRIO_CONTROL 7 #define TC_PRIO_MAX 15 +#define TC_RR_MAX 15 /* Generic queue statistics, available for all the elements. Particular schedulers may have also their private records. 
@@ -90,6 +91,16 @@ struct tc_fifo_qopt __u32 limit; /* Queue length: bytes for bfifo, packets for pfifo */ }; +/* RR section */ +#define TCQ_RR_BANDS 16 +#define TCQ_MIN_RR_BANDS 2 + +struct tc_rr_qopt +{ + int bands; /* Number of bands */ + __u8 priomap[TC_RR_MAX+1]; /* Map: Linux priority -> RR band */ +}; + /* PRIO section */ #define TCQ_PRIO_BANDS 16 diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e7367c7..8bcd870 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -215,6 +215,7 @@ typedef unsigned char *sk_buff_data_t; * @pkt_type: Packet class * @fclone: skbuff clone status * @ip_summed: Driver fed us an IP checksum + * @queue_mapping: Queue mapping for multiqueue devices * @priority: Packet queueing priority * @users: User count - see {datagram,tcp}.c * @protocol: Packet protocol from driver @@ -269,6 +270,7 @@ struct sk_buff { __u16 csum_offset; }; }; + __u16 queue_mapping; __u32 priority; __u8 local_df:1, cloned:1, diff --git a/net/core/dev.c b/net/core/dev.c index 4317c1b..27c90e1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1477,6 +1477,8 @@ gso: spin_lock(&dev->queue_lock); q = dev->qdisc; if (q->enqueue) { + /* reset queue_mapping to zero */ + skb->queue_mapping = 0; rc = q->enqueue(skb, q); qdisc_run(dev); spin_unlock(&dev->queue_lock); @@ -3273,16 +3275,18 @@ static struct net_device_stats *internal_stats(struct net_device *dev) } /** - * alloc_netdev - allocate network device + * alloc_netdev_mq - allocate network device * @sizeof_priv: size of private data to allocate space for * @name: device name format string * @setup: callback to initialize device + * @queue_count: the number of subqueues to allocate * * Allocates a struct net_device with private data area for driver use - * and performs basic initialization. + * and performs basic initialization. Also allocates subqueue structs + * for each queue on the device. 
*/ -struct net_device *alloc_netdev(int sizeof_priv, const char *name, - void (*setup)(struct net_device *)) +struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, + void (*setup)(struct net_device *), int queue_count) { void *p; struct net_device *dev; @@ -3307,12 +3311,23 @@ struct net_device *alloc_netdev(int sizeof_priv, const char *name, if (sizeof_priv) dev->priv = netdev_priv(dev); + alloc_size = (sizeof(struct net_device_subqueue) * queue_count); + + p = kzalloc(alloc_size, GFP_KERNEL); + if (!p) { + printk(KERN_ERR "alloc_netdev: Unable to allocate queues.\n"); + return NULL; + } + + dev->egress_subqueue = p; + dev->egress_subqueue_count = queue_count; + dev->get_stats = internal_stats; setup(dev); strcpy(dev->name, name); return dev; } -EXPORT_SYMBOL(alloc_netdev); +EXPORT_SYMBOL(alloc_netdev_mq); /** * free_netdev - free network device @@ -3326,6 +3341,7 @@ void free_netdev(struct net_device *dev) { #ifdef CONFIG_SYSFS /* Compatibility with error handling in drivers */ + kfree((char *)dev->egress_subqueue); if (dev->reg_state == NETREG_UNINITIALIZED) { kfree((char *)dev - dev->padded); return; @@ -3337,6 +3353,7 @@ void free_netdev(struct net_device *dev) /* will free via device release */ put_device(&dev->dev); #else + kfree((char *)dev->egress_subqueue); kfree((char *)dev - dev->padded); #endif } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1422573..0528cf3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -418,6 +418,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) n->nohdr = 0; C(pkt_type); C(ip_summed); + C(queue_mapping); C(priority); #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) C(ipvs_property); @@ -459,6 +460,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif new->sk = NULL; new->dev = old->dev; + new->queue_mapping = old->queue_mapping; new->priority = old->priority; new->protocol = old->protocol; new->dst = dst_clone(old->dst); @@ -1926,6 
+1928,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features) tail = nskb; nskb->dev = skb->dev; + nskb->queue_mapping = skb->queue_mapping; nskb->priority = skb->priority; nskb->protocol = skb->protocol; nskb->dst = dst_clone(skb->dst); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 0ac2524..87a509c 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -316,9 +316,10 @@ void ether_setup(struct net_device *dev) EXPORT_SYMBOL(ether_setup); /** - * alloc_etherdev - Allocates and sets up an Ethernet device + * alloc_etherdev_mq - Allocates and sets up an Ethernet device * @sizeof_priv: Size of additional driver-private structure to be allocated * for this Ethernet device + * @queue_count: The number of queues this device has. * * Fill in the fields of the device structure with Ethernet-generic * values. Basically does everything except registering the device. @@ -328,8 +329,8 @@ EXPORT_SYMBOL(ether_setup); * this private data area. */ -struct net_device *alloc_etherdev(int sizeof_priv) +struct net_device *alloc_etherdev_mq(int sizeof_priv, int queue_count) { - return alloc_netdev(sizeof_priv, "eth%d", ether_setup); + return alloc_netdev_mq(sizeof_priv, "eth%d", ether_setup, queue_count); } -EXPORT_SYMBOL(alloc_etherdev); +EXPORT_SYMBOL(alloc_etherdev_mq); diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 475df84..a532554 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -111,6 +111,28 @@ config NET_SCH_PRIO To compile this code as a module, choose M here: the module will be called sch_prio. +config NET_SCH_PRIO_MQ + bool "Multiple hardware queue support for PRIO" + depends on NET_SCH_PRIO + ---help--- + Say Y here if you want to allow the PRIO qdisc to assign + flows to multiple hardware queues on an ethernet device. This + will still work on devices with 1 queue. + + Consider this scheduler for devices that do not use + hardware-based scheduling policies. Otherwise, use NET_SCH_RR. + + Most people will say N here. 
+ +config NET_SCH_RR + tristate "Multi Band Round Robin Queuing (RR)" + ---help--- + Say Y here if you want to use an n-band round robin packet + scheduler. + + To compile this code as a module, choose M here: the + module will be called sch_rr. + config NET_SCH_RED tristate "Random Early Detection (RED)" ---help--- diff --git a/net/sched/Makefile b/net/sched/Makefile index 020767a..d3ed44e 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o +obj-$(CONFIG_NET_SCH_RR) += sch_rr.o obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index f28bb2d..b9dc2a6 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -123,7 +123,8 @@ static inline int qdisc_restart(struct net_device *dev) /* And release queue */ spin_unlock(&dev->queue_lock); - if (!netif_queue_stopped(dev)) { + if (!netif_queue_stopped(dev) && + !netif_subqueue_stopped(dev, skb->queue_mapping)) { int ret; ret = dev_hard_start_xmit(skb, dev); @@ -141,7 +142,6 @@ static inline int qdisc_restart(struct net_device *dev) goto collision; } } - /* NETDEV_TX_BUSY - we need to requeue */ /* Release the driver */ if (!nolock) { diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 269a6e1..c78dba4 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -43,6 +43,7 @@ struct prio_sched_data struct tcf_proto *filter_list; u8 prio2band[TC_PRIO_MAX+1]; struct Qdisc *queues[TCQ_PRIO_BANDS]; + u16 band2queue[TC_PRIO_MAX + 1]; }; @@ -70,13 +71,26 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) #endif if (TC_H_MAJ(band)) band = 0; +#ifdef CONFIG_NET_SCH_PRIO_MQ + skb->queue_mapping = + q->band2queue[q->prio2band[band&TC_PRIO_MAX]]; +#endif + return
q->queues[q->prio2band[band&TC_PRIO_MAX]]; } band = res.classid; } band = TC_H_MIN(band) - 1; - if (band > q->bands) + if (band > q->bands) { +#ifdef CONFIG_NET_SCH_PRIO_MQ + skb->queue_mapping = q->band2queue[q->prio2band[0]]; +#endif return q->queues[q->prio2band[0]]; + } + +#ifdef CONFIG_NET_SCH_PRIO_MQ + skb->queue_mapping = q->band2queue[band]; +#endif return q->queues[band]; } @@ -144,12 +158,22 @@ prio_dequeue(struct Qdisc* sch) struct Qdisc *qdisc; for (prio = 0; prio < q->bands; prio++) { - qdisc = q->queues[prio]; - skb = qdisc->dequeue(qdisc); - if (skb) { - sch->q.qlen--; - return skb; +#ifdef CONFIG_NET_SCH_PRIO_MQ + /* Check if the target subqueue is available before + * pulling an skb. This way we avoid excessive requeues + * for slower queues. + */ + if (!netif_subqueue_stopped(sch->dev, q->band2queue[prio])) { +#endif + qdisc = q->queues[prio]; + skb = qdisc->dequeue(qdisc); + if (skb) { + sch->q.qlen--; + return skb; + } +#ifdef CONFIG_NET_SCH_PRIO_MQ } +#endif } return NULL; @@ -200,6 +224,10 @@ static int prio_tune(struct Qdisc *sch, struct rtattr *opt) struct prio_sched_data *q = qdisc_priv(sch); struct tc_prio_qopt *qopt = RTA_DATA(opt); int i; + int queue; + int qmapoffset; + int offset; + int mod; if (opt->rta_len < RTA_LENGTH(sizeof(*qopt))) return -EINVAL; @@ -242,6 +270,32 @@ static int prio_tune(struct Qdisc *sch, struct rtattr *opt) } } } +#ifdef CONFIG_NET_SCH_PRIO_MQ + /* setup queue to band mapping */ + if (q->bands < sch->dev->egress_subqueue_count) { + qmapoffset = 1; + mod = sch->dev->egress_subqueue_count; + } else { + mod = q->bands % sch->dev->egress_subqueue_count; + qmapoffset = q->bands / sch->dev->egress_subqueue_count + + ((mod) ? 1 : 0); + } + + queue = 0; + offset = 0; + for (i = 0; i < q->bands; i++) { + q->band2queue[i] = queue; + if ( ((i + 1) - offset) == qmapoffset) { + queue++; + offset += qmapoffset; + if (mod) + mod--; + qmapoffset = q->bands / + sch->dev->egress_subqueue_count + + ((mod) ? 
1 : 0); + } + } +#endif return 0; } diff --git a/net/sched/sch_rr.c b/net/sched/sch_rr.c new file mode 100644 index 0000000..ce9f237 --- /dev/null +++ b/net/sched/sch_rr.c @@ -0,0 +1,516 @@ +/* + * net/sched/sch_rr.c Simple n-band round-robin scheduler. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * The core part of this qdisc is based on sch_prio. ->dequeue() is where + * this scheduler functionally differs. + * + * Author: PJ Waskiewicz, <[EMAIL PROTECTED]> + * + * Original Authors (from PRIO): Alexey Kuznetsov, <[EMAIL PROTECTED]> + * Fixes: 19990609: J Hadi Salim <[EMAIL PROTECTED]>: + * Init -- EINVAL when opt undefined + */ + +#include <linux/module.h> +#include <asm/uaccess.h> +#include <asm/system.h> +#include <linux/bitops.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/mm.h> +#include <linux/socket.h> +#include <linux/sockios.h> +#include <linux/in.h> +#include <linux/errno.h> +#include <linux/interrupt.h> +#include <linux/if_ether.h> +#include <linux/inet.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/notifier.h> +#include <net/ip.h> +#include <net/route.h> +#include <linux/skbuff.h> +#include <net/netlink.h> +#include <net/sock.h> +#include <net/pkt_sched.h> + + +struct rr_sched_data +{ + int bands; + int curband; + struct tcf_proto *filter_list; + u8 prio2band[TC_RR_MAX + 1]; + struct Qdisc *queues[TCQ_RR_BANDS]; + u16 band2queue[TC_RR_MAX + 1]; +}; + + +static struct Qdisc *rr_classify(struct sk_buff *skb, struct Qdisc *sch, + int *qerr) +{ + struct rr_sched_data *q = qdisc_priv(sch); + u32 band = skb->priority; + struct tcf_result res; + + *qerr = NET_XMIT_BYPASS; + if (TC_H_MAJ(skb->priority) != sch->handle) { +#ifdef CONFIG_NET_CLS_ACT + switch 
(tc_classify(skb, q->filter_list, &res)) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS; + case TC_ACT_SHOT: + return NULL; + } + + if (!q->filter_list ) { +#else + if (!q->filter_list || tc_classify(skb, q->filter_list, &res)) { +#endif + if (TC_H_MAJ(band)) + band = 0; + skb->queue_mapping = + q->band2queue[q->prio2band[band&TC_RR_MAX]]; + + return q->queues[q->prio2band[band&TC_RR_MAX]]; + } + band = res.classid; + } + band = TC_H_MIN(band) - 1; + if (band > q->bands) { + skb->queue_mapping = q->band2queue[q->prio2band[0]]; + return q->queues[q->prio2band[0]]; + } + + skb->queue_mapping = q->band2queue[band]; + + return q->queues[band]; +} + +static int rr_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct Qdisc *qdisc; + int ret; + + qdisc = rr_classify(skb, sch, &ret); +#ifdef CONFIG_NET_CLS_ACT + if (qdisc == NULL) { + + if (ret == NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } +#endif + + if ((ret = qdisc->enqueue(skb, qdisc)) == NET_XMIT_SUCCESS) { + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + sch->q.qlen++; + return NET_XMIT_SUCCESS; + } + sch->qstats.drops++; + return ret; +} + + +static int rr_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct Qdisc *qdisc; + int ret; + + qdisc = rr_classify(skb, sch, &ret); +#ifdef CONFIG_NET_CLS_ACT + if (qdisc == NULL) { + if (ret == NET_XMIT_BYPASS) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } +#endif + + if ((ret = qdisc->ops->requeue(skb, qdisc)) == NET_XMIT_SUCCESS) { + sch->q.qlen++; + sch->qstats.requeues++; + return 0; + } + sch->qstats.drops++; + return NET_XMIT_DROP; +} + + +static struct sk_buff *rr_dequeue(struct Qdisc* sch) +{ + struct sk_buff *skb; + struct rr_sched_data *q = qdisc_priv(sch); + struct Qdisc *qdisc; + int bandcount; + + /* Only take one pass through the queues. If nothing is available, + * return nothing. 
+ */ + for (bandcount = 0; bandcount < q->bands; bandcount++) { + /* Check if the target subqueue is available before + * pulling an skb. This way we avoid excessive requeues + * for slower queues. If the queue is stopped, try the + * next queue. + */ + if (!netif_subqueue_stopped(sch->dev, q->band2queue[q->curband])) { + qdisc = q->queues[q->curband]; + skb = qdisc->dequeue(qdisc); + if (skb) { + sch->q.qlen--; + q->curband++; + if (q->curband >= q->bands) + q->curband = 0; + return skb; + } + } + q->curband++; + if (q->curband >= q->bands) + q->curband = 0; + } + return NULL; +} + +static unsigned int rr_drop(struct Qdisc* sch) +{ + struct rr_sched_data *q = qdisc_priv(sch); + int band; + unsigned int len; + struct Qdisc *qdisc; + + for (band = q->bands - 1; band >= 0; band--) { + qdisc = q->queues[band]; + if (qdisc->ops->drop && (len = qdisc->ops->drop(qdisc)) != 0) { + sch->q.qlen--; + return len; + } + } + return 0; +} + + +static void rr_reset(struct Qdisc* sch) +{ + int band; + struct rr_sched_data *q = qdisc_priv(sch); + + for (band = 0; band < q->bands; band++) + qdisc_reset(q->queues[band]); + sch->q.qlen = 0; +} + +static void rr_destroy(struct Qdisc* sch) +{ + int band; + struct rr_sched_data *q = qdisc_priv(sch); + + tcf_destroy_chain(q->filter_list); + for (band = 0; band < q->bands; band++) + qdisc_destroy(q->queues[band]); +} + +static int rr_tune(struct Qdisc *sch, struct rtattr *opt) +{ + struct rr_sched_data *q = qdisc_priv(sch); + struct tc_rr_qopt *qopt = RTA_DATA(opt); + int i; + int queue; + int qmapoffset; + int offset; + int mod; + + if (opt->rta_len < RTA_LENGTH(sizeof(*qopt))) + return -EINVAL; + if (qopt->bands > TCQ_RR_BANDS || qopt->bands < 2) + return -EINVAL; + + for (i = 0; i <= TC_RR_MAX; i++) { + if (qopt->priomap[i] >= qopt->bands) + return -EINVAL; + } + + sch_tree_lock(sch); + q->bands = qopt->bands; + memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); + q->curband = 0; + + for (i = q->bands; i < TCQ_RR_BANDS; i++) { + struct 
Qdisc *child = xchg(&q->queues[i], &noop_qdisc); + if (child != &noop_qdisc) { + qdisc_tree_decrease_qlen(child, child->q.qlen); + qdisc_destroy(child); + } + } + sch_tree_unlock(sch); + + for (i = 0; i < q->bands; i++) { + if (q->queues[i] == &noop_qdisc) { + struct Qdisc *child; + child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, + TC_H_MAKE(sch->handle, i + 1)); + if (child) { + sch_tree_lock(sch); + child = xchg(&q->queues[i], child); + + if (child != &noop_qdisc) { + qdisc_tree_decrease_qlen(child, + child->q.qlen); + qdisc_destroy(child); + } + sch_tree_unlock(sch); + } + } + } + /* setup queue to band mapping - best effort to map into available + * hardware queues + */ + if (q->bands < sch->dev->egress_subqueue_count) { + qmapoffset = 1; + mod = sch->dev->egress_subqueue_count; + } else { + mod = q->bands % sch->dev->egress_subqueue_count; + qmapoffset = q->bands / sch->dev->egress_subqueue_count + + ((mod) ? 1 : 0); + } + + queue = 0; + offset = 0; + for (i = 0; i < q->bands; i++) { + q->band2queue[i] = queue; + if ( ((i + 1) - offset) == qmapoffset) { + queue++; + offset += qmapoffset; + if (mod) + mod--; + qmapoffset = q->bands / + sch->dev->egress_subqueue_count + + ((mod) ? 
1 : 0); + } + } + + return 0; +} + +static int rr_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct rr_sched_data *q = qdisc_priv(sch); + int i; + + for (i = 0; i < TCQ_RR_BANDS; i++) + q->queues[i] = &noop_qdisc; + + if (opt == NULL) { + return -EINVAL; + } else { + int err; + + if ((err = rr_tune(sch, opt)) != 0) + return err; + } + return 0; +} + +static int rr_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct rr_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb_tail_pointer(skb); + struct tc_rr_qopt opt; + + opt.bands = q->bands; + memcpy(&opt.priomap, q->prio2band, TC_RR_MAX + 1); + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + nlmsg_trim(skb, b); + return -1; +} + +static int rr_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct rr_sched_data *q = qdisc_priv(sch); + unsigned long band = arg - 1; + + if (band >= q->bands) + return -EINVAL; + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = q->queues[band]; + q->queues[band] = new; + qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); + qdisc_reset(*old); + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc *rr_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct rr_sched_data *q = qdisc_priv(sch); + unsigned long band = arg - 1; + + if (band >= q->bands) + return NULL; + + return q->queues[band]; +} + +static unsigned long rr_get(struct Qdisc *sch, u32 classid) +{ + struct rr_sched_data *q = qdisc_priv(sch); + unsigned long band = TC_H_MIN(classid); + + if (band - 1 >= q->bands) + return 0; + return band; +} + +static unsigned long rr_bind(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + return rr_get(sch, classid); +} + + +static void rr_put(struct Qdisc *q, unsigned long cl) +{ + return; +} + +static int rr_change(struct Qdisc *sch, u32 handle, u32 parent, + struct rtattr **tca, unsigned long *arg) +{ + unsigned long cl = *arg; + struct rr_sched_data *q = 
qdisc_priv(sch); + + if (cl - 1 > q->bands) + return -ENOENT; + return 0; +} + +static int rr_delete(struct Qdisc *sch, unsigned long cl) +{ + struct rr_sched_data *q = qdisc_priv(sch); + if (cl - 1 > q->bands) + return -ENOENT; + return 0; +} + + +static int rr_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct rr_sched_data *q = qdisc_priv(sch); + + if (cl - 1 > q->bands) + return -ENOENT; + tcm->tcm_handle |= TC_H_MIN(cl); + if (q->queues[cl - 1]) + tcm->tcm_info = q->queues[cl - 1]->handle; + return 0; +} + +static int rr_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct rr_sched_data *q = qdisc_priv(sch); + struct Qdisc *cl_q; + + cl_q = q->queues[cl - 1]; + if (gnet_stats_copy_basic(d, &cl_q->bstats) < 0 || + gnet_stats_copy_queue(d, &cl_q->qstats) < 0) + return -1; + + return 0; +} + +static void rr_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct rr_sched_data *q = qdisc_priv(sch); + int band; + + if (arg->stop) + return; + + for (band = 0; band < q->bands; band++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, band + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct tcf_proto **rr_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct rr_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static struct Qdisc_class_ops rr_class_ops = { + .graft = rr_graft, + .leaf = rr_leaf, + .get = rr_get, + .put = rr_put, + .change = rr_change, + .delete = rr_delete, + .walk = rr_walk, + .tcf_chain = rr_find_tcf, + .bind_tcf = rr_bind, + .unbind_tcf = rr_put, + .dump = rr_dump_class, + .dump_stats = rr_dump_class_stats, +}; + +static struct Qdisc_ops rr_qdisc_ops = { + .next = NULL, + .cl_ops = &rr_class_ops, + .id = "rr", + .priv_size = sizeof(struct rr_sched_data), + .enqueue = rr_enqueue, + .dequeue = rr_dequeue, + .requeue = rr_requeue, + .drop = 
rr_drop, + .init = rr_init, + .reset = rr_reset, + .destroy = rr_destroy, + .change = rr_tune, + .dump = rr_dump, + .owner = THIS_MODULE, +}; + +static int __init rr_module_init(void) +{ + return register_qdisc(&rr_qdisc_ops); +} + +static void __exit rr_module_exit(void) +{ + unregister_qdisc(&rr_qdisc_ops); +} + +module_init(rr_module_init) +module_exit(rr_module_exit) + +MODULE_LICENSE("GPL"); - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html