[PATCH 0/3] Splice network receive support
Hi, This series of patches applies on top of the splice series just posted. It implements basic network receive support, i.e. splicing from a socket to a pipe. There seems to be a skbuff_head_cache leak somewhere that I need to track down; otherwise it works fine for me. -- Jens Axboe - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/3] splice: don't assume regular pages in splice_to_pipe()
Allow caller to pass in a release function, there might be other resources that need releasing as well. Needed for network receive. Signed-off-by: Jens Axboe [EMAIL PROTECTED] --- fs/splice.c|9 - include/linux/splice.h |1 + 2 files changed, 9 insertions(+), 1 deletions(-) diff --git a/fs/splice.c b/fs/splice.c index f24e367..25ec9c8 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -247,11 +247,16 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, } while (page_nr spd-nr_pages) - page_cache_release(spd-pages[page_nr++]); + spd-spd_release(spd, page_nr++); return ret; } +static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) +{ + page_cache_release(spd-pages[i]); +} + static int __generic_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, @@ -270,6 +275,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, .partial = partial, .flags = flags, .ops = page_cache_pipe_buf_ops, + .spd_release = spd_release_page, }; index = *ppos PAGE_CACHE_SHIFT; @@ -1442,6 +1448,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, .partial = partial, .flags = flags, .ops = user_page_pipe_buf_ops, + .spd_release = spd_release_page, }; pipe = pipe_info(file-f_path.dentry-d_inode); diff --git a/include/linux/splice.h b/include/linux/splice.h index 1a1182b..04c1068 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -53,6 +53,7 @@ struct splice_pipe_desc { int nr_pages; /* number of pages in map */ unsigned int flags; /* splice flags */ const struct pipe_buf_operations *ops;/* ops associated with output pipe */ + void (*spd_release)(struct splice_pipe_desc *, unsigned int); }; typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, -- 1.5.2.1.174.gcd03 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/3] TCP splice receive support
Support for network splice receive. Signed-off-by: Jens Axboe [EMAIL PROTECTED] --- include/linux/net.h|3 + include/linux/skbuff.h |5 + include/net/tcp.h |3 + net/core/skbuff.c | 231 net/ipv4/af_inet.c |1 + net/ipv4/tcp.c | 129 +++ net/socket.c | 13 +++ 7 files changed, 385 insertions(+), 0 deletions(-) diff --git a/include/linux/net.h b/include/linux/net.h index efc4517..472ee12 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -19,6 +19,7 @@ #define _LINUX_NET_H #include linux/wait.h +#include linux/splice.h #include asm/socket.h struct poll_table_struct; @@ -165,6 +166,8 @@ struct proto_ops { struct vm_area_struct * vma); ssize_t (*sendpage) (struct socket *sock, struct page *page, int offset, size_t size, int flags); + ssize_t (*splice_read)(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags); }; struct net_proto_family { diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e7367c7..64e3eed 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1504,6 +1504,11 @@ extern int skb_store_bits(struct sk_buff *skb, int offset, extern __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to, int len, __wsum csum); +extern int skb_splice_bits(struct sk_buff *skb, + unsigned int offset, + struct pipe_inode_info *pipe, + unsigned int len, + unsigned int flags); extern void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to); extern void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); diff --git a/include/net/tcp.h b/include/net/tcp.h index a8af9ae..8e86697 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -308,6 +308,9 @@ extern int tcp_twsk_unique(struct sock *sk, extern voidtcp_twsk_destructor(struct sock *sk); +extern ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags); + static inline void tcp_dec_quickack_mode(struct sock *sk, const unsigned int pkts) { diff 
--git a/net/core/skbuff.c b/net/core/skbuff.c index 7c6a34e..daea7b0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -52,6 +52,7 @@ #endif #include linux/string.h #include linux/skbuff.h +#include linux/splice.h #include linux/cache.h #include linux/rtnetlink.h #include linux/init.h @@ -71,6 +72,40 @@ static struct kmem_cache *skbuff_head_cache __read_mostly; static struct kmem_cache *skbuff_fclone_cache __read_mostly; +static void sock_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct sk_buff *skb = (struct sk_buff *) buf-private; + + kfree_skb(skb); +} + +static void sock_pipe_buf_get(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct sk_buff *skb = (struct sk_buff *) buf-private; + + skb_get(skb); +} + +static int sock_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + + +/* Pipe buffer operations for a socket. */ +static struct pipe_buf_operations sock_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .pin = generic_pipe_buf_pin, + .release = sock_pipe_buf_release, + .steal = sock_pipe_buf_steal, + .get = sock_pipe_buf_get, +}; + /* * Keep out-of-line to prevent kernel bloat. * __builtin_return_address is not used because it is not always @@ -1116,6 +1151,202 @@ fault: return -EFAULT; } +/* + * Callback from splice_to_pipe(), if we need to release some pages + * at the end of the spd in case we error'ed out in filling the pipe. + */ +static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i) +{ + struct sk_buff *skb = (struct sk_buff *) spd-partial[i].private; + + kfree_skb(skb); +} + +/* + * Fill page/offset/length into spd, if it can hold more pages. + */ +static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page, + unsigned int len, unsigned int offset, + struct sk_buff *skb) +{ +
[PATCH 2/3] tcp_read_sock: allow recv_actor() to return negative error value
Signed-off-by: Jens Axboe [EMAIL PROTECTED] --- net/ipv4/tcp.c |8 ++-- 1 files changed, 6 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index cd3c7e9..450f44b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1064,7 +1064,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, break; } used = recv_actor(desc, skb, offset, len); - if (used = len) { + if (used 0) { + if (!copied) + copied = used; + break; + } else if (used = len) { seq += used; copied += used; offset += used; @@ -1086,7 +1090,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ - if (copied) + if (copied 0) tcp_cleanup_rbuf(sk, copied); return copied; } -- 1.5.2.1.174.gcd03 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] CONFIG_INET depend on CONFIG_SYSCTL
From: Yoshinori Sato [EMAIL PROTECTED] Date: Tue, 12 Jun 2007 16:38:55 +0900 It cannot build with CONFIG_SYSCTL=n and CONFIG_INET=y. In case of CONFIG_INET=y it should become CONFIG_SYSCTL=y. Signed-off-by: Yoshinori Sato [EMAIL PROTECTED] 1) Please post networking patches to netdev@vger.kernel.org which has been added to the CC: 2) It is much better to add the appropriate CONFIG_SYSCTL ifdefs to the INET code than to force it on for everyone. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-2.6 1/1] [TCP]: Fix left_out setting during FRTO
Without FRTO, tcp_try_to_open is never called with lost_out > 0 (see tcp_time_to_recover). However, when FRTO is enabled, the !tp->lost condition is not used until the end of FRTO because that way TCP avoids premature entry to fast recovery during FRTO. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- This case was found during the left_out drop audit (only relevant to net-2.6 since tcp-2.6 does the right thing after the left_out drop). diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 74683d8..ed4a1bd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2037,7 +2037,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); - tp-left_out = tp-sacked_out; + tcp_sync_left_out(tp); if (tp-retrans_out == 0) tp-retrans_stamp = 0; -- 1.5.0.6
Re: [PATCH] NET: Multiqueue network device support.
On Mon, 2007-06-11 at 08:23 -0400, jamal wrote: On Mon, 2007-11-06 at 13:58 +0200, Patrick McHardy wrote: That's not true. Assume PSL has lots of packets, PSH is empty. We fill the PHL queue until there is no room left, so the driver has to stop the queue. Sure. Packets stashed on any DMA ring are considered gone to the wire. That is a very valid assumption to make. Not at all! Packets could be on the DMA queue forever if you're feeding out more packets. Heck, on most wireless hardware packets can even be *expired* from the DMA queue and you get an indication that it was impossible to send them. johannes signature.asc Description: This is a digitally signed message part
Re: [2.6.21.1] soft lockup when removing netconsole module
On Tue, May 29, 2007 at 12:56:28AM -0700, Andrew Morton wrote: On Sat, 26 May 2007 17:40:12 +0200 Folkert van Heusden [EMAIL PROTECTED] wrote: When trying to remove the netconsole module, I got the following kernel output after a while (couple of minutes iirc): [525720.117293] BUG: soft lockup detected on CPU#1! [525720.117353] [c1004d53] show_trace_log_lvl+0x1a/0x30 [525720.117439] [c1004d7b] show_trace+0x12/0x14 [525720.117526] [c1004e75] dump_stack+0x16/0x18 [525720.117613] [c104dd5b] softlockup_tick+0xa6/0xc2 [525720.117694] [c1026855] run_local_timers+0x12/0x14 [525720.117738] [c1026669] update_process_times+0x72/0xa1 [525720.117744] [c1038673] tick_sched_timer+0x53/0xb6 [525720.117748] [c1033d62] hrtimer_interrupt+0x189/0x1e3 [525720.117753] [c100e9e2] local_apic_timer_interrupt+0x55/0x5b [525720.117761] [c100ea12] smp_apic_timer_interrupt+0x2a/0x39 [525720.117766] [c1004a3f] apic_timer_interrupt+0x33/0x38 [525720.117770] [c120f4b1] mutex_lock+0x8/0xa [525720.117775] [c102d2f0] flush_workqueue+0x2f/0x8f [525720.117780] [c102d7a0] cancel_rearming_delayed_workqueue+0x29/0x2b [525720.117785] [c102d7b1] cancel_rearming_delayed_work+0xf/0x11 [525720.117790] [c11be143] netpoll_cleanup+0x75/0xa5 [525720.117794] [f893712d] cleanup_netconsole+0x17/0x1a [netconsole] [525720.117804] [c1041f11] sys_delete_module+0x12f/0x14f [525720.117809] [c1003f74] syscall_call+0x7/0xb [525720.117812] === Also the rmmod hangs and would not exit even with kill -9. It also sucks up 100% cpu. Jason recently posted a mystery patch without telling us what problem it fixed. To be fair the problem should be known: http://marc.info/?l=linux-kernelm=117700287817801w=2 List: linux-kernel Subject:Re: [PATCH -mm] workqueue: debug possible endless loop in cancel_rearming_delayed_work From: Chuck Ebbert cebbert () redhat ! com Date: 2007-04-19 17:07:11 Message-ID: 4627A1BF.8080406 () redhat ! 
com Okay, an easy test for it: insmod netconsole ; rmmod netconsole In 2.6.20.x it loops forever and cancel_rearming_delayed_work() is part of the trace... I hoped the discussion about cancel_rearming_delayed_work would reach more people (there was also a patch proposal to add a warning to the usage comment). But it seem it was not enough... Of course such a problem should preferably be fixed by somebody who knows the code (alas I don't know netconsole), to be sure all needed cancels are still done after this change. I hope Jason's patch is right but I'm a little surprised I can't see netdev in cc (I'll try to fix this). Cheers, Jarek P. PS: I'm very sorry for such late response (holidays). It looks like you just found it: cancel_rearming_delayed_work() will hang if the work isn't actually pending. Please test this: From: Jason Wessel [EMAIL PROTECTED] Do not call cancel_rearming_delayed_work() if there is no pending work. Signed-off-by: Jason Wessel [EMAIL PROTECTED] Signed-off-by: Andrew Morton [EMAIL PROTECTED] --- net/core/netpoll.c |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff -puN net/core/netpoll.c~a net/core/netpoll.c --- a/net/core/netpoll.c~a +++ a/net/core/netpoll.c @@ -784,8 +784,10 @@ void netpoll_cleanup(struct netpoll *np) if (atomic_dec_and_test(npinfo-refcnt)) { skb_queue_purge(npinfo-arp_tx); skb_queue_purge(npinfo-txq); - cancel_rearming_delayed_work(npinfo-tx_work); - flush_scheduled_work(); + if (delayed_work_pending(npinfo-tx_work)) { + cancel_rearming_delayed_work(npinfo-tx_work); + flush_scheduled_work(); + } kfree(npinfo); } _ - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC] network splice receive
On Sat, Jun 09, 2007 at 08:36:09AM +0200, Jens Axboe ([EMAIL PROTECTED]) wrote: On Fri, Jun 08 2007, Evgeniy Polyakov wrote: On Fri, Jun 08, 2007 at 06:57:25PM +0400, Evgeniy Polyakov ([EMAIL PROTECTED]) wrote: I will try some things for the nearest 30-60 minutes, and then will move to canoe trip until Tuesday, so will not be able to work on this idea. Ok, replacing in fs/splice.c every page_cache_release() with static void splice_page_release(struct page *p) { if (!PageSlab(p)) page_cache_release(p); } Ehm, I don't see why that should be necessary. Except in splice_to_pipe(), I have considered that we need to pass in a release function if mapping fails at some point. But it's probably best to do that in the caller, since they have the knowledge of how to release the pages. The rest of the PageSlab() tests are bogus. I had a crashdump where the page was indeed released via splice_to_pipe(); I did not investigate whether it is possible to release the provided page in other places. I think if in the future there will be other slab usage cases besides network receive, that might be useful, but as is it is not needed. and putting cloned skb into private field instead of the original one in spd_fill_page() ends up without a kernel hang. Why? Seems pointless to allocate a clone just to hold on to the skb, a reference should be equally good. I would not be opposed to doing it this way, I just don't see what a clone buys us as compared to just holding that reference to the skb. Receiving code does not expect shared skbs - too many fields are changed with assumptions that it is a private copy. I'm not sure it is correct, that page can be released in fs/splice.c without calling any callback from network code, when network data is being processed. Please explain! I had a crashdump where an attempt was made to release the page in fs/splice.c:splice_to_pipe(). I do not have details handy, but the best solution would be to provide a release callback and use that instead of page_cache_release(). 
-- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH SET] pktgen IPSEC 0/4
This is a set of patches that add ipsec functionality to pktgen. It is against Dave's net-2.6.23. Robert, please take a closer look at this set and either sign off or comment for me to redo something. I have a short window in which I can fix things before I get busy. Dave, I would like to push these to net-2.6.23 as soon as Robert Acks them. cheers, jamal - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] pktgen IPSEC 1/4: Centralize pktgen packet overhead management
Manual labor still ... 1 of 4 cheers, jamal commit 38477d7ddfa58f58cce99bc902b4c18883647a71 Author: Jamal Hadi Salim [EMAIL PROTECTED] Date: Tue Jun 12 06:43:00 2007 -0400 [PKTGEN] Centralize packet overhead tracking Track the extra packet overhead for VLAN tags, MPLS, IPSEC etc Signed-off-by: Jamal Hadi Salim [EMAIL PROTECTED] diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 9cd3a1c..1352316 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -228,6 +228,7 @@ struct pktgen_dev { int min_pkt_size; /* = ETH_ZLEN; */ int max_pkt_size; /* = ETH_ZLEN; */ + int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ int nfrags; __u32 delay_us; /* Default delay */ __u32 delay_ns; @@ -2075,6 +2076,13 @@ static void spin(struct pktgen_dev *pkt_dev, __u64 spin_until_us) pkt_dev-idle_acc += now - start; } +static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) +{ + pkt_dev-pkt_overhead += pkt_dev-nr_labels*sizeof(u32); + pkt_dev-pkt_overhead += VLAN_TAG_SIZE(pkt_dev); + pkt_dev-pkt_overhead += SVLAN_TAG_SIZE(pkt_dev); +} + /* Increment/randomize headers according to flags and current values * for IP src/dest, UDP src/dst port, MAC-Addr src/dst */ @@ -2323,9 +2331,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, datalen = (odev-hard_header_len + 16) ~0xf; skb = alloc_skb(pkt_dev-cur_pkt_size + 64 + datalen + - pkt_dev-nr_labels*sizeof(u32) + - VLAN_TAG_SIZE(pkt_dev) + SVLAN_TAG_SIZE(pkt_dev), - GFP_ATOMIC); + pkt_dev-pkt_overhead, GFP_ATOMIC); if (!skb) { sprintf(pkt_dev-result, No memory); return NULL; @@ -2368,7 +2374,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, /* Eth + IPh + UDPh + mpls */ datalen = pkt_dev-cur_pkt_size - 14 - 20 - 8 - - pkt_dev-nr_labels*sizeof(u32) - VLAN_TAG_SIZE(pkt_dev) - SVLAN_TAG_SIZE(pkt_dev); + pkt_dev-pkt_overhead; if (datalen sizeof(struct pktgen_hdr)) datalen = sizeof(struct pktgen_hdr); @@ -2391,8 +2397,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, 
iph-check = ip_fast_csum((void *)iph, iph-ihl); skb-protocol = protocol; skb-mac_header = (skb-network_header - ETH_HLEN - - pkt_dev-nr_labels * sizeof(u32) - - VLAN_TAG_SIZE(pkt_dev) - SVLAN_TAG_SIZE(pkt_dev)); + pkt_dev-pkt_overhead); skb-dev = odev; skb-pkt_type = PACKET_HOST; @@ -2662,9 +2667,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, mod_cur_headers(pkt_dev); skb = alloc_skb(pkt_dev-cur_pkt_size + 64 + 16 + - pkt_dev-nr_labels*sizeof(u32) + - VLAN_TAG_SIZE(pkt_dev) + SVLAN_TAG_SIZE(pkt_dev), - GFP_ATOMIC); + pkt_dev-pkt_overhead, GFP_ATOMIC); if (!skb) { sprintf(pkt_dev-result, No memory); return NULL; @@ -2708,7 +2711,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, /* Eth + IPh + UDPh + mpls */ datalen = pkt_dev-cur_pkt_size - 14 - sizeof(struct ipv6hdr) - sizeof(struct udphdr) - - pkt_dev-nr_labels*sizeof(u32) - VLAN_TAG_SIZE(pkt_dev) - SVLAN_TAG_SIZE(pkt_dev); + pkt_dev-pkt_overhead; if (datalen sizeof(struct pktgen_hdr)) { datalen = sizeof(struct pktgen_hdr); @@ -2738,8 +2741,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, ipv6_addr_copy(iph-saddr, pkt_dev-cur_in6_saddr); skb-mac_header = (skb-network_header - ETH_HLEN - - pkt_dev-nr_labels * sizeof(u32) - - VLAN_TAG_SIZE(pkt_dev) - SVLAN_TAG_SIZE(pkt_dev)); + pkt_dev-pkt_overhead); skb-protocol = protocol; skb-dev = odev; skb-pkt_type = PACKET_HOST; @@ -2857,6 +2859,7 @@ static void pktgen_run(struct pktgen_thread *t) pkt_dev-started_at = getCurUs(); pkt_dev-next_tx_us = getCurUs(); /* Transmit immediately */ pkt_dev-next_tx_ns = 0; + set_pkt_overhead(pkt_dev); strcpy(pkt_dev-result, Starting); started++;
[PATCH] pktgen IPSEC 2/4: Introduce pktgen sequential flows
2 of 4 cheers, jamal commit 882c296bb3f153e1ac770a874c75cfb2bab8481b Author: Jamal Hadi Salim [EMAIL PROTECTED] Date: Tue Jun 12 07:24:00 2007 -0400 [PKTGEN] Introduce sequential flows By default all flows in pktgen are randomly selected. This patch introduces ability to have all defined flows to be sent sequentially. Robert defined randomness to be the default behavior. Signed-off-by: Jamal Hadi Salim [EMAIL PROTECTED] diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 1352316..bc4fb3b 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -181,6 +181,7 @@ #define F_MPLS_RND(18) /* Random MPLS labels */ #define F_VID_RND (19) /* Random VLAN ID */ #define F_SVID_RND(110) /* Random SVLAN ID */ +#define F_FLOW_SEQ(111) /* Sequential flows */ /* Thread control flag bits */ #define T_TERMINATE (10) @@ -207,8 +208,12 @@ static struct proc_dir_entry *pg_proc_dir = NULL; struct flow_state { __be32 cur_daddr; int count; + __u32 flags; }; +/* flow flag bits */ +#define F_INIT (10)/* flow has been initialized */ + struct pktgen_dev { /* * Try to keep frequent/infrequent used vars. separated. 
@@ -342,6 +347,7 @@ struct pktgen_dev { unsigned cflows;/* Concurrent flows (config) */ unsigned lflow; /* Flow length (config) */ unsigned nflows;/* accumulated flows (stats) */ + unsigned curfl; /* current sequenced flow (state)*/ char result[512]; }; @@ -691,6 +697,13 @@ static int pktgen_if_show(struct seq_file *seq, void *v) if (pkt_dev-flags F_MPLS_RND) seq_printf(seq, MPLS_RND ); + if (pkt_dev-cflows) { + if (pkt_dev-flags F_FLOW_SEQ) + seq_printf(seq, FLOW_SEQ ); /*in sequence flows*/ + else + seq_printf(seq, FLOW_RND ); + } + if (pkt_dev-flags F_MACSRC_RND) seq_printf(seq, MACSRC_RND ); @@ -1182,6 +1195,9 @@ static ssize_t pktgen_if_write(struct file *file, else if (strcmp(f, !SVID_RND) == 0) pkt_dev-flags = ~F_SVID_RND; + else if (strcmp(f, FLOW_SEQ) == 0) + pkt_dev-flags |= F_FLOW_SEQ; + else if (strcmp(f, !IPV6) == 0) pkt_dev-flags = ~F_IPV6; @@ -1190,7 +1206,7 @@ static ssize_t pktgen_if_write(struct file *file, Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s, f, IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, - MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND\n); + MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ\n); return count; } sprintf(pg_result, OK: flags=0x%x, pkt_dev-flags); @@ -2083,6 +2099,37 @@ static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) pkt_dev-pkt_overhead += SVLAN_TAG_SIZE(pkt_dev); } +static inline int f_seen(struct pktgen_dev *pkt_dev, int flow) +{ + + if (pkt_dev-flows[flow].flags F_INIT) + return 1; + else + return 0; +} + +static inline int f_pick(struct pktgen_dev *pkt_dev) +{ + int flow = pkt_dev-curfl; + + if (pkt_dev-flags F_FLOW_SEQ) { + if (pkt_dev-flows[flow].count = pkt_dev-lflow) { + /* reset time */ + pkt_dev-flows[flow].count = 0; + pkt_dev-curfl += 1; + if (pkt_dev-curfl = pkt_dev-cflows) + pkt_dev-curfl = 0; /*reset */ + } + } else { + flow = random32() % pkt_dev-cflows; + + if (pkt_dev-flows[flow].count pkt_dev-lflow) + 
pkt_dev-flows[flow].count = 0; + } + + return pkt_dev-curfl; +} + /* Increment/randomize headers according to flags and current values * for IP src/dest, UDP src/dst port, MAC-Addr src/dst */ @@ -2092,12 +2139,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 imx; int flow = 0; - if (pkt_dev-cflows) { - flow = random32() % pkt_dev-cflows; - - if (pkt_dev-flows[flow].count pkt_dev-lflow) - pkt_dev-flows[flow].count = 0; - } + if (pkt_dev-cflows) + flow = f_pick(pkt_dev); /* Deal with source MAC */ if (pkt_dev-src_mac_count 1) { @@ -2213,7 +2256,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev-cur_saddr = htonl(t); } - if (pkt_dev-cflows pkt_dev-flows[flow].count != 0) { + if
[PATCH] pktgen IPSEC 3/4: Introduce xfrm SAD only lookup
3 of 4 .. cheers, jamal commit 677f1c1459218919f5aa2622625dc8709c2a98ce Author: Jamal Hadi Salim [EMAIL PROTECTED] Date: Tue Jun 12 07:28:59 2007 -0400 [XFRM] Introduce standalone SAD lookup This allows other in-kernel functions to do SAD lookups. The only known user at the moment is pktgen. Signed-off-by: Jamal Hadi Salim [EMAIL PROTECTED] diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 311f25a..79d2c37 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -920,6 +920,10 @@ extern struct xfrm_state *xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t struct flowi *fl, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, unsigned short family); +extern struct xfrm_state * xfrm_stateonly_find(xfrm_address_t *daddr, + xfrm_address_t *saddr, + unsigned short family, + u8 mode, u8 proto, u32 reqid); extern int xfrm_state_check_expire(struct xfrm_state *x); extern void xfrm_state_insert(struct xfrm_state *x); extern int xfrm_state_add(struct xfrm_state *x); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 85f3f43..b8562e4 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -686,6 +686,41 @@ out: return x; } +struct xfrm_state * +xfrm_stateonly_find(xfrm_address_t *daddr, xfrm_address_t *saddr, + unsigned short family, u8 mode, u8 proto, u32 reqid) +{ + unsigned int h = xfrm_dst_hash(daddr, saddr, reqid, family); + struct xfrm_state *rx = NULL, *x = NULL; + struct hlist_node *entry; + + spin_lock(xfrm_state_lock); + hlist_for_each_entry(x, entry, xfrm_state_bydst+h, bydst) { + if (x-props.family == family + x-props.reqid == reqid + !(x-props.flags XFRM_STATE_WILDRECV) + xfrm_state_addr_check(x, daddr, saddr, family) + mode == x-props.mode + proto == x-id.proto) { + + if (x-km.state != XFRM_STATE_VALID) + continue; + else { + rx = x; + break; + } + } + } + + if (rx) + xfrm_state_hold(rx); + spin_unlock(xfrm_state_lock); + + + return rx; +} +EXPORT_SYMBOL(xfrm_stateonly_find); + static void __xfrm_state_insert(struct 
xfrm_state *x) { unsigned int h;
[PATCH] pktgen IPSEC 4/4: Add IPSEC support to pktgen
4 of 4 cheers, jamal commit e035613eae587251b8c98b7d503eab207f1d26e2 Author: Jamal Hadi Salim [EMAIL PROTECTED] Date: Tue Jun 12 07:43:30 2007 -0400 [PKTGEN] IPSEC support Added transport mode ESP support for starters. I will send more of these modes and types once i have resolved the tunnel mode isses. Signed-off-by: Jamal Hadi Salim [EMAIL PROTECTED] diff --git a/net/core/pktgen.c b/net/core/pktgen.c index bc4fb3b..bcec8e4 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -152,6 +152,9 @@ #include net/checksum.h #include net/ipv6.h #include net/addrconf.h +#ifdef CONFIG_XFRM +#include net/xfrm.h +#endif #include asm/byteorder.h #include linux/rcupdate.h #include asm/bitops.h @@ -182,6 +185,7 @@ #define F_VID_RND (19) /* Random VLAN ID */ #define F_SVID_RND(110) /* Random SVLAN ID */ #define F_FLOW_SEQ(111) /* Sequential flows */ +#define F_IPSEC_ON(112) /* ipsec on for flows */ /* Thread control flag bits */ #define T_TERMINATE (10) @@ -208,6 +212,9 @@ static struct proc_dir_entry *pg_proc_dir = NULL; struct flow_state { __be32 cur_daddr; int count; +#ifdef CONFIG_XFRM + struct xfrm_state *x; +#endif __u32 flags; }; @@ -348,7 +355,10 @@ struct pktgen_dev { unsigned lflow; /* Flow length (config) */ unsigned nflows;/* accumulated flows (stats) */ unsigned curfl; /* current sequenced flow (state)*/ - +#ifdef CONFIG_XFRM + __u8ipsmode;/* IPSEC mode (config) */ + __u8ipsproto; /* IPSEC type (config) */ +#endif char result[512]; }; @@ -704,6 +714,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v) seq_printf(seq, FLOW_RND ); } + if (pkt_dev-flags F_IPSEC_ON) + seq_printf(seq, IPSEC ); + if (pkt_dev-flags F_MACSRC_RND) seq_printf(seq, MACSRC_RND ); @@ -1198,6 +1211,11 @@ static ssize_t pktgen_if_write(struct file *file, else if (strcmp(f, FLOW_SEQ) == 0) pkt_dev-flags |= F_FLOW_SEQ; +#ifdef CONFIG_XFRM + else if (strcmp(f, IPSEC) == 0) + pkt_dev-flags |= F_IPSEC_ON; +#endif + else if (strcmp(f, !IPV6) == 0) pkt_dev-flags = ~F_IPV6; @@ -1206,7 +1224,7 
@@ static ssize_t pktgen_if_write(struct file *file, Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s, f, IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, - MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ\n); + MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC\n); return count; } sprintf(pg_result, OK: flags=0x%x, pkt_dev-flags); @@ -2094,6 +2112,7 @@ static void spin(struct pktgen_dev *pkt_dev, __u64 spin_until_us) static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) { + pkt_dev-pkt_overhead = 0; pkt_dev-pkt_overhead += pkt_dev-nr_labels*sizeof(u32); pkt_dev-pkt_overhead += VLAN_TAG_SIZE(pkt_dev); pkt_dev-pkt_overhead += SVLAN_TAG_SIZE(pkt_dev); @@ -2130,6 +2149,31 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) return pkt_dev-curfl; } + +#ifdef CONFIG_XFRM +/* If there was already an IPSEC SA, we keep it as is, else + * we go look for it ... +*/ +inline +void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) +{ + struct xfrm_state *x = pkt_dev-flows[flow].x; + if (!x) { + /*slow path: we dont already have xfrm_state*/ + x = xfrm_stateonly_find((xfrm_address_t *)pkt_dev-cur_daddr, + (xfrm_address_t *)pkt_dev-cur_saddr, + AF_INET, + pkt_dev-ipsmode, + pkt_dev-ipsproto, 0); + if (x) { + pkt_dev-flows[flow].x = x; + set_pkt_overhead(pkt_dev); + pkt_dev-pkt_overhead+=x-props.header_len; + } + + } +} +#endif /* Increment/randomize headers according to flags and current values * for IP src/dest, UDP src/dst port, MAC-Addr src/dst */ @@ -2289,6 +2333,10 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev-flows[flow].flags |= F_INIT; pkt_dev-flows[flow].cur_daddr = pkt_dev-cur_daddr; +#ifdef CONFIG_XFRM + if (pkt_dev-flags F_IPSEC_ON) + get_ipsec_sa(pkt_dev, flow); +#endif
[RFC PATCH net-2.6] [TCP]: Congestion control API RTT sampling fix
I was thinking something like this to fix the cc module breakage introduced by the API change (haven't tested it besides compile): [RFC PATCH net-2.6] [TCP]: Congestion control API RTT sampling fix Commit 164891aadf1721fca4dce473bb0e0998181537c6 broke RTT sampling of congestion control modules. Inaccurate timestamps could be fed to them without providing any way for them to identify such cases. Previously RTT sampler was called only if FLAG_RETRANS_DATA_ACKED was not set filtering inaccurate timestamps nicely. In addition, the new behavior could give an invalid timestamp (zero) to RTT sampler if only skbs with TCPCB_RETRANS were ACKed. This solves both problems. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/linux/ktime.h | 18 ++ include/linux/skbuff.h |4 net/ipv4/tcp_illinois.c |3 +++ net/ipv4/tcp_input.c|6 +- net/ipv4/tcp_lp.c |3 ++- net/ipv4/tcp_vegas.c|3 +++ net/ipv4/tcp_veno.c |3 +++ 7 files changed, 38 insertions(+), 2 deletions(-) diff --git a/include/linux/ktime.h b/include/linux/ktime.h index c762954..9f7fa3e 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -102,6 +102,12 @@ static inline ktime_t ktime_set(const long secs, const unsigned long nsecs) #define ktime_add_ns(kt, nsval) \ ({ (ktime_t){ .tv64 = (kt).tv64 + (nsval) }; }) +/* Compare two ktime_t variables, returns 1 if equal */ +static inline int ktime_equal(const ktime_t cmp1, const ktime_t cmp2) +{ + return cmp1.tv64 == cmp2.tv64; +} + /* convert a timespec to ktime_t format: */ static inline ktime_t timespec_to_ktime(struct timespec ts) { @@ -200,6 +206,18 @@ static inline ktime_t ktime_add(const ktime_t add1, const ktime_t add2) extern ktime_t ktime_add_ns(const ktime_t kt, u64 nsec); /** + * ktime_equal - Compares two ktime_t variables to see if they are equal + * @cmp1: comparable1 + * @cmp2: comparable2 + * + * Compare two ktime_t variables, returns 1 if equal + */ +static inline int ktime_equal(const ktime_t cmp1, const ktime_t cmp2) +{ + return !((cmp1.tv.sec ^ 
cmp2.tv.sec) | (cmp1.tv.usec ^ cmp2.tv.usec)); +} + +/** * timespec_to_ktime - convert a timespec to ktime_t format * @ts:the timespec variable to convert * diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e7367c7..6f0b2f7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1579,6 +1579,10 @@ static inline ktime_t net_timedelta(ktime_t t) return ktime_sub(ktime_get_real(), t); } +static inline ktime_t net_invalid_timestamp(void) +{ + return ktime_set(0, 0); +} extern __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len); extern __sum16 __skb_checksum_complete(struct sk_buff *skb); diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 4adc47c..5f8d01b 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -90,6 +90,9 @@ static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, ktime_t last) ca-acked = pkts_acked; + if (ktime_equal(last, net_invalid_timestamp()) + return; + rtt = ktime_to_us(net_timedelta(last)); /* ignore bogus values, this prevents wraparound in alpha math */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ed4a1bd..d506bdc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2409,7 +2409,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) int acked = 0; int prior_packets = tp-packets_out; __s32 seq_rtt = -1; - ktime_t last_ackt = ktime_set(0,0); + ktime_t last_ackt = net_invalid_timestamp(); while ((skb = tcp_write_queue_head(sk)) skb != tcp_send_head(sk)) { @@ -2487,6 +2487,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_ack_packets_out(sk); + /* Is the ACK triggering packet unambiguous? 
*/ + if (acked FLAG_RETRANS_DATA_ACKED) + last_ackt = net_invalid_timestamp(); + if (ca_ops-pkts_acked) ca_ops-pkts_acked(sk, pkts_acked, last_ackt); } diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index 43294ad..efa358b 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -266,7 +266,8 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, ktime_t last) struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); - tcp_lp_rtt_sample(sk, ktime_to_us(net_timedelta(last))); + if (!ktime_equal(last, net_invalid_timestamp()) + tcp_lp_rtt_sample(sk, ktime_to_us(net_timedelta(last))); /* calc inference */ if (tcp_time_stamp tp-rx_opt.rcv_tsecr) diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 73e19cf..bd7a08f 100644 ---
Re: [PATCH][RFC] network splice receive
On Tue, Jun 12 2007, Evgeniy Polyakov wrote: On Sat, Jun 09, 2007 at 08:36:09AM +0200, Jens Axboe ([EMAIL PROTECTED]) wrote: On Fri, Jun 08 2007, Evgeniy Polyakov wrote: On Fri, Jun 08, 2007 at 06:57:25PM +0400, Evgeniy Polyakov ([EMAIL PROTECTED]) wrote: I will try some things for the nearest 30-60 minutes, and then will move to canoe trip until thuesday, so will not be able to work on this idea. Ok, replacing in fs/splice.c every page_cache_release() with static void splice_page_release(struct page *p) { if (!PageSlab(p)) page_cache_release(p); } Ehm, I don't see why that should be necessary. Except in splice_to_pipe(), I have considered that we need to pass in a release function if mapping fails at some point. But it's probably best to do that in the caller, since they have the knowledge of how to release the pages. The rest of the PageSlab() tests are bogus. I had a crashdump, where page was released via splice_to_pipe() indeed, I did not investigate if it is possible to release provided page in other places. I think if in future there will other slab usage cases except networking receiving, that might be useful, but as is it is not needed. Read the just posted code, it has moved way beyond this :-) and putting cloned skb into private field instead of original on in spd_fill_page() ends up without kernel hung. Why? Seems pointless to allocate a clone just to hold on to the skb, a reference should be equally good. I would not be opposed to doing it this way, I just don't see what a clone buys us as compared to just holding that reference to the skb. Receiving code does not expect shared skbs - too many fields are changed with assumptions that it is a private copy. Actually the main problem is that tcp_read_sock() unconditionally frees the skb, so it wouldn't help if we grabbed a reference to it. I've yet to receive an explanation of why it does so, seem awkward and violates the whole principle of reference counted objects. Davem?? 
So for now, skb_splice_bits() clones the incoming skb to avoid that. I'd hope we can get rid of that by fixing tcp_read_sock(), though. -- Jens Axboe - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
pmtu discovery on sa esp
Hello everybody. I have just upgraded from 2.6.21.3 to 2.6.22-rc4 and I get a ton of pmtu discovery on sa esp/blablab/blabla messages (this box is running openswan). Is this an expected behaviour? TIA - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] NET: Multiqueue network device support.
On Tue, 2007-12-06 at 11:19 +0200, Johannes Berg wrote: On Mon, 2007-06-11 at 08:23 -0400, jamal wrote: Sure. Packets stashed on the any DMA ring are considered gone to the wire. That is a very valid assumption to make. Not at all! Packets could be on the DMA queue forever if you're feeding out more packets. Heck, on most wireless hardware packets can even be *expired* from the DMA queue and you get an indication that it was impossible to send them. The spirit of the discussion you are quoting was much higher level than that. Yes what you describe can happen on any DMA (to hard-disk etc) A simpler example, if you tcpdump on an outgoing packet you see it on its way to the driver - it is accounted for as gone[1]. In any case, read the rest of the thread. cheers, jamal [1] Current Linux tcpdumping is not that accurate, but i dont wanna go into that discussion - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC] network splice receive
On Tue, Jun 12, 2007 at 01:33:54PM +0200, Jens Axboe ([EMAIL PROTECTED]) wrote: I had a crashdump, where page was released via splice_to_pipe() indeed, I did not investigate if it is possible to release provided page in other places. I think if in future there will be other slab usage cases except networking receiving, that might be useful, but as is it is not needed. Read the just posted code, it has moved way beyond this :-) It is just a side result of traditional optimization technique called vim ':%s/page_cache_release/splice_page_release' :) and putting cloned skb into private field instead of original one in spd_fill_page() ends up without kernel hung. Why? Seems pointless to allocate a clone just to hold on to the skb, a reference should be equally good. I would not be opposed to doing it this way, I just don't see what a clone buys us as compared to just holding that reference to the skb. Receiving code does not expect shared skbs - too many fields are changed with assumptions that it is a private copy. Actually the main problem is that tcp_read_sock() unconditionally frees the skb, so it wouldn't help if we grabbed a reference to it. I've yet to receive an explanation of why it does so, seems awkward and violates the whole principle of reference counted objects. Davem?? So for now, skb_splice_bits() clones the incoming skb to avoid that. I'd hope we can get rid of that by fixing tcp_read_sock(), though. It does that because it knows, that skb is not allowed to be shared there. Similar things are being done in udp for example - code changes internal members of skb, since it knows skb is not shared. For example generic_make_request() is not allowed to change, say, bio->bi_sector or bi_destructor, since it does not own a block request, no matter what bi_cnt is. From another side, ->bi_destructor() can do whatever it wants with bio without any check for its reference counter. According to sk_eat_skb() - it is an optimisation to remove atomic check. 
-- Jens Axboe -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: pmtu discovery on sa esp
Marco Berizzi wrote: Hello everybody. I have just upgraded from 2.6.21.3 to 2.6.22-rc4 and I get a ton of pmtu discovery on sa esp/blablab/blabla messages (this box is running openswan). Is this an expected behaviour? We have some MTU optimizations in 2.6.22-rc that might be related. Please check with tcpdump what exactly is happening and whether the 2.6.22-rc box is sending too large packets. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC] network splice receive
On Tue, Jun 12 2007, Evgeniy Polyakov wrote: and putting cloned skb into private field instead of original on in spd_fill_page() ends up without kernel hung. Why? Seems pointless to allocate a clone just to hold on to the skb, a reference should be equally good. I would not be opposed to doing it this way, I just don't see what a clone buys us as compared to just holding that reference to the skb. Receiving code does not expect shared skbs - too many fields are changed with assumptions that it is a private copy. Actually the main problem is that tcp_read_sock() unconditionally frees the skb, so it wouldn't help if we grabbed a reference to it. I've yet to receive an explanation of why it does so, seem awkward and violates the whole principle of reference counted objects. Davem?? So for now, skb_splice_bits() clones the incoming skb to avoid that. I'd hope we can get rid of that by fixing tcp_read_sock(), though. It does that because it knows, that skb is not allowed to be shared there. Similar things are being done in udp for example - code changes internal mebers of skb, since it knows skb is not shared. For example generic_make_request() is not allowed to change, say, bio-bi_sector or bi_destructor, since it does not own a block request, not matter what bi_cnt is. From another side, -bi_destructor() can do whatever it wants with bio without any check for its reference counter. But generic_make_request() DOES change -bi_sector, that's how partition remapping works :-). The destructor can of course do whatever it wants, by definition the bio is not referenced at that point (or it would not have been called). So while I think your analogy is quite poor, I do now follow the code (even if I think it's ugly). There's quite a big difference between changing parts of the elements of a structure to just grabbing a reference to it. If the skb cannot be referenced, skb_get() should return NULL. But that aside, I see the issue. 
I'll just stick to the clone, it works fine as-is (well almost, there's a leak there, but functionally it's ok!). -- Jens Axboe - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch 2/4] lockdep: fixup sk_callback_lock annotation
The two init sites resulted in inconsistent names for the lock class. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] Acked-by: Ingo Molnar [EMAIL PROTECTED] Cc: netdev@vger.kernel.org --- net/core/sock.c | 23 +++ 1 file changed, 19 insertions(+), 4 deletions(-) Index: linux-2.6/net/core/sock.c === --- linux-2.6.orig/net/core/sock.c +++ linux-2.6/net/core/sock.c @@ -171,6 +171,19 @@ static const char *af_family_slock_key_s slock-AF_TIPC , slock-AF_BLUETOOTH, slock-AF_IUCV , slock-AF_RXRPC , slock-AF_MAX }; +static const char *af_family_clock_key_strings[AF_MAX+1] = { + clock-AF_UNSPEC, clock-AF_UNIX , clock-AF_INET , + clock-AF_AX25 , clock-AF_IPX , clock-AF_APPLETALK, + clock-AF_NETROM, clock-AF_BRIDGE , clock-AF_ATMPVC , + clock-AF_X25 , clock-AF_INET6, clock-AF_ROSE , + clock-AF_DECnet, clock-AF_NETBEUI , clock-AF_SECURITY , + clock-AF_KEY , clock-AF_NETLINK , clock-AF_PACKET , + clock-AF_ASH , clock-AF_ECONET , clock-AF_ATMSVC , + clock-21 , clock-AF_SNA , clock-AF_IRDA , + clock-AF_PPPOX , clock-AF_WANPIPE , clock-AF_LLC , + clock-27 , clock-28 , clock-29 , + clock-AF_TIPC , clock-AF_BLUETOOTH, clock-AF_MAX +}; #endif /* @@ -941,8 +954,9 @@ struct sock *sk_clone(const struct sock rwlock_init(newsk-sk_dst_lock); rwlock_init(newsk-sk_callback_lock); - lockdep_set_class(newsk-sk_callback_lock, - af_callback_keys + newsk-sk_family); + lockdep_set_class_and_name(newsk-sk_callback_lock, + af_callback_keys + newsk-sk_family, + af_family_clock_key_strings[newsk-sk_family]); newsk-sk_dst_cache = NULL; newsk-sk_wmem_queued = 0; @@ -1530,8 +1544,9 @@ void sock_init_data(struct socket *sock, rwlock_init(sk-sk_dst_lock); rwlock_init(sk-sk_callback_lock); - lockdep_set_class(sk-sk_callback_lock, - af_callback_keys + sk-sk_family); + lockdep_set_class_and_name(sk-sk_callback_lock, + af_callback_keys + sk-sk_family, + af_family_clock_key_strings[sk-sk_family]); sk-sk_state_change = sock_def_wakeup; sk-sk_data_ready = sock_def_readable; -- - To unsubscribe from this list: send 
the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC] network splice receive
On Tue, Jun 12, 2007 at 02:40:05PM +0200, Jens Axboe ([EMAIL PROTECTED]) wrote: On Tue, Jun 12 2007, Evgeniy Polyakov wrote: and putting cloned skb into private field instead of original on in spd_fill_page() ends up without kernel hung. Why? Seems pointless to allocate a clone just to hold on to the skb, a reference should be equally good. I would not be opposed to doing it this way, I just don't see what a clone buys us as compared to just holding that reference to the skb. Receiving code does not expect shared skbs - too many fields are changed with assumptions that it is a private copy. Actually the main problem is that tcp_read_sock() unconditionally frees the skb, so it wouldn't help if we grabbed a reference to it. I've yet to receive an explanation of why it does so, seem awkward and violates the whole principle of reference counted objects. Davem?? So for now, skb_splice_bits() clones the incoming skb to avoid that. I'd hope we can get rid of that by fixing tcp_read_sock(), though. It does that because it knows, that skb is not allowed to be shared there. Similar things are being done in udp for example - code changes internal mebers of skb, since it knows skb is not shared. For example generic_make_request() is not allowed to change, say, bio-bi_sector or bi_destructor, since it does not own a block request, not matter what bi_cnt is. From another side, -bi_destructor() can do whatever it wants with bio without any check for its reference counter. But generic_make_request() DOES change -bi_sector, that's how partition remapping works :-). The destructor can of course do whatever it wants, by definition the bio is not referenced at that point (or it would not have been called). So while I think your analogy is quite poor, I do now follow the code (even if I think it's ugly). There's quite a big Yeah, that was quite long time ago I hacked block layer :) Good we found a way to explain the issue. 
difference between changing parts of the elements of a structure to just grabbing a reference to it. If the skb cannot be referenced, skb_get() should return NULL. But that aside, I see the issue. I'll just stick to the clone, it works fine as-is (well almost, there's a leak there, but functionally it's ok!). Btw, is it allowed to use splice from network with, say, nfs? Since RPC code uses sk_user_data as long as network splice. -- Jens Axboe -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC] network splice receive
On Tue, Jun 12 2007, Evgeniy Polyakov wrote: difference between changing parts of the elements of a structure to just grabbing a reference to it. If the skb cannot be referenced, skb_get() should return NULL. But that aside, I see the issue. I'll just stick to the clone, it works fine as-is (well almost, there's a leak there, but functionally it's ok!). Btw, is it allowed to use splice from network with, say, nfs? Since RPC code uses sk_user_data as long as network splice. It doesn't anymore, see the version posted today (or yesterday, but it would be silly to read older code than the newest :-) -- Jens Axboe - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] NET: Multiqueue network device support.
jamal wrote: the qdisc has a chance to hand out either a packet of the same priority or higher priority, but at the cost of at worst (n - 1) * m unnecessary dequeues+requeues in case there is only a packet of lowest priority and we need to fully serve all higher priority HW queues before it can actually be dequeued. yes, i see that. [It actually is related to the wake threshold you use in the driver. tg3 and e1000 for example will do it after 30 or so packets. But i get your point - what you are trying to describe is a worst case scenario]. Yes. Using a higher threshold reduces the overhead, but leads to lower priority packets getting out even if higher priority packets are present in the qdisc. Note that if we use the threshold with multiple queue states (threshold per ring) this doesn't happen. The other possibility would be to activate the queue again once all rings can take packets again, but that wouldn't fix the problem, which you can easily see if you go back to my example and assume we still have a low priority packet within the qdisc when the lowest priority ring fills up (and the queue is stopped), and after we tried to wake it and stopped it again the higher priority packet arrives. In your use case, only low prio packets are available on the stack. Above you mention arrival of high prio - assuming thats intentional and not it being late over there ;- If higher prio packets are arriving on the qdisc when you open up, then given strict prio those packets get to go to the driver first until there are no more left; followed of course by low prio which then shutdown the path again... Whats happening is: Lowest priority ring fills up, queue is stopped. We have more packets for it in the qdisc. A higher priority packet is transmitted, the queue is woken up again, the lowest priority packet goes to the driver and hits the full ring, packet is requeued and queue shut down until ring frees up again. Now a high priority packet arrives. 
It won't get to the driver anymore. But its not very important since having two different wakeup-strategies would be a bit strange anyway, so lets just rule out this possibility. Considering your proposal in combination with RR, you can see the same problem of unnecessary dequeues+requeues. Well, we havent really extended the use case from prio to RR. But this is a good start as any since all sorts of work conserving schedulers will behave in a similar fashion .. Since there is no priority for waking the queue when a equal or higher priority ring got dequeued as in the prio case, I presume you would wake the queue whenever a packet was sent. I suppose that is a viable approach if the hardware is RR based. Actually in the case of e1000 it is WRR not plain RR, but that is a moot point which doesnt affect the discussion. For the RR qdisc dequeue after requeue should hand out the same packet, independantly of newly enqueued packets (which doesn't happen and is a bug in Peter's RR version), so in the worst case the HW has to make the entire round before a packet can get dequeued in case the corresponding HW queue is full. This is a bit better than prio, but still up to n - 1 unnecessary requeues+dequeues. I think it can happen more often than for prio though. I think what would better to be use is DRR. I pointed the code i did a long time ago to Peter. With DRR, a deficit is viable to be carried forward. If both driver and HW do it, its probably OK for short term, but it shouldn't grow too large since short-term fairness is also important. But the unnecessary dequeues+requeues can still happen. Forgetting about things like multiple qdisc locks and just looking at queueing behaviour, the question seems to come down to whether the unnecessary dequeues/requeues are acceptable (which I don't think since they are easily avoidable). As i see it, the worst case scenario would have a finite time. 
A 100Mbps NIC should be able to dish out, depending on packet size, 148Kpps to 8.6Kpps; a GigE 10x that. so i think the phase in general won't last that long given the assumption is packets are coming in from the stack to the driver with about the packet rate equivalent to wire rate (for the case of all work conserving schedulers). In the general case there should be no contention at all. It does have finite time, but it's still undesirable. The average case would probably have been more interesting, but it's also harder :) I also expect to see lots of requeues under normal load that doesn't resemble the worst-case, but only tests can confirm that. OTOH you could turn it around and argue that the patches won't do much harm since ripping them out again (modulo queue mapping) should result in the same behaviour with just more overhead. I am not sure i understood - but note that i have asked for a middle ground from the beginning. I just mean that we could rip the patches out at any
[PATCH] pktgen IPSEC 3/4: Introduce xfrm SAD only lookup
jamal writes: 3 of 4 .. [XFRM] Introduce standalone SAD lookup This allows other in-kernel functions to do SAD lookups. The only known user at the moment is pktgen. Signed-off-by: Jamal Hadi Salim [EMAIL PROTECTED] xfrm is not my area. Acked-by: Robert Olsson [EMAIL PROTECTED] Cheers --ro diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 311f25a..79d2c37 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -920,6 +920,10 @@ extern struct xfrm_state *xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t struct flowi *fl, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, unsigned short family); +extern struct xfrm_state * xfrm_stateonly_find(xfrm_address_t *daddr, + xfrm_address_t *saddr, + unsigned short family, + u8 mode, u8 proto, u32 reqid); extern int xfrm_state_check_expire(struct xfrm_state *x); extern void xfrm_state_insert(struct xfrm_state *x); extern int xfrm_state_add(struct xfrm_state *x); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 85f3f43..b8562e4 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -686,6 +686,41 @@ out: return x; } +struct xfrm_state * +xfrm_stateonly_find(xfrm_address_t *daddr, xfrm_address_t *saddr, +unsigned short family, u8 mode, u8 proto, u32 reqid) +{ +unsigned int h = xfrm_dst_hash(daddr, saddr, reqid, family); +struct xfrm_state *rx = NULL, *x = NULL; +struct hlist_node *entry; + +spin_lock(xfrm_state_lock); +hlist_for_each_entry(x, entry, xfrm_state_bydst+h, bydst) { +if (x-props.family == family +x-props.reqid == reqid +!(x-props.flags XFRM_STATE_WILDRECV) +xfrm_state_addr_check(x, daddr, saddr, family) +mode == x-props.mode +proto == x-id.proto) { + +if (x-km.state != XFRM_STATE_VALID) +continue; +else { +rx = x; +break; +} +} +} + +if (rx) +xfrm_state_hold(rx); +spin_unlock(xfrm_state_lock); + + +return rx; +} +EXPORT_SYMBOL(xfrm_stateonly_find); + static void __xfrm_state_insert(struct xfrm_state *x) { unsigned int h; - To unsubscribe from this list: send the 
line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] pktgen IPSEC 1/4: Centralize pktgen packet overhead management
jamal writes: Manual labor still ... 1 of 4 [PKTGEN] Centralize packet overhead tracking Track the extra packet overhead for VLAN tags, MPLS, IPSEC etc Signed-off-by: Jamal Hadi Salim [EMAIL PROTECTED] Thanks, Jamal. I'll guess the ipsec part is to be considered work-in-progress and you're doing both the work and the progress. Signed-off-by: Robert Olsson [EMAIL PROTECTED] Cheers --ro diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 9cd3a1c..1352316 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -228,6 +228,7 @@ struct pktgen_dev { int min_pkt_size; /* = ETH_ZLEN; */ int max_pkt_size; /* = ETH_ZLEN; */ +int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ int nfrags; __u32 delay_us; /* Default delay */ __u32 delay_ns; @@ -2075,6 +2076,13 @@ static void spin(struct pktgen_dev *pkt_dev, __u64 spin_until_us) pkt_dev-idle_acc += now - start; } +static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) +{ +pkt_dev-pkt_overhead += pkt_dev-nr_labels*sizeof(u32); +pkt_dev-pkt_overhead += VLAN_TAG_SIZE(pkt_dev); +pkt_dev-pkt_overhead += SVLAN_TAG_SIZE(pkt_dev); +} + /* Increment/randomize headers according to flags and current values * for IP src/dest, UDP src/dst port, MAC-Addr src/dst */ @@ -2323,9 +2331,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, datalen = (odev-hard_header_len + 16) ~0xf; skb = alloc_skb(pkt_dev-cur_pkt_size + 64 + datalen + -pkt_dev-nr_labels*sizeof(u32) + -VLAN_TAG_SIZE(pkt_dev) + SVLAN_TAG_SIZE(pkt_dev), -GFP_ATOMIC); +pkt_dev-pkt_overhead, GFP_ATOMIC); if (!skb) { sprintf(pkt_dev-result, No memory); return NULL; @@ -2368,7 +2374,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, /* Eth + IPh + UDPh + mpls */ datalen = pkt_dev-cur_pkt_size - 14 - 20 - 8 - - pkt_dev-nr_labels*sizeof(u32) - VLAN_TAG_SIZE(pkt_dev) - SVLAN_TAG_SIZE(pkt_dev); + pkt_dev-pkt_overhead; if (datalen sizeof(struct pktgen_hdr)) datalen = sizeof(struct pktgen_hdr); @@ -2391,8 +2397,7 @@ static struct 
sk_buff *fill_packet_ipv4(struct net_device *odev, iph-check = ip_fast_csum((void *)iph, iph-ihl); skb-protocol = protocol; skb-mac_header = (skb-network_header - ETH_HLEN - - pkt_dev-nr_labels * sizeof(u32) - - VLAN_TAG_SIZE(pkt_dev) - SVLAN_TAG_SIZE(pkt_dev)); + pkt_dev-pkt_overhead); skb-dev = odev; skb-pkt_type = PACKET_HOST; @@ -2662,9 +2667,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, mod_cur_headers(pkt_dev); skb = alloc_skb(pkt_dev-cur_pkt_size + 64 + 16 + -pkt_dev-nr_labels*sizeof(u32) + -VLAN_TAG_SIZE(pkt_dev) + SVLAN_TAG_SIZE(pkt_dev), -GFP_ATOMIC); +pkt_dev-pkt_overhead, GFP_ATOMIC); if (!skb) { sprintf(pkt_dev-result, No memory); return NULL; @@ -2708,7 +2711,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, /* Eth + IPh + UDPh + mpls */ datalen = pkt_dev-cur_pkt_size - 14 - sizeof(struct ipv6hdr) - sizeof(struct udphdr) - - pkt_dev-nr_labels*sizeof(u32) - VLAN_TAG_SIZE(pkt_dev) - SVLAN_TAG_SIZE(pkt_dev); + pkt_dev-pkt_overhead; if (datalen sizeof(struct pktgen_hdr)) { datalen = sizeof(struct pktgen_hdr); @@ -2738,8 +2741,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, ipv6_addr_copy(iph-saddr, pkt_dev-cur_in6_saddr); skb-mac_header = (skb-network_header - ETH_HLEN - - pkt_dev-nr_labels * sizeof(u32) - - VLAN_TAG_SIZE(pkt_dev) - SVLAN_TAG_SIZE(pkt_dev)); + pkt_dev-pkt_overhead); skb-protocol = protocol; skb-dev = odev; skb-pkt_type = PACKET_HOST; @@ -2857,6 +2859,7 @@ static void pktgen_run(struct pktgen_thread *t) pkt_dev-started_at = getCurUs(); pkt_dev-next_tx_us = getCurUs(); /* Transmit immediately */ pkt_dev-next_tx_ns = 0; +set_pkt_overhead(pkt_dev); strcpy(pkt_dev-result, Starting); started++; - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] pktgen IPSEC 3/4: Introduce xfrm SAD only lookup
Looks good too me, just a few minor nitpicks as usual :) jamal wrote: [XFRM] Introduce standalone SAD lookup +struct xfrm_state * +xfrm_stateonly_find(xfrm_address_t *daddr, xfrm_address_t *saddr, + unsigned short family, u8 mode, u8 proto, u32 reqid) +{ + unsigned int h = xfrm_dst_hash(daddr, saddr, reqid, family); + struct xfrm_state *rx = NULL, *x = NULL; + struct hlist_node *entry; + + spin_lock(xfrm_state_lock); + hlist_for_each_entry(x, entry, xfrm_state_bydst+h, bydst) { + if (x-props.family == family + x-props.reqid == reqid + !(x-props.flags XFRM_STATE_WILDRECV) + xfrm_state_addr_check(x, daddr, saddr, family) + mode == x-props.mode + proto == x-id.proto) { + ^^ please delete empty line + if (x-km.state != XFRM_STATE_VALID) + continue; ^ one indentation level too much + else { + rx = x; + break; + } The whole thing could be compacted by moving the XFRM_STATE_VALID check to the first condition: if (x-props.family == family x-props.reqid == reqid !(x-props.flags XFRM_STATE_WILDRECV) xfrm_state_addr_check(x, daddr, saddr, family) mode == x-props.mode proto == x-id.proto x-km.state == XFRM_STATE_VALID) { rx = x; break; } or alternatively turn the != XFRM_STATE_VALID into == if you want to keep the first condition similar to xfrm_state_find (but the mode and proto conditions are reversed anyways). BTW, wouldn't it make sense to allow use of the SPI as well? - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] pktgen IPSEC 4/4: Add IPSEC support to pktgen
jamal writes: 4 of 4 [PKTGEN] IPSEC support Added transport mode ESP support for starters. I will send more of these modes and types once i have resolved the tunnel mode isses. Signed-off-by: Jamal Hadi Salim [EMAIL PROTECTED] Signed-off-by: Robert Olsson [EMAIL PROTECTED] Cheers --ro diff --git a/net/core/pktgen.c b/net/core/pktgen.c index bc4fb3b..bcec8e4 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -152,6 +152,9 @@ #include net/checksum.h #include net/ipv6.h #include net/addrconf.h +#ifdef CONFIG_XFRM +#include net/xfrm.h +#endif #include asm/byteorder.h #include linux/rcupdate.h #include asm/bitops.h @@ -182,6 +185,7 @@ #define F_VID_RND (19)/* Random VLAN ID */ #define F_SVID_RND(110) /* Random SVLAN ID */ #define F_FLOW_SEQ(111) /* Sequential flows */ +#define F_IPSEC_ON(112) /* ipsec on for flows */ /* Thread control flag bits */ #define T_TERMINATE (10) @@ -208,6 +212,9 @@ static struct proc_dir_entry *pg_proc_dir = NULL; struct flow_state { __be32 cur_daddr; int count; +#ifdef CONFIG_XFRM +struct xfrm_state *x; +#endif __u32 flags; }; @@ -348,7 +355,10 @@ struct pktgen_dev { unsigned lflow; /* Flow length (config) */ unsigned nflows;/* accumulated flows (stats) */ unsigned curfl; /* current sequenced flow (state)*/ - +#ifdef CONFIG_XFRM +__u8ipsmode;/* IPSEC mode (config) */ +__u8ipsproto; /* IPSEC type (config) */ +#endif char result[512]; }; @@ -704,6 +714,9 @@ static int pktgen_if_show(struct seq_file *seq, void *v) seq_printf(seq, FLOW_RND ); } +if (pkt_dev-flags F_IPSEC_ON) +seq_printf(seq, IPSEC ); + if (pkt_dev-flags F_MACSRC_RND) seq_printf(seq, MACSRC_RND ); @@ -1198,6 +1211,11 @@ static ssize_t pktgen_if_write(struct file *file, else if (strcmp(f, FLOW_SEQ) == 0) pkt_dev-flags |= F_FLOW_SEQ; +#ifdef CONFIG_XFRM +else if (strcmp(f, IPSEC) == 0) +pkt_dev-flags |= F_IPSEC_ON; +#endif + else if (strcmp(f, !IPV6) == 0) pkt_dev-flags = ~F_IPV6; @@ -1206,7 +1224,7 @@ static ssize_t pktgen_if_write(struct file *file, Flag -:%s:- 
unknown\nAvailable flags, (prepend ! to un-set flag):\n%s, f, IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, -MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ\n); +MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC\n); return count; } sprintf(pg_result, OK: flags=0x%x, pkt_dev-flags); @@ -2094,6 +2112,7 @@ static void spin(struct pktgen_dev *pkt_dev, __u64 spin_until_us) static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) { +pkt_dev-pkt_overhead = 0; pkt_dev-pkt_overhead += pkt_dev-nr_labels*sizeof(u32); pkt_dev-pkt_overhead += VLAN_TAG_SIZE(pkt_dev); pkt_dev-pkt_overhead += SVLAN_TAG_SIZE(pkt_dev); @@ -2130,6 +2149,31 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) return pkt_dev-curfl; } + +#ifdef CONFIG_XFRM +/* If there was already an IPSEC SA, we keep it as is, else + * we go look for it ... +*/ +inline +void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) +{ +struct xfrm_state *x = pkt_dev-flows[flow].x; +if (!x) { +/*slow path: we dont already have xfrm_state*/ +x = xfrm_stateonly_find((xfrm_address_t *)pkt_dev-cur_daddr, +(xfrm_address_t *)pkt_dev-cur_saddr, +AF_INET, +pkt_dev-ipsmode, +pkt_dev-ipsproto, 0); +if (x) { +pkt_dev-flows[flow].x = x; +set_pkt_overhead(pkt_dev); +pkt_dev-pkt_overhead+=x-props.header_len; +} + +} +} +#endif /* Increment/randomize headers according to flags and current values * for IP src/dest, UDP src/dst port, MAC-Addr src/dst */ @@ -2289,6 +2333,10 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev-flows[flow].flags |= F_INIT; pkt_dev-flows[flow].cur_daddr = pkt_dev-cur_daddr; +#ifdef CONFIG_XFRM +if (pkt_dev-flags F_IPSEC_ON) +
RE: [PATCH] NET: Multiqueue network device support.
Hi Jamal, Here is a simple scenario (nothing here is rare or extreme case): - Busy wireless environment - FTP TX on BE queue (low priority) - Skype TX on VO queue (high priority) The channel is busy with high priority packets hence the BE packets are transmitted to the air rarely so the DMA/HW queue of the BE access category gets full and the qdisc is stopped. Now periodic VO-tagged Skype packets arrive. I would expect that they get the priority (and pass) in all stages of the stack and reach the HW ASAP and compete there on the medium with the other access categories and the other clients on the channel. Now this packet will be stuck in the qdisc and wait there until a BE packet is transmitted, which can take a long time. This is a real problem. There is also a problem with the queues that will be dedicated to TX aggregation in 11n (currently implemented) - the packets will be classified to queues by the destination MAC address and not only by the priority class, but I don't want to get into that now. I think that there are enough arguments now why the patch that started this thread is needed... Please see below some replies to your questions. Regards, Guy. jamal wrote: It could be estimated well by the host sw; but lets defer that to later in case i am clueless on something or you misunderstood something i said. It cannot be estimated well by the host SW. This is one of the main issues - we can't put it aside... I understand. Please correct me if I am wrong: The only reason AC_BK packet will go out instead of AC_VO when contending in hardware is because of a statistical opportunity not the firmware intentionally trying to allow AC_BK out i.e it is influenced by the three variables: 1) The contention window 2) the backoff timer and 3)the tx opportunity And if you look at the default IEEE parameters as in that url slide 43, the only time AC_BK will win is luck. 
In most scenarios BK packets will be transmitted and will win the medium against VO packets (though, in some non-favored ratio). Here's a really dated paper before the standard was ratified: http://www.mwnl.snu.ac.kr/~schoi/publication/Conferences/02-EW.pdf Sorry, I'm really overloaded - I won't be able to review the docs you sent (really apologize for that). So essentially the test you mention changes priorities in real time. What is the purpose of this test? Is WMM expected to change its priorities in real time? The WMM parameters of the AC are set and controlled by the network/BSS (access point) administrator and can be used in any way. There are the default parameters but they can be changed. Regards, Guy. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: cannot set IP for ethernet
Oliver Neukum wrote: with 2.6.22-rc4-git2 I am getting errors when setting IP for ethernet interfaces: ioctl(4, SIOCSIFADDR, 0x7fff94931600) = -1 ENOBUFS (No buffer space available) The error is independent of the interface. It happens to all interfaces. There's nothing in the syslog. valisk:/home/oliver # uname -a Linux valisk 2.6.22-rc4-git2-default #3 SMP Tue Jun 12 13:27:54 CEST 2007 x86_64 x86_64 x86_64 GNU/Linux This can happen if the initial inetdev allocation when the netdevice is registered fails. I think it would make sense to try to allocate again when adding addresses in that case, otherwise there is no way of recovery other than unregistering and registering the device again. diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index abf6352..dc77e91 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -401,8 +401,11 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) ASSERT_RTNL(); if (!in_dev) { - inet_free_ifa(ifa); - return -ENOBUFS; + in_dev = inetdev_init(dev); + if (!in_dev) { + inet_free_ifa(ifa); + return -ENOBUFS; + } } ipv4_devconf_setall(in_dev); if (ifa->ifa_dev != in_dev) { @@ -514,8 +517,11 @@ static struct in_ifaddr *rtm_to_ifaddr(struct nlmsghdr *nlh) in_dev = __in_dev_get_rtnl(dev); if (in_dev == NULL) { - err = -ENOBUFS; - goto errout; + in_dev = inetdev_init(dev); + if (!in_dev) { + err = -ENOBUFS; + goto errout; + } } ipv4_devconf_setall(in_dev);
Re: [PATCH] NET: Multiqueue network device support.
On Tue, 2007-12-06 at 15:21 +0200, Patrick McHardy wrote: jamal wrote: Yes. Using a higher threshold reduces the overhead, but leads to lower priority packets getting out even if higher priority packets are present in the qdisc. As per earlier discussion, the packets already given to hardware should be fine to go out first. If they get overridden by the chance arrival of higher prio packets from the stack, then that is fine. Note that if we use the threshold with multiple queue states (threshold per ring) this doesn't happen. I think if you do the math, you'll find that (n - 1) * m is actually not that unreasonable given parameters typically used on the drivers; Lets for example take the parameters from e1000; the tx ring is around 256, the wake threshold is 32 packets (although i have found a better number is 1/2 the tx size and have that changed in my batching patches). Assume such a driver with above parameters doing Gige exists and it implements 4 queues (n = 4); in such a case, (n-1)*m/32 is 3*256/32 = 3*8 = 24 times. You have to admit your use case is a real corner case but lets be conservative since we are doing a worst case scenario and from that perspective consider that gige can be achieved at pkt levels of 86Kpps to 1.48Mpps and if you are non-work conserving you will be running at that rate and lets pick the low end of 86Kpps - what that means is there is a blip (remember again this to be a corner case) for a few microsecs once in a while with probability of what you described actually occurring... Ok, so then update the threshold to 1/2 the tx ring etc and it is even less. You get the message. If both driver and HW do it, its probably OK for short term, but it shouldn't grow too large since short-term fairness is also important. But the unnecessary dequeues+requeues can still happen. In a corner case, yes there is a probability that will happen. I think its extremely low. It does have finite time, but its still undesirable. 
The average case would probably have been more interesting, but its also harder :) I also expect to see lots of requeues under normal load that doesn't resemble the worst-case, but only tests can confirm that. And that is what i was asking of Peter. Some testing. Clearly the subqueueing is more complex; what i am asking for is for the driver to bear the brunt and not for it to be an impacting architectural change. I am not sure i understood - but note that i have asked for a middle ground from the beginning. I just mean that we could rip the patches out at any point again without user visible impact aside from more overhead. So even if they turn out to be a mistake its easily correctable. That is a good compromise i think. The reason i am spending my time discussing this is i believe this to be a very important subsystem. You know i have been vociferous for years on this topic. What i was worried about is these patches make it and become engrained with hot lava on stone. I've also looked into moving all multiqueue specific handling to the top-level qdisc out of sch_generic, unfortunately that leads to races unless all subqueue state operations takes dev->qdisc_lock. Besides the overhead I think it would lead to ABBA deadlocks. I am confident you can handle that. So how do we move forward? What you described above is a good compromise IMO. I don't have much time to chase this path at the moment but what it does is give me freedom to revisit later on with data points. More importantly you understand my view;- And of course you did throw a lot of rocks but it is a definite alternative ;- cheers, jamal - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] pktgen IPSEC 1/4: Centralize pktgen packet overhead management
On Tue, 2007-12-06 at 15:21 +0200, Robert Olsson wrote: I'll guess the ipsec part is to be considered work-in-progress and you're doing both the work and the progress. ;- Much thanks Robert. cheers, jamal - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] pktgen IPSEC 3/4: Introduce xfrm SAD only lookup
On Tue, 2007-12-06 at 15:45 +0200, Patrick McHardy wrote: Looks good too me, just a few minor nitpicks as usual :) I like the nitpicks - they make the code better (as long as we put a time limit on them ;-) ^^ please delete empty line will do. + if (x-km.state != XFRM_STATE_VALID) + continue; ^ one indentation level too much will fix. The whole thing could be compacted by moving the XFRM_STATE_VALID check to the first condition: if (x-props.family == family x-props.reqid == reqid !(x-props.flags XFRM_STATE_WILDRECV) xfrm_state_addr_check(x, daddr, saddr, family) mode == x-props.mode proto == x-id.proto x-km.state == XFRM_STATE_VALID) { rx = x; break; } or alternatively turn the != XFRM_STATE_VALID into == if you want to keep the first condition similar to xfrm_state_find (but the mode and proto conditions are reversed anyways). Will do. BTW, wouldn't it make sense to allow use of the SPI as well? SPI is the least user friendly parameter - but i could add it later. I want to add tunnel mode next then i can revisit SPI. Thanks for taking the time to review this Patrick. cheers, jamal - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH] NET: Multiqueue network device support.
Guy, I apologize for not responding immediately - i promise to in a few hours when i get back (and read it over some good coffee) - seems like you have some good stuff there; thanks for taking the time despite the overload. cheers, jamal On Tue, 2007-12-06 at 17:04 +0300, Cohen, Guy wrote: Hi Jamal, Here is a simple scenario (nothing here is rare or extreme case): - Busy wireless environment - FTP TX on BE queue (low priority) - Skype TX on VO queue (high priority) - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH net-2.6] [TCP]: Congestion control API RTT sampling fix
On Tue, 12 Jun 2007 15:06:57 +0300 (EEST) Ilpo Järvinen [EMAIL PROTECTED] wrote: I was thinking something like this to fix the cc module breakage introduced by the API change (haven't tested it besides compile): [RFC PATCH net-2.6] [TCP]: Congestion control API RTT sampling fix Commit 164891aadf1721fca4dce473bb0e0998181537c6 broke RTT sampling of congestion control modules. Inaccurate timestamps could be fed to them without providing any way for them to identify such cases. Previously RTT sampler was called only if FLAG_RETRANS_DATA_ACKED was not set filtering inaccurate timestamps nicely. In addition, the new behavior could give an invalid timestamp (zero) to RTT sampler if only skbs with TCPCB_RETRANS were ACKed. This solves both problems. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/linux/ktime.h | 18 ++ include/linux/skbuff.h |4 net/ipv4/tcp_illinois.c |3 +++ net/ipv4/tcp_input.c|6 +- net/ipv4/tcp_lp.c |3 ++- net/ipv4/tcp_vegas.c|3 +++ net/ipv4/tcp_veno.c |3 +++ 7 files changed, 38 insertions(+), 2 deletions(-) diff --git a/include/linux/ktime.h b/include/linux/ktime.h index c762954..9f7fa3e 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -102,6 +102,12 @@ static inline ktime_t ktime_set(const long secs, const unsigned long nsecs) #define ktime_add_ns(kt, nsval) \ ({ (ktime_t){ .tv64 = (kt).tv64 + (nsval) }; }) +/* Compare two ktime_t variables, returns 1 if equal */ +static inline int ktime_equal(const ktime_t cmp1, const ktime_t cmp2) +{ + return cmp1.tv64 == cmp2.tv64; +} + /* convert a timespec to ktime_t format: */ static inline ktime_t timespec_to_ktime(struct timespec ts) { @@ -200,6 +206,18 @@ static inline ktime_t ktime_add(const ktime_t add1, const ktime_t add2) extern ktime_t ktime_add_ns(const ktime_t kt, u64 nsec); /** + * ktime_equal - Compares two ktime_t variables to see if they are equal + * @cmp1:comparable1 + * @cmp2:comparable2 + * + * Compare two ktime_t variables, returns 1 if equal + */ +static inline int 
ktime_equal(const ktime_t cmp1, const ktime_t cmp2) +{ + return !((cmp1.tv.sec ^ cmp2.tv.sec) | (cmp1.tv.usec ^ cmp2.tv.usec)); +} Since ktime is a union just comparing the two 64bit values should be simpler. static inline int ktime_equal(const ktime_t t1, const ktime_t t2) { return t1.s64 == t2.s64; } - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] [RFC -v3] NET: Implement a standard ndev_printk family
On Mon, 2007-06-11 at 17:40 -0700, Auke Kok wrote: +#define ndev_err(netdev, level, format, arg...) \ + do { \ + struct net_device *__nd = (netdev); \ + if ((__nd)->msg_enable & NETIF_MSG_##level) \ + printk(KERN_ERR "%s: %s: " format, (__nd)->name, \ + (__nd)->dev.parent->bus_id, ## arg); \ + } while (0) + I think it's better to remove the macro concatenation/obfuscation of the NETIF_MSG_##level argument and simply pass the appropriate NETIF_MSG_type directly to these ndev_level calls. It would also simplify the more than 300 calls in drivers/net of if (netif_msg_type(ptr)) printk(foo) to ndev_level(netdev, NETIF_MSG_type, fmt, args) - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC] network splice receive v2
On Mon, Jun 11, 2007 at 01:59:26PM +0200, Jens Axboe ([EMAIL PROTECTED]) wrote: Patches are against the #splice branch of the block repo, official url of that is: git://git.kernel.dk/data/git/linux-2.6-block.git/ and it's based on Linus main tree. Let me know if I should supply netdev branch patches instead, or even just provide a rolled up patch (or patch series) for anyone curious to test or play with it. Hi Jens. I've just pulled your tree (splice-net, but splice tree looks the same, git pull says 'Already up-to-date.') on top of linus git and got following bug trace. I will investigate it further tomorrow. [ 51.942373] [ cut here ] [ 51.947041] kernel BUG at include/linux/mm.h:285! [ 51.951786] invalid opcode: [1] PREEMPT SMP [ 51.956680] CPU 0 [ 51.958784] Modules linked in: button loop snd_intel8x0 snd_ac97_codec psmouse ac97_bus snd_pcm snd_timer snd soundcore snd_page_alloc k8temp i2c_nforcen [ 51.988793] Pid: 2604, comm: splice-fromnet Not tainted 2.6.22-rc4-splice #2 [ 51.995886] RIP: 0010:[80389b15] [80389b15] __skb_splice_bits+0xcd/0x201 [ 52.004520] RSP: 0018:810037f23c28 EFLAGS: 00010246 [ 52.009872] RAX: RBX: 810037f23d98 RCX: 003f [ 52.017053] RDX: 81003fe93808 RSI: 81003fe93808 RDI: 0003c0a3 [ 52.024233] RBP: 810037f23c78 R08: R09: 81003780e4b8 [ 52.031412] R10: 803b01d9 R11: 810037f23de8 R12: 009a [ 52.038591] R13: R14: 810037f23c90 R15: 05a8 [ 52.045771] FS: 2b9181d2c6d0() GS:804fb000() knlGS: [ 52.053920] CS: 0010 DS: ES: CR0: 8005003b [ 52.059714] CR2: 2b9181bb60e0 CR3: 3d109000 CR4: 06e0 [ 52.066894] Process splice-fromnet (pid: 2604, threadinfo 810037f22000, task 8100010f4100) [ 52.075908] Stack: 004612d0 810037f23c94 81003780e4b8 37f23c78 [ 52.084214] faf2050e 81003780e4b8 81003780e4b8 81003e8f22d8 [ 52.091860] 81003c99c820 4d5f4ede 810037f23dd8 8038bf20 [ 52.099265] Call Trace: [ 52.101998] [8038bf20] skb_splice_bits+0x6c/0xd0 [ 52.107619] [803dc720] _read_unlock_irq+0x31/0x4e [ 52.113330] [803afc1c] tcp_splice_data_recv+0x20/0x22 [ 
52.119386] [803afaf3] tcp_read_sock+0xa2/0x1ab [ 52.124920] [803afbfc] tcp_splice_data_recv+0x0/0x22 [ 52.130888] [803b0232] tcp_splice_read+0xa1/0x21b [ 52.136593] [803891cf] sock_def_readable+0x0/0x6f [ 52.142303] [80384a25] sock_splice_read+0x15/0x17 [ 52.148010] [8029e773] do_splice_to+0x76/0x88 [ 52.153370] [8029fc87] sys_splice+0x1a8/0x232 [ 52.158733] [802097ce] system_call+0x7e/0x83 [ 52.164005] [ 52.165544] [ 52.165545] Code: 0f 0b eb fe 44 39 65 d4 8b 4d d4 41 0f 47 cc 90 ff 42 08 48 [ 52.175364] RIP [80389b15] __skb_splice_bits+0xcd/0x201 [ 52.181636] RSP 810037f23c28 -- Evgeniy Polyakov - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC] network splice receive v2
On Tue, Jun 12 2007, Evgeniy Polyakov wrote: On Mon, Jun 11, 2007 at 01:59:26PM +0200, Jens Axboe ([EMAIL PROTECTED]) wrote: Patches are against the #splice branch of the block repo, official url of that is: git://git.kernel.dk/data/git/linux-2.6-block.git/ and it's based on Linus main tree. Let me know if I should supply netdev branch patches instead, or even just provide a rolled up patch (or patch series) for anyone curious to test or play with it. Hi Jens. I've just pulled your tree (splice-net, but splice tree looks the same, git pull says 'Already up-to-date.') on top of linus git and got following bug trace. I will investigate it further tomorrow. Please tell me the contents of splice-net, it looks like you didn't actually use the new code. That BUG_ON() is in get_page(), which splice-net no longer uses. So the bug report cannot be valid for the current code. -- Jens Axboe - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] [RFC -v3] NET: Implement a standard ndev_printk family
Jeff Garzik wrote: Joe Perches wrote: On Mon, 2007-06-11 at 17:40 -0700, Auke Kok wrote: +#define ndev_err(netdev, level, format, arg...) \ + do { \ + struct net_device *__nd = (netdev); \ + if ((__nd)->msg_enable & NETIF_MSG_##level) \ + printk(KERN_ERR "%s: %s: " format, (__nd)->name, \ + (__nd)->dev.parent->bus_id, ## arg); \ + } while (0) + I think it's better to remove the macro concatenation/obfuscation of the NETIF_MSG_##level argument and simply pass the appropriate NETIF_MSG_type directly to these ndev_level calls. It would also simplify the more than 300 calls in drivers/net of if (netif_msg_type(ptr)) printk(foo) to ndev_level(netdev, NETIF_MSG_type, fmt, args) I think this is a whole lot of iteration and effort for a non-problem. Why do you say that? What is your motivation for that statement? Can you be a bit more descriptive/constructive? I have often seen comments on drivers adding new printk's and lots of them completely ignore the msg_enable bits while advertising that they do through some debug/ethtool way. tg3, sky2, r8169, etc... all advertise that they allow setting/changing msg_enable yet don't actually do _anything_ with the bits. Only 3 other drivers besides the ones I've patched get it right. How is that a non-problem? - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] NET: Multiqueue network device support.
From: Patrick McHardy [EMAIL PROTECTED] Date: Tue, 12 Jun 2007 15:21:54 +0200 So how do we move forward? We're going to put hw multiqueue support in, all of this discussion has been pointless, I just watch this thread and basically laugh at the resistance to hw multiqueue support :-) - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] NET: Multiqueue network device support.
If hardware w/ multiple queues will have the capability for different MAC addresses, different RX filters, etc. does it make sense to add that below the net_device level? We will have to add all the configuration machinery at the per-queue level that already exists at the per-netdev level. Jeff - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: mac80211 fixes for 2.6.22
From: John W. Linville [EMAIL PROTECTED] Date: Mon, 11 Jun 2007 21:16:16 -0400 Here are a few mac80211 patches appropriate for 2.6.22. Individual patches to follow, or you can pull at your leisure... ... git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git mac80211-fixes Pulled, thanks John. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] NET: Multiqueue network device support.
Jeff Garzik wrote: If hardware w/ multiple queues will have the capability for different MAC addresses, different RX filters, etc. does it make sense to add that below the net_device level? We will have to add all the configuration machinery at the per-queue level that already exists at the per-netdev level. Perhaps the mac-vlan patch would be a good fit. Currently it is all software based, but if the hardware can filter on MAC, it can basically do mac-vlan acceleration. The mac-vlan devices are just like 'real' ethernet devices, so they can be used with whatever schemes work with regular devices. Thanks, Ben Jeff - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html -- Ben Greear [EMAIL PROTECTED] Candela Technologies Inc http://www.candelatech.com - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Please pull 'libertas-fixes' branch of wireless-2.6
Fixes identified by the libertas team as important for 2.6.22... --- The following changes since commit 717c9339202a42ae7bec7d3c4b84deecdcae9f81: Dan Williams (1): libertas: reduce SSID and BSSID mixed-case abuse are found in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git libertas-fixes Dan Williams (1): libertas: actually send mesh frames to mesh netdev Luis Carlos (1): libertas: convert libertas_mpp into anycast_mask Luis Carlos Cobo Rus (2): libertas: pull current channel from firmware on mesh autostart libertas: deauthenticate from AP in channel switch drivers/net/wireless/libertas/assoc.c | 13 + drivers/net/wireless/libertas/assoc.h |2 ++ drivers/net/wireless/libertas/cmdresp.c |1 + drivers/net/wireless/libertas/dev.h |1 + drivers/net/wireless/libertas/host.h|4 ++-- drivers/net/wireless/libertas/main.c| 27 ++- drivers/net/wireless/libertas/rx.c |5 ++--- 7 files changed, 35 insertions(+), 18 deletions(-) diff --git a/drivers/net/wireless/libertas/assoc.c b/drivers/net/wireless/libertas/assoc.c index ee82413..f67efa0 100644 --- a/drivers/net/wireless/libertas/assoc.c +++ b/drivers/net/wireless/libertas/assoc.c @@ -200,6 +200,14 @@ static int update_channel(wlan_private * priv) cmd_option_waitforrsp, 0, NULL); } +void libertas_sync_channel(struct work_struct *work) +{ + wlan_private *priv = container_of(work, wlan_private, sync_channel); + + if (update_channel(priv) != 0) + lbs_pr_info(Channel synchronization failed.); +} + static int assoc_helper_channel(wlan_private *priv, struct assoc_request * assoc_req) { @@ -403,6 +411,11 @@ static int should_deauth_infrastructure(wlan_adapter *adapter, return 1; } + if (test_bit(ASSOC_FLAG_CHANNEL, assoc_req-flags)) { + lbs_deb_assoc(Deauthenticating due to channel switch.\n); + return 1; + } + /* FIXME: deal with 'auto' mode somehow */ if (test_bit(ASSOC_FLAG_MODE, assoc_req-flags)) { if (assoc_req-mode != IW_MODE_INFRA) diff --git a/drivers/net/wireless/libertas/assoc.h 
b/drivers/net/wireless/libertas/assoc.h index b5eddf8..5e9c31f 100644 --- a/drivers/net/wireless/libertas/assoc.h +++ b/drivers/net/wireless/libertas/assoc.h @@ -9,6 +9,8 @@ void libertas_association_worker(struct work_struct *work); struct assoc_request * wlan_get_association_request(wlan_adapter *adapter); +void libertas_sync_channel(struct work_struct *work); + #define ASSOC_DELAY (HZ / 2) static inline void wlan_postpone_association_work(wlan_private *priv) { diff --git a/drivers/net/wireless/libertas/cmdresp.c b/drivers/net/wireless/libertas/cmdresp.c index ebedd63..0c3b9a5 100644 --- a/drivers/net/wireless/libertas/cmdresp.c +++ b/drivers/net/wireless/libertas/cmdresp.c @@ -987,6 +987,7 @@ int libertas_process_event(wlan_private * priv) netif_carrier_on(priv-mesh_dev) ; } adapter-mode = IW_MODE_ADHOC ; + schedule_work(priv-sync_channel); break; default: diff --git a/drivers/net/wireless/libertas/dev.h b/drivers/net/wireless/libertas/dev.h index d6c340a..785192b 100644 --- a/drivers/net/wireless/libertas/dev.h +++ b/drivers/net/wireless/libertas/dev.h @@ -150,6 +150,7 @@ struct _wlan_private { struct delayed_work assoc_work; struct workqueue_struct *assoc_thread; + struct work_struct sync_channel; /** Hardware access */ int (*hw_register_dev) (wlan_private * priv); diff --git a/drivers/net/wireless/libertas/host.h b/drivers/net/wireless/libertas/host.h index cedf1db..7509cc1 100644 --- a/drivers/net/wireless/libertas/host.h +++ b/drivers/net/wireless/libertas/host.h @@ -310,8 +310,8 @@ enum cmd_mesh_access_opts { cmd_act_mesh_get_ttl = 1, cmd_act_mesh_set_ttl, cmd_act_mesh_get_stats, - cmd_act_mesh_get_mpp, - cmd_act_mesh_set_mpp, + cmd_act_mesh_get_anycast, + cmd_act_mesh_set_anycast, }; /** Card Event definition */ diff --git a/drivers/net/wireless/libertas/main.c b/drivers/net/wireless/libertas/main.c index ec9be0c..623ab4b 100644 --- a/drivers/net/wireless/libertas/main.c +++ b/drivers/net/wireless/libertas/main.c @@ -178,45 +178,45 @@ u16 
libertas_region_code_to_index[MRVDRV_MAX_REGION_CODE] = */ /** - * @brief Get function for sysfs attribute libertas_mpp + * @brief Get function for sysfs attribute anycast_mask */ -static ssize_t libertas_mpp_get(struct device * dev, +static ssize_t libertas_anycast_get(struct device * dev, struct device_attribute *attr, char * buf) { struct cmd_ds_mesh_access mesh_access; memset(mesh_access, 0, sizeof(mesh_access));
Please pull 'libertas-upstream' branch of wireless-2.6
Patches identified by the libertas team as suitable for 2.6.23... --- The following changes since commit 82fde74b94f11eee1e9c30e43fb162f80a5e63c0: Luis Carlos (1): libertas: convert libertas_mpp into anycast_mask are found in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git libertas-upstream Dan Williams (23): libertas: actually remove version.h libertas: kill wlan_scan_process_results libertas: kill ieeetypes_capinfo bitfield, use ieee80211.h types libertas: rename WLAN_802_11_KEY to enc_key and clean up usage libertas: clean up indentation in libertas_association_worker libertas: clean up 802.11 IE post-scan handling libertas: rename private ioctl constants and clean up ioctl handling libertas: remove if_bootcmd.c libertas: fix mixed-case abuse in cmd_ds_802_11_scan libertas: fix mixed-case abuse in cmd_ds_802_11_ad_hoc_result libertas: fix mixed-case abuse in cmd_ds_802_11_ad_hoc_start libertas: re-uppercase command defines and other constants libertas: fix debug build breakage due to field rename libertas: remove thread.h and make kthread usage clearer libertas: new mesh control knobs libertas: bump version to 322.p1 libertas: wlan_ - libertas_ rename in ioctl.c libertas: fix more mixed-case abuse libertas: move generic firmware reset command to common code libertas: wlan_ - libertas_ function prefix renames for main.c libertas: simplify and clean up data rate handling libertas: fix MESH_[GET/SET]_BCASTR ioctl, clean up ioctl subcmd handling libertas: style fixes Luis Carlos Cobo (1): libertas: specific mesh scan for mshX interface drivers/net/wireless/libertas/11d.c| 22 +- drivers/net/wireless/libertas/Makefile |1 - drivers/net/wireless/libertas/README | 65 drivers/net/wireless/libertas/assoc.c | 85 +++--- drivers/net/wireless/libertas/cmd.c| 338 ++-- drivers/net/wireless/libertas/cmdresp.c| 172 +- drivers/net/wireless/libertas/debugfs.c| 130 drivers/net/wireless/libertas/decl.h |6 +- 
drivers/net/wireless/libertas/defs.h | 66 ++--- drivers/net/wireless/libertas/dev.h| 34 +-- drivers/net/wireless/libertas/ethtool.c|8 +- drivers/net/wireless/libertas/fw.c | 43 ++-- drivers/net/wireless/libertas/host.h | 438 +- drivers/net/wireless/libertas/hostcmd.h| 69 ++--- drivers/net/wireless/libertas/if_bootcmd.c | 40 --- drivers/net/wireless/libertas/if_usb.c | 58 ++-- drivers/net/wireless/libertas/if_usb.h |1 - drivers/net/wireless/libertas/ioctl.c | 478 +++- drivers/net/wireless/libertas/join.c | 368 +++--- drivers/net/wireless/libertas/join.h |2 + drivers/net/wireless/libertas/main.c | 237 +-- drivers/net/wireless/libertas/rx.c |9 +- drivers/net/wireless/libertas/scan.c | 355 - drivers/net/wireless/libertas/scan.h | 10 +- drivers/net/wireless/libertas/thread.h | 52 --- drivers/net/wireless/libertas/tx.c |2 +- drivers/net/wireless/libertas/types.h | 65 + drivers/net/wireless/libertas/version.h|1 - drivers/net/wireless/libertas/wext.c | 428 -- drivers/net/wireless/libertas/wext.h | 68 +++-- 30 files changed, 1684 insertions(+), 1967 deletions(-) delete mode 100644 drivers/net/wireless/libertas/if_bootcmd.c delete mode 100644 drivers/net/wireless/libertas/thread.h delete mode 100644 drivers/net/wireless/libertas/version.h Omnibus patch attached as libertas-upstream.diff.bz2 due to size concerns. -- John W. Linville [EMAIL PROTECTED] libertas-upstream.diff.bz2 Description: BZip2 compressed data
Re: [PATCH] NET: Multiqueue network device support.
From: Ben Greear [EMAIL PROTECTED] Date: Tue, 12 Jun 2007 14:17:44 -0700 Jeff Garzik wrote: If hardware w/ multiple queues will have the capability for different MAC addresses, different RX filters, etc. does it make sense to add that below the net_device level? We will have to add all the configuration machinery at the per-queue level that already exists at the per-netdev level. Perhaps the mac-vlan patch would be a good fit. Currently it is all software based, but if the hardware can filter on MAC, it can basically do mac-vlan acceleration. The mac-vlan devices are just like 'real' ethernet devices, so they can be used with whatever schemes work with regular devices. Interesting. But to answer Jeff's question, that's not really the model being used to implement multiple queues. The MAC is still very much centralized in most designs. So one way they'll do it is to support assigning N MAC addresses, and you configure the input filters of the chip to push packets for each MAC to the proper receive queue. So the MAC will accept any of those in the N MAC addresses as its own, then you use the filtering facilities to steer frames to the correct RX queue. The TX and RX queues can be so isolated as to be able to be exported to virtualization nodes. You can give them full access to the DMA queues and associated mailboxes. So instead of all of this bogus virtualized device overhead, you just give the guest access to the real device. So you can use multiple queues either for better single node SMP performance, or better virtualization performance. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: TCP_MD5 and Intel e1000
From: David Miller [EMAIL PROTECTED] Date: Tue, 22 May 2007 03:14:32 -0700 (PDT) From: YOSHIFUJI Hideaki / 吉藤英明 [EMAIL PROTECTED] Date: Tue, 22 May 2007 18:36:47 +0900 (JST) In article [EMAIL PROTECTED] (at Tue, 22 May 2007 10:57:38 +0200), Eric Dumazet [EMAIL PROTECTED] says: I have tried to set up quagga with tcp-md5 support from kernel. All seems ok with a intel e100 NIC, but as i tested with a intel e1000 NIC the tcp packets have an invalid md5 digest. If i run tcpdump on the machine the packets are generated, it shows on the outgoing interface invalid md5 digests. Are there known issues about tcp-md5 and e1000 NICs? : You could try ethtool -K tx off, and/or other ethtool -K settings Disabling offloading should help; currently tcp-md5 stack blindly copies md5-signature from the first segment which is not appropriate for rest of segments. It is clear we should disable TSO for sockets making use of TCP-MD5. I'm going to fix this as follows: commit 3d7dbeac58d0669c37e35a3b91bb41c0146395ce Author: David S. Miller [EMAIL PROTECTED] Date: Tue Jun 12 14:36:42 2007 -0700 [TCP]: Disable TSO if MD5SIG is enabled. Signed-off-by: David S. 
Miller [EMAIL PROTECTED] diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 97e294e..354721d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -878,6 +878,7 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr, kfree(newkey); return -ENOMEM; } + sk-sk_route_caps = ~NETIF_F_GSO_MASK; } if (tcp_alloc_md5sig_pool() == NULL) { kfree(newkey); @@ -1007,7 +1008,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, return -EINVAL; tp-md5sig_info = p; - + sk-sk_route_caps = ~NETIF_F_GSO_MASK; } newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4f06a51..193d9d6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -590,6 +590,7 @@ static int tcp_v6_md5_do_add(struct sock *sk, struct in6_addr *peer, kfree(newkey); return -ENOMEM; } + sk-sk_route_caps = ~NETIF_F_GSO_MASK; } tcp_alloc_md5sig_pool(); if (tp-md5sig_info-alloced6 == tp-md5sig_info-entries6) { @@ -724,6 +725,7 @@ static int tcp_v6_parse_md5_keys (struct sock *sk, char __user *optval, return -ENOMEM; tp-md5sig_info = p; + sk-sk_route_caps = ~NETIF_F_GSO_MASK; } newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] NET: Multiqueue network device support.
David Miller wrote: From: Ben Greear [EMAIL PROTECTED] Date: Tue, 12 Jun 2007 14:17:44 -0700 Jeff Garzik wrote: If hardware w/ multiple queues will the capability for different MAC addresses, different RX filters, etc. does it make sense to add that below the net_device level? We will have to add all the configuration machinery at the per-queue level that already exists at the per-netdev level. Perhaps the mac-vlan patch would be a good fit. Currently it is all software based, but if the hardware can filter on MAC, it can basically do mac-vlan acceleration. The mac-vlan devices are just like 'real' ethernet devices, so they can be used with whatever schemes work with regular devices. Interesting. But to answer Jeff's question, that's not really the model being used to implement multiple queues. The MAC is still very much centralized in most designs. So one way they'll do it is to support assigning N MAC addresses, and you configure the input filters of the chip to push packets for each MAC to the proper receive queue. So the MAC will accept any of those in the N MAC addresses as it's own, then you use the filtering facilities to steer frames to the correct RX queue. Not quite... You'll have to deal with multiple Rx filters, not just the current one-filter-for-all model present in today's NICs. Pools of queues will have separate configured characteristics. The steer portion you mention is a bottleneck that wants to be eliminated. Jeff - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] NET: Multiqueue network device support.
David Miller wrote: From: Ben Greear [EMAIL PROTECTED] Date: Tue, 12 Jun 2007 14:17:44 -0700 Jeff Garzik wrote: If hardware w/ multiple queues will the capability for different MAC addresses, different RX filters, etc. does it make sense to add that below the net_device level? We will have to add all the configuration machinery at the per-queue level that already exists at the per-netdev level. Perhaps the mac-vlan patch would be a good fit. Currently it is all software based, but if the hardware can filter on MAC, it can basically do mac-vlan acceleration. The mac-vlan devices are just like 'real' ethernet devices, so they can be used with whatever schemes work with regular devices. Interesting. But to answer Jeff's question, that's not really the model being used to implement multiple queues. The MAC is still very much centralized in most designs. So one way they'll do it is to support assigning N MAC addresses, and you configure the input filters of the chip to push packets for each MAC to the proper receive queue. So the MAC will accept any of those in the N MAC addresses as it's own, then you use the filtering facilities to steer frames to the correct RX queue. The TX and RX queues can be so isolated as to be able to be exported to virtualization nodes. You can give them full access to the DMA queues and assosciated mailboxes. So instead of all of this bogus virtualized device overhead, you just give the guest access to the real device. So you can use multiple queues either for better single node SMP performance, or better virtualization performance. That sounds plausible for many uses, but it may also be useful to have the virtual devices. Having 802.1Q VLANs be 'real' devices has worked out quite well, so I think there is a place for a 'mac-vlan' as well. With your description above, the 'correct RX queue' could be the only queue that the mac-vlan sees, so it would behave somewhat like a vanilla ethernet driver. 
When the mac-vlan transmits, it could transmit directly into its particular TX queue on the underlying device. In a non guest environment, I believe the mac-vlan will act somewhat like a more flexible form of an ip-alias. When name-spaces are implemented, the mac-vlan would very easily allow the different name-spaces to share the same physical hardware. The overhead should be minimal, and it's likely that using a 'real' network device will be a lot easier to maintain than trying to directly share separate queues on a single device that is somehow visible in multiple namespaces. And, since the mac-vlan can work as pure software on top of any NIC that can go promisc and send with arbitrary source MAC, it will already work with virtually all wired ethernet devices currently in existence. Thanks, Ben -- Ben Greear [EMAIL PROTECTED] Candela Technologies Inc http://www.candelatech.com - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] NET: Multiqueue network device support.
The MAC is still very much centralized in most designs. So one way they'll do it is to support assigning N MAC addresses, and you configure the input filters of the chip to push packets for each MAC to the proper receive queue. So the MAC will accept any of those in the N MAC addresses as it's own, then you use the filtering facilities to steer frames to the correct RX queue. Not quite... You'll have to deal with multiple Rx filters, not just the current one-filter-for-all model present in today's NICs. Pools of queues will have separate configured characteristics. The steer portion you mention is a bottleneck that wants to be eliminated. I think you're misunderstanding. These NICs still have only one physical port, so sending or receiving real packets onto a physical wire is fundamentally serialized. The steering of packets to receive queues is done right after the packets are received from the wire -- in fact it can be done as soon as the NIC has parsed enough of the headers to make a decision, which might be before the full packet has even been received. The steering is no more of a bottleneck than the physical link is. - R. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] NET: Multiqueue network device support.
From: Jeff Garzik [EMAIL PROTECTED] Date: Tue, 12 Jun 2007 17:46:20 -0400 Not quite... You'll have to deal with multiple Rx filters, not just the current one-filter-for-all model present in today's NICs. Pools of queues will have separate configured characteristics. The steer portion you mention is a bottleneck that wants to be eliminated. It runs in hardware at wire speed, what's the issue? :-) - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] NET: Multiqueue network device support.
From: Ben Greear [EMAIL PROTECTED] Date: Tue, 12 Jun 2007 14:46:50 -0700 And, since the mac-vlan can work as pure software on top of any NIC that can go promisc and send with arbitrary source MAC, it will already work with virtually all wired ethernet devices currently in existence. Absolutely, I'm not against something like mac-vlan at all. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Please pull 'libertas' branch of wireless-2.6 (resent w/o attachment)
Resending w/o the attached patch, in case it was too big...yikes! Individual patches are available here: http://www.kernel.org/pub/linux/kernel/people/linville/wireless-2.6/libertas John --- Jeff, This is the same as the previous pull request, only rebased on 2.6.22-rc4. Since this is a big pull already, I didn't want to complicate it with the additional patches identified by the libertas team as 2.6.22-worthy. John --- The following changes since commit 5ecd3100e695228ac5e0ce0e325e252c0f11806f: Linus Torvalds (1): Linux 2.6.22-rc4 are found in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git libertas Chris Ball (1): libertas: wakeup both mesh and normal wakeup when getting out of scan Dan Williams (25): libertas: call SET_NETDEV_DEV from common code libertas: replace 'macaddress' with 'bssid' libertas: correctly unregister mesh netdev on error libertas: don't tear down netdev in libertas_activate_card libertas: make scan result handling more flexible libertas: fix 'keep previous scan' behavior libertas: move channel changing into association framework libertas: make association paths consistent libertas: use MAC_FMT and MAC_ARG where appropriate libertas: use compare_ether_addr() rather than memcmp() where appropriate libertas: fix debug enter/leave prints for libertas_execute_next_command libertas: correctly balance locking in libertas_process_rx_command libertas: correct error report paths for wlan_fwt_list_ioctl libertas: fix deadlock SIOCGIWSCAN handler libertas: fix default adhoc channel libertas: honor specific channel requests during association libertas: send SIOCGIWSCAN event after partial scans too libertas: debug print spacing fixes in assoc.c libertas: add more verbose debugging to libertas_cmd_80211_authenticate libertas: Make WPA work through supplicant handshake libertas: sparse fixes libertas: tweak association debug output libertas: remove structure WLAN_802_11_SSID and libertas_escape_essid libertas: 
remove WPA_SUPPLICANT structure libertas: reduce SSID and BSSID mixed-case abuse David Woodhouse (6): libertas: fix character set in README libertas: first pass at fixing up endianness issues libertas: More endianness fixes. libertas: more endianness fixes, in tx.c this time libertas: don't byte-swap firmware version number. It's a byte array. libertas: fix big-endian associate command. Holger Schurig (23): libertas: rename wlan_association_worker libertas: a debug output was missing a newline libertas: fix removal of all debugfs files libertas: remove __FILE__ from debug output libertas: remove unused/superfluous definitions of DEV_NAME_LEN libertas: move vendor product id's into if_usb.c libertas: make libertas_wlan_data_rates static libertas: exclude non-used code when PROC_DEBUG is not set libertas: make debug configurable libertas: tune debug code libertas: single out mesh code libertas: change debug output of libertas_interrupt() libertas: get rid of libertas_sbi_get_priv() libertas: fix SSID output libertas: changed some occurences of kmalloc() + memset(a,0,sz) to kzalloc() libertas: move reset_device() code main.c to if_usb.c libertas: split wlan_add_card() libertas: indirect all hardware access via hw_ functions libertas: move contents of fw.h to decl.h libertas: split module into two (libertas.ko and usb8xxx.ko) libertas: fix RESET logic at unload time libertas: let DRV_NAME be overridable libertas: remove unused variables in wlan_dev_t Javier Cardona (2): libertas: fixed transmission flow control on the mesh interface libertas: added transmission failures to mesh statistics Luis Carlos Cobo (4): libertas: fixed incorrect assigment of fcs errors to frag errors libertas: add URB debug info libertas: fixed kernel oops on module/card removal libertas: updated mesh commands for 5.220.9.p11 Luis Carlos Cobo Rus (6): libertas: version bump (321p0) and cmds update for new fw (5.220.10.p0) libertas: cleanup of fwt_list_route processing libertas: updated readme 
file libertas: make mac address configuration work with mesh interface too libertas: split wext for eth and msh libertas: support for mesh autostart on firmware 5.220.11 Marcelo Tosatti (5): libertas: scan two channels per scan command libertas: remove deprecated pm_register and associated code libertas: fix scanning from associate path libertas: fix error handling of card initialization libertas: fix oops on rmmod drivers/net/wireless/Kconfig | 19 +- drivers/net/wireless/libertas/11d.c|
Re: [PATCH] NET: Multiqueue network device support.
From: Jason Lunz [EMAIL PROTECTED] Date: Tue, 12 Jun 2007 17:47:53 -0400 Are you aware of any hardware designs that allow other ways to map packets onto rx queues? I can think of several scenarios where it could be advantageous to map packets by IP 3- or 5-tuple to get cpu locality all the way up the stack on a flow-by-flow basis. But doing this would require some way to request this mapping from the hardware. These chips allow this too, Microsoft defined a standard for RX queue interrupt hashing by flow so everyone puts it, or something like it, in hardware. In the extreme case it would be cool if it were possible to push a bpf-like classifier down into the hardware to allow arbitrary kinds of flow distribution. Maybe not a full bpf, but many chips allow something close. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] NET: Multiqueue network device support.
Roland Dreier wrote: The MAC is still very much centralized in most designs. So one way they'll do it is to support assigning N MAC addresses, and you configure the input filters of the chip to push packets for each MAC to the proper receive queue. So the MAC will accept any of those in the N MAC addresses as its own, then you use the filtering facilities to steer frames to the correct RX queue. Not quite... You'll have to deal with multiple Rx filters, not just the current one-filter-for-all model present in today's NICs. Pools of queues will have separate configured characteristics. The steer portion you mention is a bottleneck that wants to be eliminated. I think you're misunderstanding. These NICs still have only one physical port, so sending or receiving real packets onto a physical wire is fundamentally serialized. The steering of packets to receive queues is done right after the packets are received from the wire -- in fact it can be done as soon as the NIC has parsed enough of the headers to make a decision, which might be before the full packet has even been received. The steering is no more of a bottleneck than the physical link is. No, you're misreading. People are putting in independent configurable Rx filters because a single Rx filter setup for all queues was a bottleneck. Not a performance bottleneck but a configuration and flexibility limitation that's being removed. And where shall we put the configuration machinery, to support sub-queues? Shall we duplicate the existing configuration code for sub-queues? What will ifconfig/ip usage look like? How will it differ from configuring full net_devices, if you are assigning the same types of parameters? Jeff - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] NET: Multiqueue network device support.
From: Roland Dreier [EMAIL PROTECTED] Date: Tue, 12 Jun 2007 14:52:11 -0700 I think you're misunderstanding. These NICs still have only one physical port, so sending or receiving real packets onto a physical wire is fundamentally serialized. The steering of packets to receive queues is done right after the packets are received from the wire -- in fact it can be done as soon as the NIC has parsed enough of the headers to make a decision, which might be before the full packet has even been received. The steering is no more of a bottleneck than the physical link is. Yep, that's right. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] NET: Multiqueue network device support.
From: Jeff Garzik [EMAIL PROTECTED] Date: Tue, 12 Jun 2007 17:59:43 -0400 And where shall we put the configuration machinery, to support sub-queues? Shall we duplicate the existing configuration code for sub-queues? What will ifconfig/ip usage look like? How will it differ from configuring full net_devices, if you are assigning the same types of parameters? If you're asking about the virtualization scenario, the control node (dom0 or whatever) is the only entity which can get at programming the filters and will set it up properly based upon which parts of the physical device are being exported to which guest nodes. For the non-virtualized case, it's a good question. But really the current hardware is just about simple queue steering, and simple static DRR/WRED fairness algorithms applied to the queues in hardware. We don't need to add support for configuring anything fancy from the start just to get something working. Especially the important bits such as the virtualization case and the interrupt and queue distribution case on SMP. The latter can even be configured automatically by the driver, and that's in fact what I expect drivers to do initially. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] NET: Multiqueue network device support.
On Tue, Jun 12, 2007 at 02:55:34PM -0700, David Miller wrote: These chips allow this too, Microsoft defined a standard for RX queue interrupt hashing by flow so everyone puts it, or something like it, in hardware. I think you're referring to RSS? http://www.microsoft.com/whdc/device/network/NDIS_RSS.mspx http://msdn2.microsoft.com/en-us/library/ms795609.aspx Jason - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] NET: Multiqueue network device support.
David Miller wrote: If you're asking about the virtualization scenerio, the control node (dom0 or whatever) is the only entity which can get at programming the filters and will set it up properly based upon which parts of the physical device are being exported to which guest nodes. You're avoiding the question. Clearly guest VMs must contact the host VM (dom0) to get real work done. They are ultimately going to have to pass the same configuration info as the non-virt case. For the non-virtualized case, it's a good question. ... But really the current hardware is just about simple queue steering, and simple static DRR/WRED fairness algorithms applied to the queues in hardware. We don't need to add support for configuring anything fancy from the start just to get something working. Correct. But if we don't plan for the future that's currently in the silicon pipeline, our ass will be in a sling WHEN we must figure out the best configuration points for sub-queues. Or are we prepared to rip out sub-queues for a non-experimental solution, when confronted with the obvious necessity of configuring them? You know I want multi-queue and increased parallelism it provides. A lot. But let's not dig ourselves into a hole we must climb out of in 6-12 months. We need to think about configuration issues -now-. Jeff - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] NET: Multiqueue network device support.
Ben Greear wrote: That sounds plausible for many uses, but it may also be useful to have the virtual devices. Having 802.1Q VLANs be 'real' devices has worked out quite well, so I think there is a place for a 'mac-vlan' as well. Virtual devices are pretty much the only solution we have right now, both in terms of available control points, and in terms of mapping to similar existing solutions (like wireless and its multiple net devices). Jeff - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] NET: Multiqueue network device support.
On Tue, Jun 12, 2007 at 02:26:58PM -0700, David Miller wrote: The MAC is still very much centralized in most designs. So one way they'll do it is to support assigning N MAC addresses, and you configure the input filters of the chip to push packets for each MAC to the proper receive queue. So the MAC will accept any of those in the N MAC addresses as it's own, then you use the filtering facilities to steer frames to the correct RX queue. The TX and RX queues can be so isolated as to be able to be exported to virtualization nodes. You can give them full access to the DMA queues and assosciated mailboxes. So instead of all of this bogus virtualized device overhead, you just give the guest access to the real device. So you can use multiple queues either for better single node SMP performance, or better virtualization performance. Are you aware of any hardware designs that allow other ways to map packets onto rx queues? I can think of several scenarios where it could be advantageous to map packets by IP 3- or 5-tuple to get cpu locality all the way up the stack on a flow-by-flow basis. But doing this would require some way to request this mapping from the hardware. In the extreme case it would be cool if it were possible to push a bpf-like classifier down into the hardware to allow arbitrary kinds of flow distribution. Jason - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] NET: Multiqueue network device support.
Jeff Garzik wrote: Ben Greear wrote: That sounds plausible for many uses, but it may also be useful to have the virtual devices. Having 802.1Q VLANs be 'real' devices has worked out quite well, so I think there is a place for a 'mac-vlan' as well. Virtual devices are pretty much the only solution we have right now, both in terms of available control points, and in terms of mapping to similar existing solutions (like wireless and its multiple net devices). I believe Patrick is working on cleaning up mac-vlans and converting them to use the new netlink configuration API, so there should be a patch for these hitting the list shortly. Thanks, Ben -- Ben Greear [EMAIL PROTECTED] Candela Technologies Inc http://www.candelatech.com - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Please pull 'libertas-fixes' branch of wireless-2.6
John W. Linville wrote: Fixes identified by the libertas team as important for 2.6.22... --- The following changes since commit 717c9339202a42ae7bec7d3c4b84deecdcae9f81: Dan Williams (1): libertas: reduce SSID and BSSID mixed-case abuse are found in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git libertas-fixes Dan Williams (1): libertas: actually send mesh frames to mesh netdev Luis Carlos (1): libertas: convert libertas_mpp into anycast_mask Luis Carlos Cobo Rus (2): libertas: pull current channel from firmware on mesh autostart libertas: deauthenticate from AP in channel switch Just to be clear, you intend 'libertas' and 'libertas-fixes' (in that order) for 2.6.22, and 'libertas-upstream' for 2.6.23? Jeff - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] NetXen: Add correct routines to setup multicast address
Mithlesh Thukral wrote: NetXen: Add multi cast filter code This patch adds multi cast filter code to NetXen NIC driver. It also adds capabilities to setup the multicast address in hardware from the host side. Signed-off by: Mithlesh Thukral [EMAIL PROTECTED] --- drivers/net/netxen/netxen_nic.h | 24 drivers/net/netxen/netxen_nic_hdr.h |3 drivers/net/netxen/netxen_nic_hw.c | 132 +- 3 files changed, 156 insertions(+), 3 deletions(-) Michael seems to keep finding endian bugs in this code... - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/15] spidernet driver bug fixes
Linas Vepstas wrote: On Fri, Jun 08, 2007 at 01:20:20PM -0400, Jeff Garzik wrote: On Fri, Jun 08, 2007 at 12:06:08PM -0500, Linas Vepstas wrote: On Fri, Jun 08, 2007 at 11:12:31AM +1000, Michael Ellerman wrote: On Thu, 2007-06-07 at 14:17 -0500, Linas Vepstas wrote: The major bug fixes are: I realise it's late, but shouldn't major bugfixes be going into 22 ? Yeah, I suppose, I admit I've lost track of the process. You need to order your bug fixes first in the queue. OK, here are the patches, re-ordered. There is a different number than last time, as I threw out one, merged one, and got cold feet on a third one. They still pass the tests. The first five patches focus on three serious bugs, fixing crashes or hangs. -- patch 1 -- kernel crash when ifdown while receiving packets. -- patch 2,3,4 -- device driver deadlocks on RX ram full mesgs. (kernel stays up, ifdown/up clear the problem). -- patch 5 -- misconfigured TX interrupts results in 3x-4x per degradation for small packets. -- patch 6 -- rx stats may be mangled -- patch 7 -- hw checksum sometimes breaks ipv6 operation -- patches 8-15 -- misc tweaks, and documentation. I re-ran my stress tests with patches 1-7 applied; they pass. This is a bit frustrating, because this includes many patches that you ALREADY told me to queue for 2.6.23, which I did, in netdev-2.6.git#upstream. Should I just drop all spidernet patches and start over? Jeff - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[2.6 patch] net/sunrpc/rpcb_clnt.c: make struct rpcb_program static
This patch makes the needlessly global struct rpcb_program static. Signed-off-by: Adrian Bunk [EMAIL PROTECTED] --- net/sunrpc/rpcb_clnt.c |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) --- linux-2.6.22-rc4-mm2/net/sunrpc/rpcb_clnt.c.old 2007-06-12 23:25:01.0 +0200 +++ linux-2.6.22-rc4-mm2/net/sunrpc/rpcb_clnt.c 2007-06-12 23:25:19.0 +0200 @@ -118,7 +118,7 @@ #define RPCB_MAXOWNERLEN sizeof(RPCB_OWNER_STRING) static voidrpcb_getport_done(struct rpc_task *, void *); -extern struct rpc_program rpcb_program; +static struct rpc_program rpcb_program; struct rpcbind_args { struct rpc_xprt * r_xprt; @@ -616,7 +616,7 @@ static struct rpc_stat rpcb_stats; -struct rpc_program rpcb_program = { +static struct rpc_program rpcb_program = { .name = rpcbind, .number = RPCBIND_PROGRAM, .nrvers = ARRAY_SIZE(rpcb_version), - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Resend: [PATCH] pktgen IPSEC 3/4: Introduce xfrm SAD only lookup
This takes into considerations Patricks feedback. cheers, jamal commit 4fe3190756589ef8155eb97fe725f2564f1fc77d Author: Jamal Hadi Salim [EMAIL PROTECTED] Date: Tue Jun 12 12:35:39 2007 -0400 [XFRM] Introduce standalone SAD lookup This allows other in-kernel functions to do SAD lookups. The only known user at the moment is pktgen. Signed-off-by: Jamal Hadi Salim [EMAIL PROTECTED] diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 311f25a..79d2c37 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -920,6 +920,10 @@ extern struct xfrm_state *xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t struct flowi *fl, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, unsigned short family); +extern struct xfrm_state * xfrm_stateonly_find(xfrm_address_t *daddr, + xfrm_address_t *saddr, + unsigned short family, + u8 mode, u8 proto, u32 reqid); extern int xfrm_state_check_expire(struct xfrm_state *x); extern void xfrm_state_insert(struct xfrm_state *x); extern int xfrm_state_add(struct xfrm_state *x); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 85f3f43..8d14cd4 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -686,6 +686,37 @@ out: return x; } +struct xfrm_state * +xfrm_stateonly_find(xfrm_address_t *daddr, xfrm_address_t *saddr, + unsigned short family, u8 mode, u8 proto, u32 reqid) +{ + unsigned int h = xfrm_dst_hash(daddr, saddr, reqid, family); + struct xfrm_state *rx = NULL, *x = NULL; + struct hlist_node *entry; + + spin_lock(xfrm_state_lock); + hlist_for_each_entry(x, entry, xfrm_state_bydst+h, bydst) { + if (x-props.family == family + x-props.reqid == reqid + !(x-props.flags XFRM_STATE_WILDRECV) + xfrm_state_addr_check(x, daddr, saddr, family) + mode == x-props.mode + proto == x-id.proto + x-km.state == XFRM_STATE_VALID) { + rx = x; + break; + } + } + + if (rx) + xfrm_state_hold(rx); + spin_unlock(xfrm_state_lock); + + + return rx; +} +EXPORT_SYMBOL(xfrm_stateonly_find); + static void 
__xfrm_state_insert(struct xfrm_state *x) { unsigned int h;
Resend: [PATCH] pktgen IPSEC 4/4: Add IPSEC support to pktgen
Sorry Robert, I found a problem compiling when i turned off XFRM. This fixes it. cheers, jamal commit bfd389bba7654aa118f0949ff0de45a3bce9700c Author: Jamal Hadi Salim [EMAIL PROTECTED] Date: Tue Jun 12 18:59:33 2007 -0400 [PKTGEN] IPSEC support Added transport mode ESP support for starters. I will send more of these modes and types once i have resolved the tunnel mode isses. Signed-off-by: Jamal Hadi Salim [EMAIL PROTECTED] diff --git a/net/core/pktgen.c b/net/core/pktgen.c index bc4fb3b..e7d1dff 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -152,6 +152,9 @@ #include net/checksum.h #include net/ipv6.h #include net/addrconf.h +#ifdef CONFIG_XFRM +#include net/xfrm.h +#endif #include asm/byteorder.h #include linux/rcupdate.h #include asm/bitops.h @@ -182,6 +185,7 @@ #define F_VID_RND (19) /* Random VLAN ID */ #define F_SVID_RND(110) /* Random SVLAN ID */ #define F_FLOW_SEQ(111) /* Sequential flows */ +#define F_IPSEC_ON(112) /* ipsec on for flows */ /* Thread control flag bits */ #define T_TERMINATE (10) @@ -208,6 +212,9 @@ static struct proc_dir_entry *pg_proc_dir = NULL; struct flow_state { __be32 cur_daddr; int count; +#ifdef CONFIG_XFRM + struct xfrm_state *x; +#endif __u32 flags; }; @@ -348,7 +355,10 @@ struct pktgen_dev { unsigned lflow; /* Flow length (config) */ unsigned nflows;/* accumulated flows (stats) */ unsigned curfl; /* current sequenced flow (state)*/ - +#ifdef CONFIG_XFRM + __u8ipsmode;/* IPSEC mode (config) */ + __u8ipsproto; /* IPSEC type (config) */ +#endif char result[512]; }; @@ -704,6 +714,11 @@ static int pktgen_if_show(struct seq_file *seq, void *v) seq_printf(seq, FLOW_RND ); } +#ifdef CONFIG_XFRM + if (pkt_dev-flags F_IPSEC_ON) + seq_printf(seq, IPSEC ); +#endif + if (pkt_dev-flags F_MACSRC_RND) seq_printf(seq, MACSRC_RND ); @@ -1198,6 +1213,11 @@ static ssize_t pktgen_if_write(struct file *file, else if (strcmp(f, FLOW_SEQ) == 0) pkt_dev-flags |= F_FLOW_SEQ; +#ifdef CONFIG_XFRM + else if (strcmp(f, IPSEC) == 0) + 
pkt_dev-flags |= F_IPSEC_ON; +#endif + else if (strcmp(f, !IPV6) == 0) pkt_dev-flags = ~F_IPV6; @@ -1206,7 +1226,7 @@ static ssize_t pktgen_if_write(struct file *file, Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s, f, IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, - MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ\n); + MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, IPSEC\n); return count; } sprintf(pg_result, OK: flags=0x%x, pkt_dev-flags); @@ -2094,6 +2114,7 @@ static void spin(struct pktgen_dev *pkt_dev, __u64 spin_until_us) static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) { + pkt_dev-pkt_overhead = 0; pkt_dev-pkt_overhead += pkt_dev-nr_labels*sizeof(u32); pkt_dev-pkt_overhead += VLAN_TAG_SIZE(pkt_dev); pkt_dev-pkt_overhead += SVLAN_TAG_SIZE(pkt_dev); @@ -2130,6 +2151,31 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) return pkt_dev-curfl; } + +#ifdef CONFIG_XFRM +/* If there was already an IPSEC SA, we keep it as is, else + * we go look for it ... +*/ +inline +void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) +{ + struct xfrm_state *x = pkt_dev-flows[flow].x; + if (!x) { + /*slow path: we dont already have xfrm_state*/ + x = xfrm_stateonly_find((xfrm_address_t *)pkt_dev-cur_daddr, + (xfrm_address_t *)pkt_dev-cur_saddr, + AF_INET, + pkt_dev-ipsmode, + pkt_dev-ipsproto, 0); + if (x) { + pkt_dev-flows[flow].x = x; + set_pkt_overhead(pkt_dev); + pkt_dev-pkt_overhead+=x-props.header_len; + } + + } +} +#endif /* Increment/randomize headers according to flags and current values * for IP src/dest, UDP src/dst port, MAC-Addr src/dst */ @@ -2289,6 +2335,10 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev-flows[flow].flags |= F_INIT; pkt_dev-flows[flow].cur_daddr = pkt_dev-cur_daddr; +#ifdef CONFIG_XFRM + if (pkt_dev-flags F_IPSEC_ON) +
Re: [PATCH net-2.6 1/1] [TCP]: Fix left_out setting during FRTO
From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Tue, 12 Jun 2007 11:50:29 +0300 (EEST) Without FRTO, the tcp_try_to_open is never called with lost_out 0 (see tcp_time_to_recover). However, when FRTO is enabled, the !tp-lost condition is not used until end of FRTO because that way TCP avoids premature entry to fast recovery during FRTO. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] Thanks for catching this, patch applied. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [2.6 patch] net/sunrpc/rpcb_clnt.c: make struct rpcb_program static
Adrian Bunk wrote: This patch makes the needlessly global struct rpcb_program static. Signed-off-by: Adrian Bunk [EMAIL PROTECTED] Acked-by: Chuck Lever [EMAIL PROTECTED] --- net/sunrpc/rpcb_clnt.c |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) --- linux-2.6.22-rc4-mm2/net/sunrpc/rpcb_clnt.c.old 2007-06-12 23:25:01.0 +0200 +++ linux-2.6.22-rc4-mm2/net/sunrpc/rpcb_clnt.c 2007-06-12 23:25:19.0 +0200 @@ -118,7 +118,7 @@ #define RPCB_MAXOWNERLEN sizeof(RPCB_OWNER_STRING) static void rpcb_getport_done(struct rpc_task *, void *); -extern struct rpc_program rpcb_program; +static struct rpc_program rpcb_program; struct rpcbind_args { struct rpc_xprt * r_xprt; @@ -616,7 +616,7 @@ static struct rpc_stat rpcb_stats; -struct rpc_program rpcb_program = { +static struct rpc_program rpcb_program = { .name = rpcbind, .number = RPCBIND_PROGRAM, .nrvers = ARRAY_SIZE(rpcb_version), begin:vcard fn:Chuck Lever n:Lever;Chuck org:Oracle Corporation;Corporate Architecture: Linux Projects Group adr:;;1015 Granger Avenue;Ann Arbor;MI;48104;USA title:Principal Member of Staff tel;work:+1 248 614 5091 x-mozilla-html:FALSE url:http://oss.oracle.com/~cel/ version:2.1 end:vcard
Re: [PATCH 0/15] spidernet driver bug fixes
On Tue, Jun 12, 2007 at 07:00:17PM -0400, Jeff Garzik wrote: Linas Vepstas wrote: On Fri, Jun 08, 2007 at 01:20:20PM -0400, Jeff Garzik wrote: On Fri, Jun 08, 2007 at 12:06:08PM -0500, Linas Vepstas wrote: On Fri, Jun 08, 2007 at 11:12:31AM +1000, Michael Ellerman wrote: On Thu, 2007-06-07 at 14:17 -0500, Linas Vepstas wrote: The major bug fixes are: I realise it's late, but shouldn't major bugfixes be going into 22 ? Yeah, I suppose, I admit I've lost track of the process. You need to order your bug fixes first in the queue. OK, here are the patches, re-ordered. There is a different number than last time, as I threw out one, merged one, and got cold feet on a third one. They still pass the tests. The first five patches focus on three serious bugs, fixing crashes or hangs. -- patch 1 -- kernel crash when ifdown while receiving packets. -- patch 2,3,4 -- device driver deadlocks on RX ram full mesgs. (kernel stays up, ifdown/up clear the problem). -- patch 5 -- misconfigured TX interrupts results in 3x-4x per degradation for small packets. -- patch 6 -- rx stats may be mangled -- patch 7 -- hw checksum sometimes breaks ipv6 operation -- patches 8-15 -- misc tweaks, and documentation. I re-ran my stress tests with patches 1-7 applied; they pass. This is a bit frustrating, because this includes many patches that you ALREADY told me to queue for 2.6.23, which I did, in netdev-2.6.git#upstream. Sigh. I redid the series so as to avoid this problem, per the previous conversation. Should I just drop all spidernet patches and start over? No. Apply the series I just sent you, dropping the one called patch 6/15, the one from Florin Malita, as it appears you'd previously picked this up. The rest of the patches should apply cleanly; I just checked. I just did a git pull of git://git.kernel.org/pub/scm/linux/kernel/git/jgarzik/netdev-2.6 and checked. The result of patching is exactly as it should be. Just in case it wasn't clear, I'd like to see patches 1-5 go into 2.6.22 ... 
as these address the most critical complaints I'd gotten recently. --linas - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH] NET: Multiqueue network device support.
Hi Guy, On Tue, 2007-12-06 at 17:04 +0300, Cohen, Guy wrote: Hi Jamal, Here is a simple scenario (nothing here is rare or extreme case): - Busy wireless environment - FTP TX on BE queue (low priority) - Skype TX on VO queue (high priority) The channel is busy with high priority packets hence the BE packets are transmitted to the air rarely so the DMA/HW queue of the BE access category gets full and the qdisc is stopped. Now periodic VO-tagged Skype packets arrive. I would expect that they get the priority (and pass) in all stages of the stack and reach the HW ASAP and compete there on the medium with the other access categories and the other clients on the channel. Now this packet will be stuck in the qdisc and wait there until a BE packet is transmitted, which can take a long time. This is a real problem. Understood. My take is that this is resolvable by understanding the nature of the beast. IOW, the strategy of when to open up on such a medium is not conventional as one of a wired netdev. You can use signalling from the media such as an AP giving you signals for different ACs to open up; example: if the AC_BE is not being allowed out and it is just rotting because the AP is favoring VO, then you need to occasionally open up the tx path for the driver etc. There is also a problem with the queues that will be dedicated to TX aggregation in 11n (currently implemented) - the packets will be classified to queues by the destination MAC address and not only by the priority class, but I don't want to get into that now. We have an infrastructure at the qdisc level for selecting queues based on literally anything you can think of in a packet as well as metadata. So I think this aspect should be fine. I think that there are enough arguments now why the patch that started this thread is needed... 
Sorry Guy, I don't see it that way - unfortunately I don't think anybody else other than Patrick understood what I said and this thread is going on for too long I doubt 99% of the people are following any more ;- In most scenarios BK packets will be transmitted and will win the medium against VO packets (though, in some non-favored ratio). So if I understand you correctly: over a period of time, yes BK will make it out but under contention it will lose; is that always? Is there some mathematics behind this stuff? Sorry, I'm really overloaded - I won't be able to review the docs you sent (really apologize for that). No problem. I totally understand. The WMM parameters of the AC are set and controlled by the network/BSS (access point) administrator and can be used in any way. There are the default parameters but they can be changed. It would certainly lead to unexpected behavior if you start favoring BE over VO, no? Would that ever happen by adjusting the WMM parameters? cheers, jamal - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[ANNOUNCE] new driver ixgbe for Intel(R) 10GbE PCI Express adapters.
All, We are pleased to release the new driver ixgbe for Intel(R) 82598 based 10GbE PCI Express adapters. The 82598 silicon and the adapters will be released soon. Please find the full driver as a patch to latest linus-2.6 tree here: git-pull git://lost.foo-projects.org/~aveerani/git/linux-2.6 ixgbe Also, I am posting the driver patch in the followup mail. Short introduction on the ixgbe driver and 82598 silicon: The 82598 (PCI Express) silicon's architecture and SW interface is vastly different from legacy 82597 (PCI-X device). The register offsets and the bit definitions are very different from 82597. The EEPROM/FLASH, SERDES interface for external PHY and other interfaces are also different. 82598 has new Tx and Rx descriptor interfaces (advanced descriptors) to support packet/header split Rx feature and Rx packet steering (based on 5 tuples or MAC addresses). It supports a list of new features like MSI-X, Multiple Rx and Tx queues, TSO for IPv6. Because of all these differences, we had to write a new driver for 82598 and the new driver is a lot cleaner with no 82597 errata workarounds in the hot path. This driver has been tested extensively for the last couple of months in our labs. Please review and provide comments. Apart from implementing the community feedback, here is the list of things in the TODO list... 1. Add suspend/resume support. 2. Rewrite the driver handling of LLTX logic. Will post a patch very soon for review. 3. Add PCI error handler support. thanks, Ayyappan - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH][NET_SCHED] Make HTB scheduler work with TSO.
Currently the HTB scheduler does not correctly account for TSO packets which causes large inaccuracies in the bandwidth control when using TSO. This patch allows the HTB scheduler to work with TSO enabled devices. Signed-off-by: Ranjit Manomohan [EMAIL PROTECTED] diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 035788c..e872724 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -153,15 +153,12 @@ #endif /* of un.leaf originals should be done. */ }; -/* TODO: maybe compute rate when size is too large .. or drop ? */ static inline long L2T(struct htb_class *cl, struct qdisc_rate_table *rate, int size) { int slot = size rate-rate.cell_log; - if (slot 255) { - cl-xstats.giants++; - slot = 255; - } + if (slot 255) + return (rate-data[255]*(slot 8) + rate-data[slot 0xFF]); return rate-data[slot]; } @@ -634,13 +631,14 @@ #endif cl-qstats.drops++; return NET_XMIT_DROP; } else { - cl-bstats.packets++; + cl-bstats.packets += + skb_is_gso(skb)?skb_shinfo(skb)-gso_segs:1; cl-bstats.bytes += skb-len; htb_activate(q, cl); } sch-q.qlen++; - sch-bstats.packets++; + sch-bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)-gso_segs:1; sch-bstats.bytes += skb-len; return NET_XMIT_SUCCESS; } @@ -717,8 +715,9 @@ #endif * In such case we remove class from event queue first. */ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl, -int level, int bytes) +int level, struct sk_buff *skb) { + int bytes = skb-len; long toks, diff; enum htb_cmode old_mode; @@ -753,13 +752,15 @@ #define HTB_ACCNT(T,B,R) toks = diff + c #ifdef HTB_RATECM /* update rate counters */ cl-sum_bytes += bytes; - cl-sum_packets++; + cl-sum_packets += skb_is_gso(skb)? + skb_shinfo(skb)-gso_segs:1; #endif /* update byte stats except for leaves which are already updated */ if (cl-level) { cl-bstats.bytes += bytes; - cl-bstats.packets++; + cl-bstats.packets += skb_is_gso(skb)? 
+ skb_shinfo(skb)-gso_segs:1; } cl = cl-parent; } @@ -943,7 +944,7 @@ next: gives us slightly better performance */ if (!cl-un.leaf.q-q.qlen) htb_deactivate(q, cl); - htb_charge_class(q, cl, level, skb-len); + htb_charge_class(q, cl, level, skb); } return skb; } - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/15] spidernet driver bug fixes
Linas Vepstas wrote: On Tue, Jun 12, 2007 at 07:00:17PM -0400, Jeff Garzik wrote: Linas Vepstas wrote: On Fri, Jun 08, 2007 at 01:20:20PM -0400, Jeff Garzik wrote: On Fri, Jun 08, 2007 at 12:06:08PM -0500, Linas Vepstas wrote: On Fri, Jun 08, 2007 at 11:12:31AM +1000, Michael Ellerman wrote: On Thu, 2007-06-07 at 14:17 -0500, Linas Vepstas wrote: The major bug fixes are: I realise it's late, but shouldn't major bugfixes be going into 22 ? Yeah, I suppose, I admit I've lost track of the process. You need to order your bug fixes first in the queue. OK, here are the patches, re-ordered. There is a different number than last time, as I threw out one, merged one, and got cold feet on a third one. They still pass the tests. The first five patches focus on three serious bugs, fixing crashes or hangs. -- patch 1 -- kernel crash when ifdown while receiving packets. -- patch 2,3,4 -- device driver deadlocks on RX ram full mesgs. (kernel stays up, ifdown/up clear the problem). -- patch 5 -- misconfigured TX interrupts results in 3x-4x per degradation for small packets. -- patch 6 -- rx stats may be mangled -- patch 7 -- hw checksum sometimes breaks ipv6 operation -- patches 8-15 -- misc tweaks, and documentation. I re-ran my stress tests with patches 1-7 applied; they pass. This is a bit frustrating, because this includes many patches that you ALREADY told me to queue for 2.6.23, which I did, in netdev-2.6.git#upstream. Sigh. I redid the series so as to avoid this problem, per the previous conversation. Should I just drop all spidernet patches and start over? No. Apply the series I just sent you, dropping the one called patch 6/15, the one from Florin Malita, as it appears you'd previously picked this up. The rest of the patches should apply cleanly; I just cheked. I just did a git pull of git://git.kernel.org/pub/scm/linux/kernel/git/jgarzik/netdev-2.6 and checked. The result of patching is exactly as it should be. 
Just in case it wasn't clear, I'd like to see patches 1-5 go into 2.6.22 ... as these address the most critical complaints I'd gotten recently. --linas As I just stated, many of the patches in the current patch series have already been applied to netdev-2.6.git#upstream: Linas Vepstas (11): s2io: add PCI error recovery support s2io: add PCI error recovery support spidernet: beautify error messages spidernet: move a block of code around spidernet: zero out a pointer. spidernet: null out skb pointer after its been used. spidernet: Don't terminate the RX ring spidernet: enhance the dump routine spidernet: reset the card when an rxramfull is seen spidernet: service TX later. spidernet: increase the NAPI weight These are clearly duplicating some of the patches in your patchseries, which means you are woefully out of sync with upstream. Jeff - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.20.7 TCP cubic (and bic) initial slow start way too slow?
On Tue, 12 Jun 2007 15:12:58 -0700 (PDT) David Miller [EMAIL PROTECTED] wrote: From: Bill Fink [EMAIL PROTECTED] Date: Wed, 16 May 2007 02:44:09 -0400 [EMAIL PROTECTED] ~]# netstat -s | grep -i retrans 25446 segments retransmited 20936 fast retransmits 4503 retransmits in slow start 4 sack retransmits failed It then only took 2.14 seconds to transfer 1 GB of data. That's all for now. Thanks for all of your testing and numbers Bill. Inhong et al., we have to do something about this, the issue has been known and sitting around for weeks if not months. How safely can we set the default initial_ssthresh to zero in Cubic and BIC? Yes. set it to zero. The module parameter could even go, and just leave the route metric as a way to set/remember it. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Please pull 'libertas-fixes' branch of wireless-2.6
On Tue, Jun 12, 2007 at 06:54:35PM -0400, Jeff Garzik wrote: John W. Linville wrote: Fixes identified by the libertas team as important for 2.6.22... --- The following changes since commit 717c9339202a42ae7bec7d3c4b84deecdcae9f81: Dan Williams (1): libertas: reduce SSID and BSSID mixed-case abuse are found in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git libertas-fixes Dan Williams (1): libertas: actually send mesh frames to mesh netdev Luis Carlos (1): libertas: convert libertas_mpp into anycast_mask Luis Carlos Cobo Rus (2): libertas: pull current channel from firmware on mesh autostart libertas: deauthenticate from AP in channel switch Just to be clear, you intend 'libertas' and 'libertas-fixes' (in that order) for 2.6.22, and 'libertas-upstream' for 2.6.23? Yes, correct. Thanks, John -- John W. Linville [EMAIL PROTECTED] - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
r8169 tx problem (1s pause with ping)
Hello folks, I'm seeing something odd with r8169 on FC7: doing a ping -s 1600 alternates between a 1s latency and sub 1ms. Has anyone else seen anything like this? The system in question is an Asus M2A-VM with an onboard RTL8111 (I think). NAPI doesn't seem to make a difference. The kernel in question is currently a vanilla 2.6.21.5. Sub-mtu sized packets behave normally. 02:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd. RTL8111/8168B PCI Express Gigabit Ethernet controller (rev 01) PING 1.2.3.4 (1.2.3.4) 1600(1628) bytes of data. 1608 bytes from 1.2.3.4: icmp_seq=1 ttl=64 time=1000 ms 1608 bytes from 1.2.3.4: icmp_seq=2 ttl=64 time=0.816 ms 1608 bytes from 1.2.3.4: icmp_seq=3 ttl=64 time=1000 ms 1608 bytes from 1.2.3.4: icmp_seq=4 ttl=64 time=0.661 ms -ben -- Time is of no importance, Mr. President, only life is important. Don't Email: [EMAIL PROTECTED]. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/15] spidernet driver bug fixes
On Tue, 2007-06-12 at 19:00 -0400, Jeff Garzik wrote: Linas Vepstas wrote: On Fri, Jun 08, 2007 at 01:20:20PM -0400, Jeff Garzik wrote: On Fri, Jun 08, 2007 at 12:06:08PM -0500, Linas Vepstas wrote: On Fri, Jun 08, 2007 at 11:12:31AM +1000, Michael Ellerman wrote: On Thu, 2007-06-07 at 14:17 -0500, Linas Vepstas wrote: The major bug fixes are: I realise it's late, but shouldn't major bugfixes be going into 22 ? Yeah, I suppose, I admit I've lost track of the process. You need to order your bug fixes first in the queue. OK, here are the patches, re-ordered. There is a different number than last time, as I threw out one, merged one, and got cold feet on a third one. They still pass the tests. The first five patches focus on three serious bugs, fixing crashes or hangs. -- patch 1 -- kernel crash when ifdown while receiving packets. -- patch 2,3,4 -- device driver deadlocks on RX ram full mesgs. (kernel stays up, ifdown/up clear the problem). -- patch 5 -- misconfigured TX interrupts results in 3x-4x per degradation for small packets. -- patch 6 -- rx stats may be mangled -- patch 7 -- hw checksum sometimes breaks ipv6 operation -- patches 8-15 -- misc tweaks, and documentation. I re-ran my stress tests with patches 1-7 applied; they pass. This is a bit frustrating, because this includes many patches that you ALREADY told me to queue for 2.6.23, which I did, in netdev-2.6.git#upstream. Linas posted the patches, I responded querying whether the bug fixes should go into 2.6.22, and then you told him you need to order your bug fixes first in the queue. Which seemed pretty clear to me that you'd wait for the reordered series. cheers -- Michael Ellerman OzLabs, IBM Australia Development Lab wwweb: http://michael.ellerman.id.au phone: +61 2 6212 1183 (tie line 70 21183) We do not inherit the earth from our ancestors, we borrow it from our children. - S.M.A.R.T Person signature.asc Description: This is a digitally signed message part
Re: [PATCH 0/15] spidernet driver bug fixes
Michael Ellerman wrote: Linas posted the patches, I responded querying whether the bug fixes should go into 2.6.22, and then you told him you need to order your bug fixes first in the queue. Which seemed pretty clear to me that you'd wait for the reordered series. This was presuming Linas actually knew what he himself had submitted previously, and had been accepted... I explicitly emailed Linas on May 24, 2007 detailing each patch that had been applied, and to which netdev-2.6.git branch it had been applied (and thus whether it was queued for 2.6.22 or 2.6.23). Relevant Message-id is [EMAIL PROTECTED], and was sent not only to Linas but also to netdev@vger.kernel.org, [EMAIL PROTECTED], and [EMAIL PROTECTED] These changes were subsequently made public immediately via git://git.kernel.org/.../jgarzik/netdev-2.6.git branches 'upstream-fixes' and 'upstream', and were followed a few days later by akpm's public tree, starting with 2.6.22-rc3-mm1 (and all subsequent releases). All of the above seemed pretty clear, too. To move forward, it sounds like the best thing to do is drop all spidernet patches and start over, yes? Jeff - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
arp-scan triggers via-velocity eth0: excessive work at interrupt
It kind of surprised me that sending 254 arp packets by using the arp-scan tool (http://www.nta-monitor.com/tools/arp-scan/) on a /24 consistently triggers a burst of eth0: excessive work at interrupt. This is a 600 MHz PIII, 2.6.22-rc4, via-velocity driver. model name : Pentium III (Katmai) stepping: 3 cpu MHz : 601.406 cache size : 512 KB 00:09.0 Ethernet controller [0200]: VIA Technologies, Inc. VT6120/VT6121/VT6122 Gigabit Ethernet Adapter [1106:3119] (rev 11) Just double-checking... the program actually sent 463 packets (256 + a retry to all those that didn't respond to the first one), and triggers 11 copies of the kernel message. Command line: arp-scan -I eth0 -l [-v] - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: IC Plus Corp IC Plus IP1000
[EMAIL PROTECTED] wrote: I wonder if it at some time will be included in the standard Linux kernel? I am of course interested because my main board has it built in, so I would be willing to test it. Me, too! This has been discussed sporadically for the last year, and I can confirm that the driver source from the manufacturer's web page is starting to suffer bit rot, but after patching the more egregious breakage (references to linux/config.h, UTS_RELEASE and pci_module_init() stop it from compiling), it works. It doesn't even spew eth0: excessive work at interrupt when running arp-scan, unlike certain in-tree drivers. :-) I got a bit of a rude shock today after doing an emergency replacement on a socket 939 motherboard and blandly assuring a Windows-experienced co-worker that despite a change from nForce to VIA KT890 chipset, the system should just work. One round of floppy shuffle and code-fixing later, my co-worker is not impressed by the Linux version of Have driver disk. :-) Is anyone able to push it to completion? I have a vague idea that the vendor lost interest. (Should I write to Greg K-H and tell him Free Linux Driver Developed!?) I can play testing guinea-pig if needed. Thanks! - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: IC Plus Corp IC Plus IP1000
Peter Rasmussen wrote: I am not on this list, but found this address on: http://linux-net.osdl.org/index.php/Mailing_Lists. My question is regarding the ethernet controller (from lspci): Sundance Technology Inc / IC Plus Corp IC Plus IP1000 Family Gigabit Ethernet (rev 41) that seems to have a driver for it published on: http://www.icplus.com.tw/driver-pp-IP1000A.html Unfortunately I am not able to build it as described. I wonder if it at some time will be included in the standard Linux kernel? I am of course interested because my main board has it built in, so I would be willing to test it. Use the 'sundance' driver that's been in the kernel for quite a while. Jeff - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Please pull 'libertas-fixes' branch of wireless-2.6
John W. Linville wrote: Fixes identified by the libertas team as important for 2.6.22... --- The following changes since commit 717c9339202a42ae7bec7d3c4b84deecdcae9f81: Dan Williams (1): libertas: reduce SSID and BSSID mixed-case abuse are found in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git libertas-fixes Dan Williams (1): libertas: actually send mesh frames to mesh netdev Luis Carlos (1): libertas: convert libertas_mpp into anycast_mask Luis Carlos Cobo Rus (2): libertas: pull current channel from firmware on mesh autostart libertas: deauthenticate from AP in channel switch drivers/net/wireless/libertas/assoc.c | 13 + drivers/net/wireless/libertas/assoc.h |2 ++ drivers/net/wireless/libertas/cmdresp.c |1 + drivers/net/wireless/libertas/dev.h |1 + drivers/net/wireless/libertas/host.h|4 ++-- drivers/net/wireless/libertas/main.c| 27 ++- drivers/net/wireless/libertas/rx.c |5 ++--- 7 files changed, 35 insertions(+), 18 deletions(-) pulled into #upstream-fixes - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Please pull 'libertas' branch of wireless-2.6 (resent w/o attachment)
John W. Linville wrote: Resending w/o the attached patch, in case it was too big...yikes! Individual patches are available here: http://www.kernel.org/pub/linux/kernel/people/linville/wireless-2.6/libertas John --- Jeff, This is the same as the previous pull request, only rebased on 2.6.22-rc4. Since this is a big pull already, I didn't want to complicate it with the additional patches identified by the libertas team as 2.6.22-worthy. John --- The following changes since commit 5ecd3100e695228ac5e0ce0e325e252c0f11806f: Linus Torvalds (1): Linux 2.6.22-rc4 are found in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git libertas Chris Ball (1): libertas: wakeup both mesh and normal wakeup when getting out of scan Dan Williams (25): libertas: call SET_NETDEV_DEV from common code libertas: replace 'macaddress' with 'bssid' libertas: correctly unregister mesh netdev on error libertas: don't tear down netdev in libertas_activate_card libertas: make scan result handling more flexible libertas: fix 'keep previous scan' behavior libertas: move channel changing into association framework libertas: make association paths consistent libertas: use MAC_FMT and MAC_ARG where appropriate libertas: use compare_ether_addr() rather than memcmp() where appropriate libertas: fix debug enter/leave prints for libertas_execute_next_command libertas: correctly balance locking in libertas_process_rx_command libertas: correct error report paths for wlan_fwt_list_ioctl libertas: fix deadlock SIOCGIWSCAN handler libertas: fix default adhoc channel libertas: honor specific channel requests during association libertas: send SIOCGIWSCAN event after partial scans too libertas: debug print spacing fixes in assoc.c libertas: add more verbose debugging to libertas_cmd_80211_authenticate libertas: Make WPA work through supplicant handshake libertas: sparse fixes libertas: tweak association debug output libertas: remove structure WLAN_802_11_SSID and 
libertas_escape_essid libertas: remove WPA_SUPPLICANT structure libertas: reduce SSID and BSSID mixed-case abuse David Woodhouse (6): libertas: fix character set in README libertas: first pass at fixing up endianness issues libertas: More endianness fixes. libertas: more endianness fixes, in tx.c this time libertas: don't byte-swap firmware version number. It's a byte array. libertas: fix big-endian associate command. Holger Schurig (23): libertas: rename wlan_association_worker libertas: a debug output was missing a newline libertas: fix removal of all debugfs files libertas: remove __FILE__ from debug output libertas: remove unused/superfluous definitions of DEV_NAME_LEN libertas: move vendor product id's into if_usb.c libertas: make libertas_wlan_data_rates static libertas: exclude non-used code when PROC_DEBUG is not set libertas: make debug configurable libertas: tune debug code libertas: single out mesh code libertas: change debug output of libertas_interrupt() libertas: get rid of libertas_sbi_get_priv() libertas: fix SSID output libertas: changed some occurences of kmalloc() + memset(a,0,sz) to kzalloc() libertas: move reset_device() code main.c to if_usb.c libertas: split wlan_add_card() libertas: indirect all hardware access via hw_ functions libertas: move contents of fw.h to decl.h libertas: split module into two (libertas.ko and usb8xxx.ko) libertas: fix RESET logic at unload time libertas: let DRV_NAME be overridable libertas: remove unused variables in wlan_dev_t Javier Cardona (2): libertas: fixed transmission flow control on the mesh interface libertas: added transmission failures to mesh statistics Luis Carlos Cobo (4): libertas: fixed incorrect assigment of fcs errors to frag errors libertas: add URB debug info libertas: fixed kernel oops on module/card removal libertas: updated mesh commands for 5.220.9.p11 Luis Carlos Cobo Rus (6): libertas: version bump (321p0) and cmds update for new fw (5.220.10.p0) libertas: cleanup of fwt_list_route 
processing libertas: updated readme file libertas: make mac address configuration work with mesh interface too libertas: split wext for eth and msh libertas: support for mesh autostart on firmware 5.220.11 Marcelo Tosatti (5): libertas: scan two channels per scan command libertas: remove deprecated pm_register and associated code libertas: fix scanning from associate path libertas: fix error handling of card initialization libertas: fix oops on rmmod drivers/net/wireless/Kconfig | 19 +-
Re: [PATCH 1/2] NetXen: Fix link status messages
Mithlesh Thukral wrote: NetXen: Fix incorrect link status even with switch turned OFF. NetXen driver failed to accurately indicate when a link is up or down. This was encountered during failover testing, when the first port indicated that the link was up even when the 10G switch it was assigned to in the Bladecenter was turned off completely. Signed-off-by: Wen Xiong [EMAIL PROTECTED] Signed-off-by: Mithlesh Thukral [EMAIL PROTECTED] --- drivers/net/netxen/netxen_nic.h |1 + drivers/net/netxen/netxen_nic_init.c | 21 + drivers/net/netxen/netxen_nic_isr.c | 24 3 files changed, 38 insertions(+), 8 deletions(-) applied to #upstream-fixes - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/3] myri10ge: limit the number of recoveries
Brice Goglin wrote: Limit the number of recoveries from a NIC hw watchdog reset to 1 by default. It enables detection of defective NICs immediately since these memory parity errors are expected to happen very rarely (less than once per century*NIC). Signed-off-by: Brice Goglin [EMAIL PROTECTED] --- drivers/net/myri10ge/myri10ge.c | 12 ++-- 1 file changed, 10 insertions(+), 2 deletions(-) applied 1-3 to #upstream-fixes - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[KJ PATCH] Replacing alloc_pages(gfp,0) with alloc_page(gfp) in net/core/skbuff.c
Replacing alloc_pages(gfp,0) with alloc_page(gfp) in net/core/skbuff.c Signed-off-by: Shani Moideen [EMAIL PROTECTED] diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1422573..b923181 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1805,7 +1805,7 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb, return -EFAULT; /* allocate a new page for next frag */ - page = alloc_pages(sk-sk_allocation, 0); + page = alloc_page(sk-sk_allocation); /* If alloc_page fails just return failure and caller will * free previous allocated pages by doing kfree_skb() -- Shani - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[KJ PATCH] Replacing alloc_pages(gfp,0) with alloc_page(gfp) in net/core/pktgen.c
Replacing alloc_pages(gfp,0) with alloc_page(gfp) in net/core/pktgen.c Signed-off-by: Shani Moideen [EMAIL PROTECTED] diff --git a/net/core/pktgen.c b/net/core/pktgen.c index b92a322..2600c7f 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2414,7 +2414,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, i = 0; while (datalen 0) { - struct page *page = alloc_pages(GFP_KERNEL, 0); + struct page *page = alloc_page(GFP_KERNEL); skb_shinfo(skb)-frags[i].page = page; skb_shinfo(skb)-frags[i].page_offset = 0; skb_shinfo(skb)-frags[i].size = @@ -2762,7 +2762,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, i = 0; while (datalen 0) { - struct page *page = alloc_pages(GFP_KERNEL, 0); + struct page *page = alloc_page(GFP_KERNEL); skb_shinfo(skb)-frags[i].page = page; skb_shinfo(skb)-frags[i].page_offset = 0; skb_shinfo(skb)-frags[i].size = -- Shani - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[KJ PATCH] Replacing alloc_pages(gfp,0) with alloc_page(gfp) in net/core/sock.c
Replacing alloc_pages(gfp,0) with alloc_page(gfp) in net/core/sock.c Signed-off-by: Shani Moideen [EMAIL PROTECTED] diff --git a/net/core/sock.c b/net/core/sock.c index 22183c2..25bb52b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1193,7 +1193,7 @@ static struct sk_buff *sock_alloc_send_pskb(struct sock *sk, struct page *page; skb_frag_t *frag; - page = alloc_pages(sk-sk_allocation, 0); + page = alloc_page(sk-sk_allocation); if (!page) { err = -ENOBUFS; skb_shinfo(skb)-nr_frags = i; -- Shani - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Patch to drivers/usb/serial/sierra.c to support Sierra Wireless Aircard 595U
*** linux-2.6.21.5/drivers/usb/serial/sierra.c Mon Jun 11 11:37:06 2007 --- linux-2.6.21.5a/drivers/usb/serial/sierra.c Fri Jun 8 23:37:06 2007 *** *** 44,49 --- 44,50 { USB_DEVICE(0x1199, 0x0112) }, /* Sierra Wireless AirCard 580 */ { USB_DEVICE(0x0F3D, 0x0112) }, /* AirPrime/Sierra PC 5220 */ + { USB_DEVICE(0x1199, 0x0120) }, /* Sierra Wireless Aircard 595U */ { } }; MODULE_DEVICE_TABLE(usb, id_table); *** *** 66,71 --- 67,73 { USB_DEVICE(0x1199, 0x6803) }, /* Sierra Wireless MC8765 */ { USB_DEVICE(0x1199, 0x6812) }, /* Sierra Wireless MC8775 */ { USB_DEVICE(0x1199, 0x6820) }, /* Sierra Wireless AirCard 875 */ + { USB_DEVICE(0x1199, 0x0120) }, /* Sierra Wireless Aircard 595U */ { } }; - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[git patches] net driver fixes
This is a resend of the submission from June 9th, along with added stuff: * big update to new (in 2.6.22) wireless driver libertas * revert e100 's-bit' change; see commit message for more info * more myri, NetXen fixes Please pull from 'upstream-linus' branch of master.kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6.git upstream-linus to receive the following updates: drivers/net/e100.c | 72 ++- drivers/net/ehea/ehea.h|2 +- drivers/net/ehea/ehea_main.c | 12 +- drivers/net/ibmveth.c | 80 +- drivers/net/myri10ge/myri10ge.c| 29 +- drivers/net/netxen/netxen_nic.h| 48 +- drivers/net/netxen/netxen_nic_ethtool.c|8 +- drivers/net/netxen/netxen_nic_hw.c | 12 +- drivers/net/netxen/netxen_nic_init.c | 44 +- drivers/net/netxen/netxen_nic_isr.c| 24 + drivers/net/netxen/netxen_nic_main.c |7 + drivers/net/netxen/netxen_nic_niu.c|8 +- drivers/net/phy/marvell.c | 62 +- drivers/net/usb/Kconfig|4 +- drivers/net/via-velocity.c |2 +- drivers/net/wireless/Kconfig | 19 +- drivers/net/wireless/libertas/11d.c| 152 ++-- drivers/net/wireless/libertas/11d.h|6 +- drivers/net/wireless/libertas/Makefile |4 +- drivers/net/wireless/libertas/README | 52 +- drivers/net/wireless/libertas/assoc.c | 358 +--- drivers/net/wireless/libertas/assoc.h | 10 +- drivers/net/wireless/libertas/cmd.c| 559 +-- drivers/net/wireless/libertas/cmdresp.c| 376 drivers/net/wireless/libertas/debugfs.c| 432 drivers/net/wireless/libertas/decl.h | 20 +- drivers/net/wireless/libertas/defs.h | 101 ++- drivers/net/wireless/libertas/dev.h| 99 +- drivers/net/wireless/libertas/ethtool.c| 55 +- drivers/net/wireless/libertas/fw.c | 111 +-- drivers/net/wireless/libertas/fw.h | 13 - drivers/net/wireless/libertas/host.h | 17 +- drivers/net/wireless/libertas/hostcmd.h| 392 drivers/net/wireless/libertas/if_bootcmd.c |6 +- drivers/net/wireless/libertas/if_usb.c | 448 + drivers/net/wireless/libertas/if_usb.h | 32 +- drivers/net/wireless/libertas/ioctl.c | 286 -- drivers/net/wireless/libertas/join.c | 464 - 
drivers/net/wireless/libertas/join.h | 13 +- drivers/net/wireless/libertas/main.c | 690 ++--- drivers/net/wireless/libertas/rx.c | 64 +- drivers/net/wireless/libertas/sbi.h| 40 - drivers/net/wireless/libertas/scan.c | 1529 +--- drivers/net/wireless/libertas/scan.h | 81 +- drivers/net/wireless/libertas/thread.h |8 +- drivers/net/wireless/libertas/tx.c | 74 +- drivers/net/wireless/libertas/types.h | 63 +- drivers/net/wireless/libertas/wext.c | 778 --- drivers/net/wireless/libertas/wext.h | 13 +- 49 files changed, 4001 insertions(+), 3778 deletions(-) delete mode 100644 drivers/net/wireless/libertas/fw.h delete mode 100644 drivers/net/wireless/libertas/sbi.h Brian King (2): ibmveth: Fix h_free_logical_lan error on pool resize ibmveth: Automatically enable larger rx buffer pools for larger mtu Brice Goglin (3): myri10ge: limit the number of recoveries myri10ge: report when the link partner is running in Myrinet mode myri10ge: update driver version Chris Ball (1): libertas: wakeup both mesh and normal wakeup when getting out of scan Dan Williams (26): libertas: call SET_NETDEV_DEV from common code libertas: replace 'macaddress' with 'bssid' libertas: correctly unregister mesh netdev on error libertas: don't tear down netdev in libertas_activate_card libertas: make scan result handling more flexible libertas: fix 'keep previous scan' behavior libertas: move channel changing into association framework libertas: make association paths consistent libertas: use MAC_FMT and MAC_ARG where appropriate libertas: use compare_ether_addr() rather than memcmp() where appropriate libertas: fix debug enter/leave prints for libertas_execute_next_command libertas: correctly balance locking in libertas_process_rx_command libertas: correct error report paths for wlan_fwt_list_ioctl libertas: fix deadlock SIOCGIWSCAN handler libertas: fix default adhoc channel libertas: honor specific channel requests during association libertas: send SIOCGIWSCAN event after partial scans too libertas: 
debug print spacing fixes in assoc.c libertas: add more verbose debugging to libertas_cmd_80211_authenticate libertas: Make WPA work through supplicant handshake libertas: sparse fixes
Re: 2.6.20.7 TCP cubic (and bic) initial slow start way too slow?
On Tue, 12 Jun 2007, Stephen Hemminger wrote: On Tue, 12 Jun 2007 15:12:58 -0700 (PDT) David Miller [EMAIL PROTECTED] wrote: From: Bill Fink [EMAIL PROTECTED] Date: Wed, 16 May 2007 02:44:09 -0400 [EMAIL PROTECTED] ~]# netstat -s | grep -i retrans 25446 segments retransmited 20936 fast retransmits 4503 retransmits in slow start 4 sack retransmits failed It then only took 2.14 seconds to transfer 1 GB of data. That's all for now. Thanks for all of your testing and numbers Bill. Injong et al., we have to do something about this, the issue has been known and sitting around for weeks if not months. How safely can we set the default initial_ssthresh to zero in Cubic and BIC? Yes. set it to zero. The module parameter could even go, and just leave the route metric as a way to set/remember it. Actually, after thinking about this some more I had some second thoughts about the matter. For my scenario of an uncongested 10-GigE path an initial_ssthresh=0 is definitely what is desired. But perhaps on a congested link with lots of connections, the initial_ssthresh=100 setting might have some benefit. I don't have an easy way of testing that so I was hoping Injong or someone else might do that and report back. If there was a benefit, perhaps it would be useful to have a per-route option for setting the initial_ssthresh. That would leave the question of what to make the default. There was also the mystery of why cubic's slow start performance was so much worse than bic's. If a real benefit could be demonstrated for the congested case, and if bic's slow start behavior could be grafted onto cubic, then bic's current slow start performance (with initial_ssthresh=100) might serve as an adequate compromise between performance and not being overly aggressive for the default behavior. OTOH just setting it to zero as a default should also be fine as that's the standard Reno behavior. 
I'm leaning in that direction personally, but I'm possibly biased because of my environment, where I'm trying to get maximum performance out of 10-GigE WAN networks that aren't particularly congested normally. -Bill - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH] NET: Multiqueue network device support.
-Original Message- From: [EMAIL PROTECTED] [mailto:netdev- [EMAIL PROTECTED] On Behalf Of Jason Lunz Sent: Tuesday, June 12, 2007 2:48 PM To: David Miller Cc: [EMAIL PROTECTED]; [EMAIL PROTECTED]; netdev@vger.kernel.org; [EMAIL PROTECTED]; [EMAIL PROTECTED]; [EMAIL PROTECTED]; [EMAIL PROTECTED] Subject: Re: [PATCH] NET: Multiqueue network device support. On Tue, Jun 12, 2007 at 02:26:58PM -0700, David Miller wrote: The MAC is still very much centralized in most designs. So one way they'll do it is to support assigning N MAC addresses, and you configure the input filters of the chip to push packets for each MAC to the proper receive queue. So the MAC will accept any of those in the N MAC addresses as its own, then you use the filtering facilities to steer frames to the correct RX queue. The TX and RX queues can be so isolated as to be able to be exported to virtualization nodes. You can give them full access to the DMA queues and associated mailboxes. So instead of all of this bogus virtualized device overhead, you just give the guest access to the real device. So you can use multiple queues either for better single node SMP performance, or better virtualization performance. Are you aware of any hardware designs that allow other ways to map packets onto rx queues? I can think of several scenarios where it could be advantageous to map packets by IP 3- or 5-tuple to get cpu locality all the way up the stack on a flow-by-flow basis. But doing this would require some way to request this mapping from the hardware. 10GbE Xframe NICs do that, as well as rx steering by MAC address, VLAN, MS RSS, generic hashing and a bunch of other criteria (there is actually a decent chapter on rx steering in the ASIC manual at www.neterion.com support page). The caveat is that in the current products the tuple table is limited to 256 entries only. The next ASIC bumps this number to 64k. 
In the extreme case it would be cool if it were possible to push a bpf-like classifier down into the hardware to allow arbitrary kinds of flow distribution. Jason - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/4] NetXen: Add correct routines to setup multicast address
Mithlesh, You don't initialize max_mc_count anywhere. The multicast address pool can hold 16 addresses for ports {0,1} and 4 for ports {2,3}. You should have the following line in the probe routine. adapter->max_mc_count = (adapter->portnum > 1) ? 4 : 16; -- Dhananjay Phadke NetXen Inc. Mithlesh Thukral wrote: NetXen: Add multi cast filter code This patch adds multi cast filter code to NetXen NIC driver. It also adds capabilities to setup the multicast address in hardware from the host side. Signed-off by: Mithlesh Thukral [EMAIL PROTECTED] --- drivers/net/netxen/netxen_nic.h | 24 + drivers/net/netxen/netxen_nic_hdr.h |3 drivers/net/netxen/netxen_nic_hw.c | 119 +- 3 files changed, 143 insertions(+), 3 deletions(-) diff --git a/drivers/net/netxen/netxen_nic.h b/drivers/net/netxen/netxen_nic.h index a0b39ee..2fddfd1 100644 --- a/drivers/net/netxen/netxen_nic.h +++ b/drivers/net/netxen/netxen_nic.h @@ -261,6 +261,27 @@ #define netxen_set_msg_ctxid(config_word #define netxen_set_msg_opcode(config_word, val) \ ((config_word) = ~(0xf28), (config_word) |= (val 0xf) 28) +#define netxen_set_addr_ctl_id_pool0(config_word, val) \ + ((config_word) = ~3, (config_word) |= val 0x3) +#define netxen_set_addr_ctl_enable_xtnd_0(config_word) \ + ((config_word) |= 1 2) +#define netxen_set_addr_ctl_id_pool1(config_word, val) \ + ((config_word) = ~(0x34), (config_word) |= (val 0x3) 4) +#define netxen_set_addr_ctl_enable_xtnd_1(config_word) \ + ((config_word) |= 1 6) +#define netxen_set_addr_ctl_id_pool2(config_word, val) \ + ((config_word) = ~(0x38), (config_word) |= (val 0x3) 8) +#define netxen_set_addr_ctl_enable_xtnd_2(config_word) \ + ((config_word) |= 1 10) +#define netxen_set_addr_ctl_id_pool3(config_word, val) \ + ((config_word) = ~(0x312), (config_word) |= (val 0x3) 12) +#define netxen_set_addr_ctl_enable_xtnd_3(config_word) \ + ((config_word) |= 1 14) +#define netxen_set_addr_ctl_mode(config_word, val) \ + ((config_word) = ~(0x326), (config_word) |= (val 0x3) 26) +#define 
netxen_set_addr_ctl_enable_poll(config_word, val)\ + ((config_word) = ~(0xf30), (config_word) |= (val 0xf) 30) + struct netxen_rcv_context { __le64 rcv_ring_addr; __le32 rcv_ring_size; @@ -883,6 +904,9 @@ struct netxen_adapter { unsigned char mac_addr[ETH_ALEN]; int mtu; int portnum; + u8 promisc; + u8 mc_enabled; + u8 max_mc_count; spinlock_t tx_lock; spinlock_t lock; diff --git a/drivers/net/netxen/netxen_nic_hdr.h b/drivers/net/netxen/netxen_nic_hdr.h index 608e37b..2bfecbc 100644 --- a/drivers/net/netxen/netxen_nic_hdr.h +++ b/drivers/net/netxen/netxen_nic_hdr.h @@ -545,6 +545,9 @@ #define NETXEN_MULTICAST_ADDR_HI_1(NETX #define NETXEN_MULTICAST_ADDR_HI_2 (NETXEN_CRB_NIU + 0x1018) #define NETXEN_MULTICAST_ADDR_HI_3 (NETXEN_CRB_NIU + 0x101c) +#define NETXEN_UNICAST_ADDR_BASE (NETXEN_CRB_NIU + 0x1080) +#define NETXEN_MULTICAST_ADDR_BASE (NETXEN_CRB_NIU + 0x1100) + #define NETXEN_NIU_GB_MAC_CONFIG_0(I) \ (NETXEN_CRB_NIU + 0x3 + (I)*0x1) #define NETXEN_NIU_GB_MAC_CONFIG_1(I) \ diff --git a/drivers/net/netxen/netxen_nic_hw.c b/drivers/net/netxen/netxen_nic_hw.c index baff17a..c5d4ff9 100644 --- a/drivers/net/netxen/netxen_nic_hw.c +++ b/drivers/net/netxen/netxen_nic_hw.c @@ -303,6 +303,97 @@ int netxen_nic_set_mac(struct net_device return 0; } +#define NETXEN_UNICAST_ADDR(port, index) \ + (NETXEN_UNICAST_ADDR_BASE+(port*32)+(index*8)) + +int netxen_nic_enable_mcast_filter(struct netxen_adapter *adapter) +{ + u32 val = 0; + u16 port = physical_port[adapter-portnum]; + + if (adapter-mc_enabled) + return 0; + + netxen_set_addr_ctl_enable_poll(val, 0xf); + + if (adapter-ahw.board_type == NETXEN_NIC_XGBE) + netxen_set_addr_ctl_mode(val, 0x3); + else + netxen_set_addr_ctl_mode(val, 0x0); + + netxen_set_addr_ctl_id_pool0(val, 0x0); + netxen_set_addr_ctl_id_pool1(val, 0x1); + netxen_set_addr_ctl_id_pool2(val, 0x2); + netxen_set_addr_ctl_id_pool3(val, 0x3); + + netxen_set_addr_ctl_enable_xtnd_0(val); + netxen_set_addr_ctl_enable_xtnd_1(val); + 
netxen_set_addr_ctl_enable_xtnd_2(val); + netxen_set_addr_ctl_enable_xtnd_3(val); + + netxen_crb_writelit_adapter(adapter, NETXEN_MAC_ADDR_CNTL_REG, val); + + val = 0xff; + + netxen_crb_writelit_adapter(adapter, NETXEN_UNICAST_ADDR(port,0), val); + netxen_crb_writelit_adapter(adapter, NETXEN_UNICAST_ADDR(port,0)+4, + val); + + memcpy(val, adapter-mac_addr, 3); + netxen_crb_writelit_adapter(adapter,
Re: [PATCH] NET: Multiqueue network device support.
On Tue, 2007-06-12 at 23:17 +0200, Patrick McHardy wrote: I've hacked up a small multiqueue simulator device and to my big surprise my testing showed that Jamal's suggestion of using a single queue state seems to work better than I expected. But I've been doing mostly testing of the device itself up to now with very simple traffic patterns (mostly just flood all queues), so I'll try to get some real results tomorrow. The key argument for Jamal's solution is that the NIC will send out 32 packets in the full PHL in a reasonably short time (a few microsecs per Jamal's calculation). But for wireless, the PHL hardware has a low probability of seizing the wireless medium when the air is full of high priority frames. That is, the chance for transmission in PHL and PHH is not equal. Queuing packets in software will starve high priority packets, compared with putting them into PHH as early as possible. Patrick, I don't think your testing considered the above scenario, right? Thanks, -yi - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html