[PATCH 8/8] [I/OAT] TCP recv offload to I/OAT
Locks down user pages and sets up for DMA in tcp_recvmsg, then calls dma_async_try_early_copy in tcp_v4_do_rcv Signed-off-by: Chris Leech <[EMAIL PROTECTED]> --- include/net/netdma.h |1 net/ipv4/tcp.c | 111 +- net/ipv4/tcp_input.c | 78 --- net/ipv4/tcp_ipv4.c | 20 + net/ipv6/tcp_ipv6.c | 12 + 5 files changed, 201 insertions(+), 21 deletions(-) diff --git a/include/net/netdma.h b/include/net/netdma.h index 415d74c..2d829e1 100644 --- a/include/net/netdma.h +++ b/include/net/netdma.h @@ -37,5 +37,6 @@ static inline struct dma_chan *get_softn int dma_skb_copy_datagram_iovec(struct dma_chan* chan, const struct sk_buff *skb, int offset, struct iovec *to, size_t len, struct dma_locked_list *locked_list); +int dma_async_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen); #endif /* NETDMA_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 13abfa2..b792048 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -262,6 +262,9 @@ #include #include #include +#ifdef CONFIG_NET_DMA +#include +#endif #include @@ -1109,6 +1112,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru int target; /* Read at least this many bytes */ long timeo; struct task_struct *user_recv = NULL; + int copied_early = 0; lock_sock(sk); @@ -1132,6 +1136,12 @@ int tcp_recvmsg(struct kiocb *iocb, stru target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); +#ifdef CONFIG_NET_DMA + tp->ucopy.dma_chan = NULL; + if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma)) + dma_lock_iovec_pages(msg->msg_iov, len, &tp->ucopy.locked_list); +#endif + do { struct sk_buff *skb; u32 offset; @@ -1273,6 +1283,10 @@ int tcp_recvmsg(struct kiocb *iocb, stru } else sk_wait_data(sk, &timeo); +#ifdef CONFIG_NET_DMA + tp->ucopy.wakeup = 0; +#endif + if (user_recv) { int chunk; @@ -1328,13 +1342,39 @@ do_prequeue: } if (!(flags & MSG_TRUNC)) { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); - if (err) { - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; +#ifdef CONFIG_NET_DMA + if (!tp->ucopy.dma_chan && tp->ucopy.locked_list) + tp->ucopy.dma_chan = get_softnet_dma(); + + if (tp->ucopy.dma_chan) { + tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( + tp->ucopy.dma_chan, skb, offset, + msg->msg_iov, used, + tp->ucopy.locked_list); + + if (tp->ucopy.dma_cookie < 0) { + + printk(KERN_ALERT "dma_cookie < 0\n"); + + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } + if ((offset + used) == skb->len) + copied_early = 1; + + } else +#endif + { + err = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } } } @@ -1354,15 +1394,33 @@ skip_copy: if (skb->h.th->fin) goto found_fin_ok; - if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + if (!(flags & MSG_PEEK)) { + if (!copied_early) + sk_eat_skb(sk, skb); +#ifdef CONFIG_NET_DMA + else { + __skb_unlink(skb, &sk->sk_receive_queue); + __skb_queue_tail(&sk->sk_async_wait_queue, skb); + copied_early = 0; + } +#endif + } continue; found_
[PATCH 8/8] [I/OAT] TCP recv offload to I/OAT
Locks down user pages and sets up for DMA in tcp_recvmsg, then calls dma_async_try_early_copy in tcp_v4_do_rcv Signed-off-by: Chris Leech <[EMAIL PROTECTED]> --- include/net/netdma.h |1 net/ipv4/tcp.c | 110 +- net/ipv4/tcp_input.c | 74 ++ net/ipv4/tcp_ipv4.c | 18 net/ipv6/tcp_ipv6.c | 12 + 5 files changed, 193 insertions(+), 22 deletions(-) diff --git a/include/net/netdma.h b/include/net/netdma.h index feb499f..3d9c222 100644 --- a/include/net/netdma.h +++ b/include/net/netdma.h @@ -38,6 +38,7 @@ static inline struct dma_chan *get_softn int dma_skb_copy_datagram_iovec(struct dma_chan* chan, const struct sk_buff *skb, int offset, struct iovec *to, size_t len, struct dma_pinned_list *pinned_list); +int dma_async_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen); #endif /* CONFIG_NET_DMA */ #endif /* NETDMA_H */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9122520..a277398 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -262,7 +262,7 @@ #include #include #include - +#include #include #include @@ -1109,6 +1109,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru int target; /* Read at least this many bytes */ long timeo; struct task_struct *user_recv = NULL; + int copied_early = 0; lock_sock(sk); @@ -1132,6 +1133,12 @@ int tcp_recvmsg(struct kiocb *iocb, stru target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); +#ifdef CONFIG_NET_DMA + tp->ucopy.dma_chan = NULL; + if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma)) + tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); +#endif + do { struct sk_buff *skb; u32 offset; @@ -1273,6 +1280,10 @@ int tcp_recvmsg(struct kiocb *iocb, stru } else sk_wait_data(sk, &timeo); +#ifdef CONFIG_NET_DMA + tp->ucopy.wakeup = 0; +#endif + if (user_recv) { int chunk; @@ -1328,13 +1339,39 @@ do_prequeue: } if (!(flags & MSG_TRUNC)) { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); - if (err) { - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; +#ifdef CONFIG_NET_DMA + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) + tp->ucopy.dma_chan = get_softnet_dma(); + + if (tp->ucopy.dma_chan) { + tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( + tp->ucopy.dma_chan, skb, offset, + msg->msg_iov, used, + tp->ucopy.pinned_list); + + if (tp->ucopy.dma_cookie < 0) { + + printk(KERN_ALERT "dma_cookie < 0\n"); + + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } + if ((offset + used) == skb->len) + copied_early = 1; + + } else +#endif + { + err = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } } } @@ -1354,15 +1391,33 @@ skip_copy: if (skb->h.th->fin) goto found_fin_ok; - if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + if (!(flags & MSG_PEEK)) { + if (!copied_early) + sk_eat_skb(sk, skb); +#ifdef CONFIG_NET_DMA + else { + __skb_unlink(skb, &sk->sk_receive_queue); + __skb_queue_tail(&sk->sk_async_wait_queue, skb); + copied_early = 0; + } +#endif + } continue;
Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT
On Fri, Mar 03, 2006 at 01:42:36PM -0800, Chris Leech wrote: > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c > index 13abfa2..b792048 100644 > --- a/net/ipv4/tcp.c > +++ b/net/ipv4/tcp.c > @@ -262,6 +262,9 @@ > #include > #include > #include > +#ifdef CONFIG_NET_DMA > +#include > +#endif #ifdef is not needed here (try not to put #ifdef in .c files.) I think a few of your other usages of #ifdef in this file can also be removed with judicious use of inline functions in a .h file. thanks, greg k-h - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT
Chris Leech <[EMAIL PROTECTED]> wrote: > > Locks down user pages and sets up for DMA in tcp_recvmsg, then calls > dma_async_try_early_copy in tcp_v4_do_rcv > +#ifdef CONFIG_NET_DMA +#ifdef CONFIG_NET_DMA +#ifdef CONFIG_NET_DMA +#ifdef CONFIG_NET_DMA +#ifdef CONFIG_NET_DMA +#ifdef CONFIG_NET_DMA +#ifdef CONFIG_NET_DMA +#ifdef CONFIG_NET_DMA +#ifdef CONFIG_NET_DMA +#ifdef CONFIG_NET_DMA +#ifdef CONFIG_NET_DMA +#ifdef CONFIG_NET_DMA +#ifdef CONFIG_NET_DMA +#ifdef CONFIG_NET_DMA +#ifdef CONFIG_NET_DMA +#ifdef CONFIG_NET_DMA waaay too many ifdefs. There are various tricks we use to minimise them. > +#ifdef CONFIG_NET_DMA > + tp->ucopy.dma_chan = NULL; > + if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && > !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma)) > + dma_lock_iovec_pages(msg->msg_iov, len, &tp->ucopy.locked_list); > +#endif Please try to fit code into 80 columns. That's decimal 80 ;) > @@ -1328,13 +1342,39 @@ do_prequeue: > } > > if (!(flags & MSG_TRUNC)) { > - err = skb_copy_datagram_iovec(skb, offset, > - msg->msg_iov, used); > - if (err) { > - /* Exception. Bailout! */ > - if (!copied) > - copied = -EFAULT; > - break; > +#ifdef CONFIG_NET_DMA > + if (!tp->ucopy.dma_chan && tp->ucopy.locked_list) > + tp->ucopy.dma_chan = get_softnet_dma(); > + > + if (tp->ucopy.dma_chan) { > + tp->ucopy.dma_cookie = > dma_skb_copy_datagram_iovec( > + tp->ucopy.dma_chan, skb, offset, > + msg->msg_iov, used, > + tp->ucopy.locked_list); > + > + if (tp->ucopy.dma_cookie < 0) { > + > + printk(KERN_ALERT "dma_cookie < 0\n"); > + > + /* Exception. Bailout! */ > + if (!copied) > + copied = -EFAULT; > + break; > + } > + if ((offset + used) == skb->len) > + copied_early = 1; > + Consider trimming some of those blank lines. I don't think they add any value? > + } else > +#endif > + { These games with ifdefs and else statements aren't at all pleasant. Sometimes they're hard to avoid, but you'll probably find that some code rearrangemnt (in a preceding patch) makes it easier. Like, split this function into several. > @@ -1354,15 +1394,33 @@ skip_copy: > > if (skb->h.th->fin) > goto found_fin_ok; > - if (!(flags & MSG_PEEK)) > - sk_eat_skb(sk, skb); > + if (!(flags & MSG_PEEK)) { > + if (!copied_early) > + sk_eat_skb(sk, skb); > +#ifdef CONFIG_NET_DMA > + else { > + __skb_unlink(skb, &sk->sk_receive_queue); > + __skb_queue_tail(&sk->sk_async_wait_queue, skb); > + copied_early = 0; > + } > +#endif > ... > - sk_eat_skb(sk, skb); > + if (!(flags & MSG_PEEK)) { > + if (!copied_early) > + sk_eat_skb(sk, skb); > +#ifdef CONFIG_NET_DMA > + else { > + __skb_unlink(skb, &sk->sk_receive_queue); > + __skb_queue_tail(&sk->sk_async_wait_queue, skb); > + copied_early = 0; > + } > +#endif > + } etc. > +#ifdef CONFIG_NET_DMA > + if (copied_early) > + __skb_queue_tail(&sk->sk_async_wait_queue, skb); > + else > +#endif > if (eaten) > __kfree_skb(skb); > else etc. > @@ -4049,6 +4067,52 @@ discard: > return 0; > } > > +#ifdef CONFIG_NET_DMA > +int dma_async_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen) > +{ > + struct tcp_sock *tp = tcp_sk(sk); > + int chunk = skb->len - hlen; > + int dma_cookie; > + int copied_early = 0; > + > + if (tp->ucopy.wakeup) > + goto out; In this case a simple return 0; would be fine. We haven't done anything yet. > +#ifdef CONFIG_NET_DMA > + struct tcp_sock *tp = tcp_sk(sk); > + if (!tp->ucopy.dma_chan && tp->ucopy.locked_list) > + tp->ucopy.dma_chan = get_s
Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT
Chris Leech <[EMAIL PROTECTED]> wrote: > > +#ifdef CONFIG_NET_DMA > +tp->ucopy.dma_chan = NULL; > +if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && > !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma)) > +dma_lock_iovec_pages(msg->msg_iov, len, &tp->ucopy.locked_list); > +#endif The __get_cpu_var() here will run smp_processor_id() from preemptible context. You'll get a big warning if the correct debug options are set. The reason for this is that preemption could cause this code to hop between CPUs. Please always test code with all debug options enabled and with full kernel preemption. - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT
From: Andrew Morton <[EMAIL PROTECTED]> Date: Sun, 5 Mar 2006 00:45:34 -0800 > The __get_cpu_var() here will run smp_processor_id() from preemptible > context. You'll get a big warning if the correct debug options are set. > > The reason for this is that preemption could cause this code to hop between > CPUs. > > Please always test code with all debug options enabled and with full kernel > preemption. To be fair that warning doesn't trigger on some platforms, such as sparc64 where the __get_cpu_var() implementation simply takes the value from a fixed cpu register and doesn't do the debugging check. Sparc64 should add the check when debugging options are enabled, for sure, but the point is that it may not entirely be the tester's fault. :-) - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT
Hi! > --- a/net/ipv4/tcp.c > +++ b/net/ipv4/tcp.c > @@ -262,6 +262,9 @@ > #include > #include > #include > +#ifdef CONFIG_NET_DMA > +#include > +#endif > Remove the ifdefs, move them inside .h if needed. > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c > index 7625eaf..9b6290d 100644 > --- a/net/ipv4/tcp_input.c > +++ b/net/ipv4/tcp_input.c > @@ -71,6 +71,9 @@ > #include > #include > #include > +#ifdef CONFIG_NET_DMA > +#include > +#endif Here, too. > +#ifdef CONFIG_NET_DMA > + if (copied_early) > + __skb_queue_tail(&sk->sk_async_wait_queue, skb); > + else > +#endif > if (eaten) > __kfree_skb(skb); > else Could you #define copied_early to 0 and avoid ifdefs? > @@ -1091,8 +1094,18 @@ process: > bh_lock_sock(sk); > ret = 0; > if (!sock_owned_by_user(sk)) { > - if (!tcp_prequeue(sk, skb)) > +#ifdef CONFIG_NET_DMA > + struct tcp_sock *tp = tcp_sk(sk); > + if (!tp->ucopy.dma_chan && tp->ucopy.locked_list) > + tp->ucopy.dma_chan = get_softnet_dma(); > + if (tp->ucopy.dma_chan) > + ret = tcp_v4_do_rcv(sk, skb); > + else > +#endif > + { > + if (!tcp_prequeue(sk, skb)) > ret = tcp_v4_do_rcv(sk, skb); > + } > } else Wrong indentation... Pavel -- Thanks, Sharp! - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT
> #ifdef is not needed here (try not to put #ifdef in .c files.) I think > a few of your other usages of #ifdef in this file can also be removed > with judicious use of inline functions in a .h file. ACK on all the ifdef comments. I may have gone a little ifdef crazy making sure I could get to a zero impact state with these patches applied but CONFIG_NET_DMA turned off. I'll get these cleaned up. - Chris - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT
On 3/5/06, Andrew Morton <[EMAIL PROTECTED]> wrote: > Chris Leech <[EMAIL PROTECTED]> wrote: > > > > +#ifdef CONFIG_NET_DMA > > +tp->ucopy.dma_chan = NULL; > > +if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && > > !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma)) > > +dma_lock_iovec_pages(msg->msg_iov, len, > > &tp->ucopy.locked_list); > > +#endif > > The __get_cpu_var() here will run smp_processor_id() from preemptible > context. You'll get a big warning if the correct debug options are set. > > The reason for this is that preemption could cause this code to hop between > CPUs. I've been playing with different models of where to select which DMA channel to use in order to reduce cache thrash and lock contention in the driver. It's not a clean per-cpu issue because per I/O there are potentially operations happening in both the process syscall and the netrx softirq context. Right now the code delays selection of a DMA channel until the first offload copy is ready to go, so the __get_cpu_var() you point out is just checking to see if any hardware exists for I/OAT at this point before doing the page pinning. Before anything is done with the channel the per-cpu pointer is re-read safely with preemption disabled and a reference count is incremented. - Chris - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT
Chris Leech <[EMAIL PROTECTED]> wrote: > > Locks down user pages and sets up for DMA in tcp_recvmsg, then calls > dma_async_try_early_copy in tcp_v4_do_rcv All those ifdefs are still there. They really do put a maintenance burden on, of all places, net/ipv4/tcp.c. Please find a way of abstracting out the conceptual additions which IOAT makes to TCP and to wrap that into inline functions, etc. See how tidily NF_HOOK() has inserted things into the net stack. Also, for something which is billed as an performance enhancement, benchmark numbers are the key thing which we need to judge the desirability of these changes. What is the plan there? Thanks. - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html