[PATCH 8/8] [I/OAT] TCP recv offload to I/OAT

2006-03-03 Thread Chris Leech
Locks down user pages and sets up for DMA in tcp_recvmsg, then calls
dma_async_try_early_copy in tcp_v4_do_rcv

Signed-off-by: Chris Leech <[EMAIL PROTECTED]>
---

 include/net/netdma.h |1 
 net/ipv4/tcp.c   |  111 +-
 net/ipv4/tcp_input.c |   78 ---
 net/ipv4/tcp_ipv4.c  |   20 +
 net/ipv6/tcp_ipv6.c  |   12 +
 5 files changed, 201 insertions(+), 21 deletions(-)

diff --git a/include/net/netdma.h b/include/net/netdma.h
index 415d74c..2d829e1 100644
--- a/include/net/netdma.h
+++ b/include/net/netdma.h
@@ -37,5 +37,6 @@ static inline struct dma_chan *get_softn
 int dma_skb_copy_datagram_iovec(struct dma_chan* chan,
const struct sk_buff *skb, int offset, struct iovec *to,
size_t len, struct dma_locked_list *locked_list);
+int dma_async_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen);
 
 #endif /* NETDMA_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 13abfa2..b792048 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -262,6 +262,9 @@
 #include 
 #include 
 #include 
+#ifdef CONFIG_NET_DMA
+#include 
+#endif
 
 
 #include 
@@ -1109,6 +1112,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru
int target; /* Read at least this many bytes */
long timeo;
struct task_struct *user_recv = NULL;
+   int copied_early = 0;
 
lock_sock(sk);
 
@@ -1132,6 +1136,12 @@ int tcp_recvmsg(struct kiocb *iocb, stru
 
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
 
+#ifdef CONFIG_NET_DMA
+   tp->ucopy.dma_chan = NULL;
+   if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && 
!sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma))
+   dma_lock_iovec_pages(msg->msg_iov, len, &tp->ucopy.locked_list);
+#endif
+
do {
struct sk_buff *skb;
u32 offset;
@@ -1273,6 +1283,10 @@ int tcp_recvmsg(struct kiocb *iocb, stru
} else
sk_wait_data(sk, &timeo);
 
+#ifdef CONFIG_NET_DMA
+   tp->ucopy.wakeup = 0;
+#endif
+
if (user_recv) {
int chunk;
 
@@ -1328,13 +1342,39 @@ do_prequeue:
}
 
if (!(flags & MSG_TRUNC)) {
-   err = skb_copy_datagram_iovec(skb, offset,
- msg->msg_iov, used);
-   if (err) {
-   /* Exception. Bailout! */
-   if (!copied)
-   copied = -EFAULT;
-   break;
+#ifdef CONFIG_NET_DMA
+   if (!tp->ucopy.dma_chan && tp->ucopy.locked_list)
+   tp->ucopy.dma_chan = get_softnet_dma();
+
+   if (tp->ucopy.dma_chan) {
+   tp->ucopy.dma_cookie = 
dma_skb_copy_datagram_iovec(
+   tp->ucopy.dma_chan, skb, offset,
+   msg->msg_iov, used,
+   tp->ucopy.locked_list);
+
+   if (tp->ucopy.dma_cookie < 0) {
+
+   printk(KERN_ALERT "dma_cookie < 0\n");
+
+   /* Exception. Bailout! */
+   if (!copied)
+   copied = -EFAULT;
+   break;
+   }
+   if ((offset + used) == skb->len)
+   copied_early = 1;
+
+   } else
+#endif
+   {
+   err = skb_copy_datagram_iovec(skb, offset,
+   msg->msg_iov, used);
+   if (err) {
+   /* Exception. Bailout! */
+   if (!copied)
+   copied = -EFAULT;
+   break;
+   }
}
}
 
@@ -1354,15 +1394,33 @@ skip_copy:
 
if (skb->h.th->fin)
goto found_fin_ok;
-   if (!(flags & MSG_PEEK))
-   sk_eat_skb(sk, skb);
+   if (!(flags & MSG_PEEK)) {
+   if (!copied_early)
+   sk_eat_skb(sk, skb);
+#ifdef CONFIG_NET_DMA
+   else {
+   __skb_unlink(skb, &sk->sk_receive_queue);
+   __skb_queue_tail(&sk->sk_async_wait_queue, skb);
+   copied_early = 0;
+   }
+#endif
+   }
continue;
 
found_

[PATCH 8/8] [I/OAT] TCP recv offload to I/OAT

2006-03-10 Thread Chris Leech
Locks down user pages and sets up for DMA in tcp_recvmsg, then calls
dma_async_try_early_copy in tcp_v4_do_rcv

Signed-off-by: Chris Leech <[EMAIL PROTECTED]>
---

 include/net/netdma.h |1 
 net/ipv4/tcp.c   |  110 +-
 net/ipv4/tcp_input.c |   74 ++
 net/ipv4/tcp_ipv4.c  |   18 
 net/ipv6/tcp_ipv6.c  |   12 +
 5 files changed, 193 insertions(+), 22 deletions(-)

diff --git a/include/net/netdma.h b/include/net/netdma.h
index feb499f..3d9c222 100644
--- a/include/net/netdma.h
+++ b/include/net/netdma.h
@@ -38,6 +38,7 @@ static inline struct dma_chan *get_softn
 int dma_skb_copy_datagram_iovec(struct dma_chan* chan,
const struct sk_buff *skb, int offset, struct iovec *to,
size_t len, struct dma_pinned_list *pinned_list);
+int dma_async_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen);
 
 #endif /* CONFIG_NET_DMA */
 #endif /* NETDMA_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 9122520..a277398 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -262,7 +262,7 @@
 #include 
 #include 
 #include 
-
+#include 
 
 #include 
 #include 
@@ -1109,6 +1109,7 @@ int tcp_recvmsg(struct kiocb *iocb, stru
int target; /* Read at least this many bytes */
long timeo;
struct task_struct *user_recv = NULL;
+   int copied_early = 0;
 
lock_sock(sk);
 
@@ -1132,6 +1133,12 @@ int tcp_recvmsg(struct kiocb *iocb, stru
 
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
 
+#ifdef CONFIG_NET_DMA
+   tp->ucopy.dma_chan = NULL;
+   if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && 
!sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma))
+   tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len);
+#endif
+
do {
struct sk_buff *skb;
u32 offset;
@@ -1273,6 +1280,10 @@ int tcp_recvmsg(struct kiocb *iocb, stru
} else
sk_wait_data(sk, &timeo);
 
+#ifdef CONFIG_NET_DMA
+   tp->ucopy.wakeup = 0;
+#endif
+
if (user_recv) {
int chunk;
 
@@ -1328,13 +1339,39 @@ do_prequeue:
}
 
if (!(flags & MSG_TRUNC)) {
-   err = skb_copy_datagram_iovec(skb, offset,
- msg->msg_iov, used);
-   if (err) {
-   /* Exception. Bailout! */
-   if (!copied)
-   copied = -EFAULT;
-   break;
+#ifdef CONFIG_NET_DMA
+   if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+   tp->ucopy.dma_chan = get_softnet_dma();
+
+   if (tp->ucopy.dma_chan) {
+   tp->ucopy.dma_cookie = 
dma_skb_copy_datagram_iovec(
+   tp->ucopy.dma_chan, skb, offset,
+   msg->msg_iov, used,
+   tp->ucopy.pinned_list);
+
+   if (tp->ucopy.dma_cookie < 0) {
+
+   printk(KERN_ALERT "dma_cookie < 0\n");
+
+   /* Exception. Bailout! */
+   if (!copied)
+   copied = -EFAULT;
+   break;
+   }
+   if ((offset + used) == skb->len)
+   copied_early = 1;
+
+   } else
+#endif
+   {
+   err = skb_copy_datagram_iovec(skb, offset,
+   msg->msg_iov, used);
+   if (err) {
+   /* Exception. Bailout! */
+   if (!copied)
+   copied = -EFAULT;
+   break;
+   }
}
}
 
@@ -1354,15 +1391,33 @@ skip_copy:
 
if (skb->h.th->fin)
goto found_fin_ok;
-   if (!(flags & MSG_PEEK))
-   sk_eat_skb(sk, skb);
+   if (!(flags & MSG_PEEK)) {
+   if (!copied_early)
+   sk_eat_skb(sk, skb);
+#ifdef CONFIG_NET_DMA
+   else {
+   __skb_unlink(skb, &sk->sk_receive_queue);
+   __skb_queue_tail(&sk->sk_async_wait_queue, skb);
+   copied_early = 0;
+   }
+#endif
+   }
continue;
 

Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT

2006-03-04 Thread Greg KH
On Fri, Mar 03, 2006 at 01:42:36PM -0800, Chris Leech wrote:
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 13abfa2..b792048 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -262,6 +262,9 @@
>  #include 
>  #include 
>  #include 
> +#ifdef CONFIG_NET_DMA
> +#include 
> +#endif

#ifdef is not needed here (try not to put #ifdef in .c files.)  I think
a few of your other usages of #ifdef in this file can also be removed
with judicious use of inline functions in a .h file.

thanks,

greg k-h
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT

2006-03-04 Thread Andrew Morton
Chris Leech <[EMAIL PROTECTED]> wrote:
>
> Locks down user pages and sets up for DMA in tcp_recvmsg, then calls
> dma_async_try_early_copy in tcp_v4_do_rcv
> 

+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA
+#ifdef CONFIG_NET_DMA

waaay too many ifdefs.   There are various tricks we use to minimise them.

> +#ifdef CONFIG_NET_DMA
> + tp->ucopy.dma_chan = NULL;
> + if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && 
> !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma))
> + dma_lock_iovec_pages(msg->msg_iov, len, &tp->ucopy.locked_list);
> +#endif

Please try to fit code into 80 columns.

That's decimal 80 ;)

> @@ -1328,13 +1342,39 @@ do_prequeue:
>   }
>  
>   if (!(flags & MSG_TRUNC)) {
> - err = skb_copy_datagram_iovec(skb, offset,
> -   msg->msg_iov, used);
> - if (err) {
> - /* Exception. Bailout! */
> - if (!copied)
> - copied = -EFAULT;
> - break;
> +#ifdef CONFIG_NET_DMA
> + if (!tp->ucopy.dma_chan && tp->ucopy.locked_list)
> + tp->ucopy.dma_chan = get_softnet_dma();
> +
> + if (tp->ucopy.dma_chan) {
> + tp->ucopy.dma_cookie = 
> dma_skb_copy_datagram_iovec(
> + tp->ucopy.dma_chan, skb, offset,
> + msg->msg_iov, used,
> + tp->ucopy.locked_list);
> +
> + if (tp->ucopy.dma_cookie < 0) {
> +
> + printk(KERN_ALERT "dma_cookie < 0\n");
> +
> + /* Exception. Bailout! */
> + if (!copied)
> + copied = -EFAULT;
> + break;
> + }
> + if ((offset + used) == skb->len)
> + copied_early = 1;
> +

Consider trimming some of those blank lines.  I don't think they add any
value?

> + } else
> +#endif
> + {

These games with ifdefs and else statements aren't at all pleasant. 
Sometimes they're hard to avoid, but you'll probably find that some code
rearrangemnt (in a preceding patch) makes it easier.  Like, split this
function into several.

> @@ -1354,15 +1394,33 @@ skip_copy:
>  
>   if (skb->h.th->fin)
>   goto found_fin_ok;
> - if (!(flags & MSG_PEEK))
> - sk_eat_skb(sk, skb);
> + if (!(flags & MSG_PEEK)) {
> + if (!copied_early)
> + sk_eat_skb(sk, skb);
> +#ifdef CONFIG_NET_DMA
> + else {
> + __skb_unlink(skb, &sk->sk_receive_queue);
> + __skb_queue_tail(&sk->sk_async_wait_queue, skb);
> + copied_early = 0;
> + }
> +#endif
> ...
> - sk_eat_skb(sk, skb);
> + if (!(flags & MSG_PEEK)) {
> + if (!copied_early)
> + sk_eat_skb(sk, skb);
> +#ifdef CONFIG_NET_DMA
> + else {
> + __skb_unlink(skb, &sk->sk_receive_queue);
> + __skb_queue_tail(&sk->sk_async_wait_queue, skb);
> + copied_early = 0;
> + }
> +#endif
> + }

etc.

> +#ifdef CONFIG_NET_DMA
> + if (copied_early)
> + __skb_queue_tail(&sk->sk_async_wait_queue, skb);
> + else
> +#endif
>   if (eaten)
>   __kfree_skb(skb);
>   else

etc.

> @@ -4049,6 +4067,52 @@ discard:
>   return 0;
>  }
>  
> +#ifdef CONFIG_NET_DMA
> +int dma_async_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
> +{
> + struct tcp_sock *tp = tcp_sk(sk);
> + int chunk = skb->len - hlen;
> + int dma_cookie;
> + int copied_early = 0;
> +
> + if (tp->ucopy.wakeup)
> + goto out;

In this case a simple

return 0;

would be fine.  We haven't done anything yet.

> +#ifdef CONFIG_NET_DMA
> + struct tcp_sock *tp = tcp_sk(sk);
> + if (!tp->ucopy.dma_chan && tp->ucopy.locked_list)
> + tp->ucopy.dma_chan = get_s

Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT

2006-03-05 Thread Andrew Morton
Chris Leech <[EMAIL PROTECTED]> wrote:
>
> +#ifdef CONFIG_NET_DMA
>  +tp->ucopy.dma_chan = NULL;
>  +if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && 
> !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma))
>  +dma_lock_iovec_pages(msg->msg_iov, len, &tp->ucopy.locked_list);
>  +#endif

The __get_cpu_var() here will run smp_processor_id() from preemptible
context.  You'll get a big warning if the correct debug options are set.

The reason for this is that preemption could cause this code to hop between
CPUs.

Please always test code with all debug options enabled and with full kernel
preemption.

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT

2006-03-05 Thread David S. Miller
From: Andrew Morton <[EMAIL PROTECTED]>
Date: Sun, 5 Mar 2006 00:45:34 -0800

> The __get_cpu_var() here will run smp_processor_id() from preemptible
> context.  You'll get a big warning if the correct debug options are set.
> 
> The reason for this is that preemption could cause this code to hop between
> CPUs.
> 
> Please always test code with all debug options enabled and with full kernel
> preemption.

To be fair that warning doesn't trigger on some platforms, such as
sparc64 where the __get_cpu_var() implementation simply takes the
value from a fixed cpu register and doesn't do the debugging check.

Sparc64 should add the check when debugging options are enabled, for
sure, but the point is that it may not entirely be the tester's fault.
:-)
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT

2006-03-05 Thread Pavel Machek
Hi!

> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -262,6 +262,9 @@
>  #include 
>  #include 
>  #include 
> +#ifdef CONFIG_NET_DMA
> +#include 
> +#endif
>  

Remove the ifdefs, move them inside .h if needed.

> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 7625eaf..9b6290d 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -71,6 +71,9 @@
>  #include 
>  #include 
>  #include 
> +#ifdef CONFIG_NET_DMA
> +#include 
> +#endif

Here, too.

> +#ifdef CONFIG_NET_DMA
> + if (copied_early)
> + __skb_queue_tail(&sk->sk_async_wait_queue, skb);
> + else
> +#endif
>   if (eaten)
>   __kfree_skb(skb);
>   else

Could you #define copied_early to 0 and avoid ifdefs?

> @@ -1091,8 +1094,18 @@ process:
>   bh_lock_sock(sk);
>   ret = 0;
>   if (!sock_owned_by_user(sk)) {
> - if (!tcp_prequeue(sk, skb))
> +#ifdef CONFIG_NET_DMA
> + struct tcp_sock *tp = tcp_sk(sk);
> + if (!tp->ucopy.dma_chan && tp->ucopy.locked_list)
> + tp->ucopy.dma_chan = get_softnet_dma();
> + if (tp->ucopy.dma_chan)
> + ret = tcp_v4_do_rcv(sk, skb);
> + else
> +#endif
> + {
> + if (!tcp_prequeue(sk, skb))
>   ret = tcp_v4_do_rcv(sk, skb);
> + }
>   } else

Wrong indentation...
Pavel
-- 
Thanks, Sharp!
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT

2006-03-06 Thread Chris Leech
> #ifdef is not needed here (try not to put #ifdef in .c files.)  I think
> a few of your other usages of #ifdef in this file can also be removed
> with judicious use of inline functions in a .h file.

ACK on all the ifdef comments.  I may have gone a little ifdef crazy
making sure I could get to a zero impact state with these patches
applied but CONFIG_NET_DMA turned off.  I'll get these cleaned up.

- Chris
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT

2006-03-06 Thread Chris Leech
On 3/5/06, Andrew Morton <[EMAIL PROTECTED]> wrote:
> Chris Leech <[EMAIL PROTECTED]> wrote:
> >
> > +#ifdef CONFIG_NET_DMA
> >  +tp->ucopy.dma_chan = NULL;
> >  +if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && 
> > !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma))
> >  +dma_lock_iovec_pages(msg->msg_iov, len, 
> > &tp->ucopy.locked_list);
> >  +#endif
>
> The __get_cpu_var() here will run smp_processor_id() from preemptible
> context.  You'll get a big warning if the correct debug options are set.
>
> The reason for this is that preemption could cause this code to hop between
> CPUs.

I've been playing with different models of where to select which DMA
channel to use in order to reduce cache thrash and lock contention in
the driver.  It's not a clean per-cpu issue because per I/O there are
potentially operations happening in both the process syscall and the
netrx softirq context.

Right now the code delays selection of a DMA channel until the first
offload copy is ready to go, so the __get_cpu_var() you point out is
just checking to see if any hardware exists for I/OAT at this point
before doing the page pinning.  Before anything is done with the
channel the per-cpu pointer is re-read safely with preemption disabled
and a reference count is incremented.

 - Chris
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 8/8] [I/OAT] TCP recv offload to I/OAT

2006-03-11 Thread Andrew Morton
Chris Leech <[EMAIL PROTECTED]> wrote:
>
> Locks down user pages and sets up for DMA in tcp_recvmsg, then calls
>  dma_async_try_early_copy in tcp_v4_do_rcv

All those ifdefs are still there.  They really do put a maintenance burden
on, of all places, net/ipv4/tcp.c.  Please find a way of abstracting out
the conceptual additions which IOAT makes to TCP and to wrap that into
inline functions, etc.  See how tidily NF_HOOK() has inserted things into
the net stack.

Also, for something which is billed as an performance enhancement,
benchmark numbers are the key thing which we need to judge the desirability
of these changes.  What is the plan there?

Thanks.
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html