from:"Herbert Xu"

[NET]: Added GSO header verification

2006-06-27 Thread Herbert Xu

Hi Dave:

This feature is only needed by Xen but most of the code here is useful
for other things like TCPv4 ECN support.

[NET]: Added GSO header verification

When GSO packets come from an untrusted source (e.g., a Xen guest domain),
we need to verify the header integrity before passing it to the hardware.

Since the first step in GSO is to verify the header, we can reuse that
code by adding a new bit to gso_type: SKB_GSO_DODGY.  Packets with this
bit set can only be fed directly to devices with the corresponding bit
NETIF_F_GSO_ROBUST.  If the device doesn't have that bit, then the skb
is fed to the GSO engine which will allow the packet to be sent to the
hardware if it passes the header check.

This patch changes the sg flag to a full features flag.  The same method
can be used to implement TSO ECN support.  We simply have to mark packets
with CWR set with SKB_GSO_ECN so that only hardware with a corresponding
NETIF_F_TSO_ECN can accept them.  The GSO engine can either fully segment
the packet, or segment the first MTU and pass the rest to the hardware for
further segmentation.

Signed-off-by: Herbert Xu [EMAIL PROTECTED]

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu 许志壬 [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -315,6 +315,7 @@ struct net_device
 #define NETIF_F_GSO_SHIFT  16
 #define NETIF_F_TSO(SKB_GSO_TCPV4  NETIF_F_GSO_SHIFT)
 #define NETIF_F_UFO(SKB_GSO_UDPV4  NETIF_F_GSO_SHIFT)
+#define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY  NETIF_F_GSO_SHIFT)
 
 #define NETIF_F_GEN_CSUM   (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
 #define NETIF_F_ALL_CSUM   (NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM)
@@ -543,7 +544,8 @@ struct packet_type {
 struct net_device *,
 struct packet_type *,
 struct net_device *);
-   struct sk_buff  *(*gso_segment)(struct sk_buff *skb, int sg);
+   struct sk_buff  *(*gso_segment)(struct sk_buff *skb,
+   int features);
void*af_packet_priv;
struct list_headlist;
 };
@@ -968,7 +970,7 @@ extern int  netdev_max_backlog;
 extern int weight_p;
 extern int netdev_set_master(struct net_device *dev, struct 
net_device *master);
 extern int skb_checksum_help(struct sk_buff *skb, int inward);
-extern struct sk_buff *skb_gso_segment(struct sk_buff *skb, int sg);
+extern struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features);
 #ifdef CONFIG_BUG
 extern void netdev_rx_csum_fault(struct net_device *dev);
 #else
@@ -988,11 +990,16 @@ extern void dev_seq_stop(struct seq_file
 
 extern void linkwatch_run_queue(void);
 
+static inline int skb_gso_ok(struct sk_buff *skb, int features)
+{
+   int feature = skb_shinfo(skb)-gso_size ?
+ skb_shinfo(skb)-gso_type  NETIF_F_GSO_SHIFT : 0;
+   return (features  feature) != feature;
+}
+
 static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb)
 {
-   int feature = skb_shinfo(skb)-gso_type  NETIF_F_GSO_SHIFT;
-   return skb_shinfo(skb)-gso_size 
-  (dev-features  feature) != feature;
+   return skb_gso_ok(skb, dev-features);
 }
 
 #endif /* __KERNEL__ */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -172,6 +172,9 @@ enum {
 enum {
SKB_GSO_TCPV4 = 1  0,
SKB_GSO_UDPV4 = 1  1,
+
+   /* This indicates the skb is from an untrusted source. */
+   SKB_GSO_DODGY = 1  2,
 };
 
 /** 
@@ -1299,7 +1302,7 @@ extern void  skb_split(struct sk_b
 struct sk_buff *skb1, const u32 len);
 
 extern void   skb_release_data(struct sk_buff *skb);
-extern struct sk_buff *skb_segment(struct sk_buff *skb, int sg);
+extern struct sk_buff *skb_segment(struct sk_buff *skb, int features);
 
 static inline void *skb_header_pointer(const struct sk_buff *skb, int offset,
   int len, void *buffer)
diff --git a/include/net/protocol.h b/include/net/protocol.h
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -36,7 +36,8 @@
 struct net_protocol {
int (*handler)(struct sk_buff *skb);
void(*err_handler)(struct sk_buff *skb, u32 info);
-   struct sk_buff *(*gso_segment)(struct sk_buff *skb, int sg);
+   struct sk_buff *(*gso_segment)(struct sk_buff *skb,
+  int features);
int no_policy;
 };
 
diff --git a/include/net/tcp.h b/include/net/tcp.h

Re: [PATCH REPOST 0/2][RFC] Network Event Notifier Mechanism

2006-06-27 Thread Herbert Xu

Steve Wise [EMAIL PROTECTED] wrote:
 
 This patch implements a mechanism that allows interested clients to
 register for notification of certain network events. The intended use
 is to allow RDMA devices (linux/drivers/infiniband) to be notified of
 neighbour updates, ICMP redirects, path MTU changes, and route changes.
 
 The reason these devices need update events is because they typically
 cache this information in hardware and need to be notified when this
 information has been updated.
 
 The key events of interest are:
 
 - neighbour mac address change 
 - routing redirect (the next hop neighbour changes for a dst_entry)
 - path mtu change (the path mtu for a dst_entry changes).
 - route add/deletes

I'd like to know more about what the RDMA device is going to do with this
information.  I thought RDMA was for receiving packets? Most of the info
here pertains to transmission.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [NET]: Added GSO header verification

2006-06-27 Thread Herbert Xu

On Tue, Jun 27, 2006 at 01:46:35PM -0700, Michael Chan wrote:
 On Tue, 2006-06-27 at 22:07 +1000, Herbert Xu wrote:
 
  [NET]: Added GSO header verification
 
  @@ -2166,10 +2166,14 @@ struct sk_buff *tcp_tso_segment(struct s
  if (!pskb_may_pull(skb, thlen))
  goto out;
   
  +   segs = NULL;
  +   if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST))
  +   goto out;
  +
 
 This logic doesn't look right to me.  Perhaps it's backwards and should
 be:
 
 if (!skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST))

Oops, you're absolutely right.  Here is the fix.

[NET]: Fix logical error in skb_gso_ok

The test in skb_gso_ok is backwards.  Noticed by Michael Chan
[EMAIL PROTECTED].

Signed-off-by: Herbert Xu [EMAIL PROTECTED]

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 84b0f0d..efd1e2a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -994,12 +994,12 @@ static inline int skb_gso_ok(struct sk_b
 {
int feature = skb_shinfo(skb)-gso_size ?
  skb_shinfo(skb)-gso_type  NETIF_F_GSO_SHIFT : 0;
-   return (features  feature) != feature;
+   return (features  feature) == feature;
 }
 
 static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb)
 {
-   return skb_gso_ok(skb, dev-features);
+   return !skb_gso_ok(skb, dev-features);
 }
 
 #endif /* __KERNEL__ */
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH REPOST 0/2][RFC] Network Event Notifier Mechanism

2006-06-27 Thread Herbert Xu

On Tue, Jun 27, 2006 at 09:31:57AM -0500, Steve Wise wrote:
 
  I'd like to know more about what the RDMA device is going to do with this
  information.  I thought RDMA was for receiving packets? Most of the info
  here pertains to transmission.
 
 RDMA Ethernet devices adhere to a set of protocols defined by the IETF.
 See the RDDP WG (http://www.ietf.org/html.charters/rddp-charter.html)
 for the Internet Drafts that define the protocols.

Would it be possible for you to give us a quick summary of the relevant
points?

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH Round 3 0/2][RFC] Network Event Notifier Mechanism

2006-06-27 Thread Herbert Xu

Steve Wise [EMAIL PROTECTED] wrote:
 
 The reason these devices need update events is because they typically
 cache this information in hardware and need to be notified when this
 information has been updated.  For information on RDMA protocols, see:
 http://www.ietf.org/html.charters/rddp-charter.html.

Please give more specific reasons for needing these events because it
is certainly far from obvious from reading those documents.

Without reasons these invasive changes may turn out to be completely
inappropriate.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH Round 3 0/2][RFC] Network Event Notifier Mechanism

2006-06-27 Thread Herbert Xu

On Wed, Jun 28, 2006 at 12:54:10PM +1000, Herbert Xu wrote:
 
 Please give more specific reasons for needing these events because it
 is certainly far from obvious from reading those documents.

Never mind, I've found your earlier messages on the list which explain
your reasons more clearly.  It would be nice if you could include those
explanations in your patch description.

BTW, does this mean that we're now comfortable with full TOE?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH]NET: Add ECN support for TSO

2006-06-27 Thread Herbert Xu

On Tue, Jun 27, 2006 at 08:06:47PM -0700, Michael Chan wrote:

 diff --git a/include/net/sock.h b/include/net/sock.h
 index 2d8d6ad..2c75172 100644
 --- a/include/net/sock.h
 +++ b/include/net/sock.h
 @@ -1033,7 +1033,8 @@ static inline void sk_setup_caps(struct 
   if (sk-sk_route_caps  NETIF_F_GSO)
   sk-sk_route_caps |= NETIF_F_TSO;
   if (sk-sk_route_caps  NETIF_F_TSO) {
 - if (sock_flag(sk, SOCK_NO_LARGESEND) || dst-header_len)
 + if ((sock_flag(sk, SOCK_NO_LARGESEND) 
 + !tso_ecn_capable(sk-sk_route_caps)) || dst-header_len)
   sk-sk_route_caps = ~NETIF_F_TSO;

Why turn it off? With GSO in place the stack will handle it just fine
(even your description says so :)  We should instead remove all code
that turns off TSO/ECN when the other is present.

Otherwise the patch looks good.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH Round 3 0/2][RFC] Network Event Notifier Mechanism

2006-06-27 Thread Herbert Xu

On Tue, Jun 27, 2006 at 11:24:25PM -0400, Jeff Garzik wrote:

 I don't see how that position has changed?
 
 http://linux-net.osdl.org/index.php/TOE

Well I must say that RDMA over TCP smells very much like TOE.  They've
got an ARP table, a routing table, and presumably a TCP stack.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH]NET: Add ECN support for TSO

2006-06-27 Thread Herbert Xu

On Tue, Jun 27, 2006 at 08:40:34PM -0700, Michael Chan wrote:

 We need to turn off NETIF_F_TSO for a connection that has negotiated to
 turn on ECN if the output device cannot handle TSO and ECN.  In other
 words, if the output device does not have either GSO or TSO_ECN feature
 set.

I think you're mixing up GSO the mechanism with GSO the flag.  The GSO
flag simply tells the TCP stack whether TSO should be used or not, even
if the hardware does not support TSO at all.  The GSO mechanism on the
other hand is ALWAYS present.  So regardless of the presence of the GSO
flag, you can always rely on the GSO mechanism to pick up the pieces (or
rather generate the pieces as the case may be :)

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: TOE, etc. (was Re: [PATCH Round 3 0/2][RFC] Network Event Notifier Mechanism)

2006-06-27 Thread Herbert Xu

On Wed, Jun 28, 2006 at 12:18:25AM -0400, Jeff Garzik wrote:
 
 A PCI device that presents itself as a SCSI controller, but under the 
 hood is really iSCSI-over-TCP smells like TOE.  Running a virtualized 
 Linux guest on top of a proprietary stack [which provides networking 
 services to guests] also smells like TOE.  :)

Agreed.  However, when they start adding hooks to the ARP table, the
routing table, and PMTU management, it begs the question what more is
there to add for TOE (well, user-space driven TOE at least)?
 
 Unfortunately I don't have more details, so you just get a generalized 
 rant :)

OK, the patch under discussion here adds hooks to all the stuff in the
previous paragraph for the purpose of RDMA over TCP (well I must say
that the exact RDMA application/hardware has never been clearly given
but this is what I can gather from the previous posts).

Put it another way, I think the dividing line between TOE and iSCSI or
virtualisation is exactly the interface between them and the Linux kernel.
If the interface is an existing one such as SCSI or standard IP then it's
OK.  However, when it starts poking in the guts of the Linux stack I'd say
that it has crossed the line.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH]NET: Add ECN support for TSO

2006-06-27 Thread Herbert Xu

On Tue, Jun 27, 2006 at 09:37:01PM -0700, Michael Chan wrote:
 
 Signed-off-by: Michael Chan [EMAIL PROTECTED]

Looks good to me too!

 @@ -56,6 +55,9 @@ static inline void TCP_ECN_send(struct s
   if (tp-ecn_flagsTCP_ECN_QUEUE_CWR) {
   tp-ecn_flags = ~TCP_ECN_QUEUE_CWR;
   skb-h.th-cwr = 1;
 + if (skb_shinfo(skb)-gso_type  SKB_GSO_TCPV4)
 + skb_shinfo(skb)-gso_type |=
 + SKB_GSO_TCPV4_ECN;

As a byte-pincher I must suggest that you turn this check into something
like 

if (skb_shinfo(skb)-gso_type)

or even

if (skb_shinfo(skb)-gso_size)

:)

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH]NET: Add ECN support for TSO

2006-06-27 Thread Herbert Xu

On Tue, Jun 27, 2006 at 09:54:39PM -0700, Michael Chan wrote:

 Assuming that we'll later have GSO_TCPV6, isn't it better to check for
 TCPV4 explicitly now?  Or just change it later when necessary.

Good point, I suppose you never know whether a V6 TSO-capable card is going
to handle ECN correctly in both cases.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: TOE, etc.

2006-06-27 Thread Herbert Xu

On Tue, Jun 27, 2006 at 09:43:23PM -0700, David Miller wrote:
 
 Socket state, and that is one thing I don't see them doing yet.

I wonder what happens when the Linux TCP stack attempts to open a
connection to a remote host when that connection is already open
in the RDMA NIC?  For that matter what happens if a Linux application
decides to listen on a TCP port already listened on by the RDMA
NIC?

The only saving grace is that they're only doing RDMA rather than
arbitrary TCP.  However, exactly the same infrastructure can be used
to do arbitrary TCP should they wish to.
 
 But we have to realize they've already been given 95% of the
 interfaces they need to speak IP using our routes and our neighbour
 entries.
 
 Right?

Yes, however I think the same argument could be applied to TOE.

With their RDMA NIC, we'll have TCP/SCTP connections that bypass
netfilter, tc, IPsec, AF_PACKET/tcpdump and the rest of our stack
while at the same time it is using the same IP address as us and
deciding what packets we will or won't see.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 2.6.17] support for TSO over IPv6

2006-06-29 Thread Herbert Xu

Ananda Raju [EMAIL PROTECTED] wrote:
This patch enables TSO over IPv6. Currently the Linux network stack
restricts TSO over IPv6 by clearing the NETIF_F_TSO bit from
dev->features. This patch removes this restriction.

Thanks, looks good over all.
 
SKB_GSO_TCPV4 renamed to SKB_GSO_TCP to make it generic GSO packet.
SKB_GSO_UDPV4 renamed to SKB_GSO_UDP as UFO is not a IPv4 feature.
UFO is supported over IPv6 also

This bit is wrong though.  TCPv4 and TCPv6 packets can't share the same
GSO feature bit.  The reason is that GSO/TSO is no longer just activated
by dev->features.  Bridges can forward GSO/TSO packets through at any time.
This is why it is crucial that each packet specifies exactly the features
that it requires from the hardware.

For UFO, it's OK to have just SKB_GSO_UDP since IIRC your card is the only
one that supports it and it supports both protocols anyway.

 diff -upNr netdev.org/include/linux/skbuff.h 
 netdev.ipv6_tso/include/linux/skbuff.h
 --- netdev.org/include/linux/skbuff.h   2006-06-27 07:30:36.0 -0700
 +++ netdev.ipv6_tso/include/linux/skbuff.h  2006-06-27 07:38:48.0 
 -0700
 @@ -170,8 +170,9 @@ enum {
 };
 
 enum {
 -   SKB_GSO_TCPV4 = 1  0,
 -   SKB_GSO_UDPV4 = 1  1,
 +   SKB_GSO_TCP = 1  0,
 +   SKB_GSO_UDP = 1  1,
 +   SKB_GSO_TCPV6 = 1  2,
 };

BTW, you should rediff against Dave's current tree which has a few
extra bits there.

You should also leave TCPV4 as is and just add the TCPV6 bit.
 
 diff -upNr netdev.org/net/ipv4/tcp_output.c 
 netdev.ipv6_tso/net/ipv4/tcp_output.c
 --- netdev.org/net/ipv4/tcp_output.c2006-06-27 07:30:36.0 -0700
 +++ netdev.ipv6_tso/net/ipv4/tcp_output.c   2006-06-27 07:38:48.0 
 -0700
 @@ -525,7 +525,7 @@ static void tcp_set_skb_tso_segs(struct 
factor /= mss_now;
skb_shinfo(skb)-gso_segs = factor;
skb_shinfo(skb)-gso_size = mss_now;
 -   skb_shinfo(skb)-gso_type = SKB_GSO_TCPV4;
 +   skb_shinfo(skb)-gso_type = SKB_GSO_TCP;

You need to set SKB_GSO_TCPV6 for IPv6 packets here.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 2.6.17] support for TSO over IPv6

2006-06-29 Thread Herbert Xu

On Fri, Jun 30, 2006 at 09:32:44AM +1000, Herbert Xu wrote:
 
  diff -upNr netdev.org/include/linux/skbuff.h 
  netdev.ipv6_tso/include/linux/skbuff.h
  --- netdev.org/include/linux/skbuff.h   2006-06-27 07:30:36.0 -0700
  +++ netdev.ipv6_tso/include/linux/skbuff.h  2006-06-27 
  07:38:48.0 -0700
  @@ -170,8 +170,9 @@ enum {
  };
  
  enum {
  -   SKB_GSO_TCPV4 = 1  0,
  -   SKB_GSO_UDPV4 = 1  1,
  +   SKB_GSO_TCP = 1  0,
  +   SKB_GSO_UDP = 1  1,
  +   SKB_GSO_TCPV6 = 1  2,
  };
 
 BTW, you should rediff against Dave's current tree which has a few
 extra bits there.
 
 You should also leave TCPV4 as is and just add the TCPV6 bit.

BTW, does your card handle ECN correctly? If not then we should change
the new ECN bit to apply to both TCPv4 and TCPv6 since

1) We now have a piece of hardware that handles TSO6 and it doesn't do ECN.
2) It's quite likely that if the NIC can handle ECN in TCPv4 then it can do
   it in TCPv6.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[TCP]: Reset gso_segs if packet is dodgy

2006-06-29 Thread Herbert Xu

Hi Dave:

I forgot to verify gso_segs on packets from untrusted sources.  In fact
looking around it seems that gso_segs is used by exactly one driver outside
of the TCP stack.  In fact it also happens to be a virtual driver: s390/qeth.

Since the only other GSO user we have at the moment -- UFO -- doesn't even set
gso_segs, I'd like to move it to skb->cb and get rid of this.  However, for
now let's simply reset it in tcp_tso_segment.

[TCP]: Reset gso_segs if packet is dodgy

I wasn't paranoid enough in verifying GSO information.  A bogus gso_segs
could upset drivers as much as a bogus header would.  Let's reset it in
the per-protocol gso_segment functions.

I didn't verify gso_size because that can be verified by the source of
the dodgy packets.

Signed-off-by: Herbert Xu [EMAIL PROTECTED]

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0336422..0bb0ac9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2166,13 +2166,19 @@ struct sk_buff *tcp_tso_segment(struct s
if (!pskb_may_pull(skb, thlen))
goto out;
 
-   segs = NULL;
-   if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST))
-   goto out;
-
oldlen = (u16)~skb-len;
__skb_pull(skb, thlen);
 
+   if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
+   /* Packet is from an untrusted source, reset gso_segs. */
+   int mss = skb_shinfo(skb)-gso_size;
+
+   skb_shinfo(skb)-gso_segs = (skb-len + mss - 1) / mss;
+
+   segs = NULL;
+   goto out;
+   }
+
segs = skb_segment(skb, features);
if (IS_ERR(segs))
goto out;
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 2.6.17] support for TSO over IPv6

2006-06-29 Thread Herbert Xu

On Thu, Jun 29, 2006 at 07:06:51PM -0700, Michael Chan wrote:

 Don't we have a bigger problem if it doesn't support ECN with ipv6 TSO?
 We either have to disable ECN when TSO is enabled like we used to for
 ipv4, or provide the gso tcp segmentation for ipv6.  Right?

Good point.  In that case we should also add GSO for IPv6.  It shouldn't
be too bad because all we need to do is verify the integrity of the
extension headers in terms of their length, not their content.

I wonder if this chip handles all the extension headers or not.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: 2.6.17-mm3 -- BUG: illegal lock usage -- illegal {softirq-on-W} - {in-softirq-R} usage.

2006-06-29 Thread Herbert Xu

Andrew Morton [EMAIL PROTECTED] wrote:

  inet_bind()
  -sk_dst_get
-read_lock(sk-sk_dst_lock)

We are still holding the sock lock when doing sk_dst_get.

   1 lock held by java_vm/4418:
#0:  (af_family_keys + (sk)-sk_family#4){-+..}, at: [f93c9281]
   tcp_v6_rcv+0x308/0x7b7 [ipv6]
  
  softirq
  -ip6_dst_lookup
-sk_dst_check
  -sk_dst_reset
-write_lock(sk-sk_dst_lock);

The sock lock prevents this path from being entered.  Instead the
received TCP packet is queued and replayed when the sock lock is
released.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[1/4] [IPV6]: Remove redundant length check on input

2006-06-29 Thread Herbert Xu

Hi Dave:

I've added GSO for TCPv6 and updated Ananda's patch.  Please note that
the following patches have only been compile-tested.

[IPV6]: Remove redundant length check on input

We don't need to check skb->len when we're just about to call
pskb_may_pull since that checks it for us.

Signed-off-by: Herbert Xu [EMAIL PROTECTED]

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -84,14 +84,9 @@ int ipv6_rcv(struct sk_buff *skb, struct
 */
IP6CB(skb)-iif = skb-dst ? ((struct rt6_info 
*)skb-dst)-rt6i_idev-dev-ifindex : dev-ifindex;
 
-   if (skb-len  sizeof(struct ipv6hdr))
+   if (unlikely(!pskb_may_pull(skb, sizeof(*hdr
goto err;
 
-   if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) {
-   IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
-   goto drop;
-   }
-
hdr = skb-nh.ipv6h;
 
if (hdr-version != 6)
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[2/4] [IPV6]: Added GSO support for TCPv6

2006-06-29 Thread Herbert Xu

Hi:

[IPV6]: Added GSO support for TCPv6

This patch adds GSO support for IPv6 and TCPv6.

Signed-off-by: Herbert Xu [EMAIL PROTECTED]

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/include/net/protocol.h b/include/net/protocol.h
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -50,11 +50,17 @@ struct inet6_protocol 
   struct inet6_skb_parm *opt,
   int type, int code, int offset,
   __u32 info);
+
+   struct sk_buff *(*gso_segment)(struct sk_buff *skb,
+  int features);
+
unsigned intflags;  /* INET6_PROTO_xxx */
 };
 
 #define INET6_PROTO_NOPOLICY   0x1
 #define INET6_PROTO_FINAL  0x2
+/* This should be set for any extension header which is compatible with GSO. */
+#define INET6_PROTO_GSO_EXTHDR 0x4
 #endif
 
 /* This is used to register socket interfaces for IP protocols.  */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2215,6 +2215,7 @@ struct sk_buff *tcp_tso_segment(struct s
 out:
return segs;
 }
+EXPORT_SYMBOL(tcp_tso_segment);
 
 extern void __skb_cb_too_small_for_tcp(int, int);
 extern struct tcp_congestion_ops tcp_reno;
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -179,7 +179,7 @@ static int ipv6_destopt_rcv(struct sk_bu
 
 static struct inet6_protocol destopt_protocol = {
.handler=   ipv6_destopt_rcv,
-   .flags  =   INET6_PROTO_NOPOLICY,
+   .flags  =   INET6_PROTO_NOPOLICY | INET6_PROTO_GSO_EXTHDR,
 };
 
 void __init ipv6_destopt_init(void)
@@ -340,7 +340,7 @@ looped_back:
 
 static struct inet6_protocol rthdr_protocol = {
.handler=   ipv6_rthdr_rcv,
-   .flags  =   INET6_PROTO_NOPOLICY,
+   .flags  =   INET6_PROTO_NOPOLICY | INET6_PROTO_GSO_EXTHDR,
 };
 
 void __init ipv6_rthdr_init(void)
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -58,9 +58,71 @@
 
 DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics) __read_mostly;
 
+static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, int features)
+{
+   struct sk_buff *segs = ERR_PTR(-EINVAL);
+   struct ipv6hdr *ipv6h;
+   struct inet6_protocol *ops;
+   int proto;
+
+   if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h
+   goto out;
+
+   ipv6h = skb-nh.ipv6h;
+   proto = ipv6h-nexthdr;
+   __skb_pull(skb, sizeof(*ipv6h));
+
+   rcu_read_lock();
+   for (;;) {
+   struct ipv6_opt_hdr *opth;
+   int len;
+
+   if (proto != NEXTHDR_HOP) {
+   ops = rcu_dereference(inet6_protos[proto]);
+
+   if (unlikely(!ops))
+   goto unlock;
+
+   if (!(ops-flags  INET6_PROTO_GSO_EXTHDR))
+   break;
+   }
+
+   if (unlikely(!pskb_may_pull(skb, 8)))
+   goto unlock;
+
+   opth = (void *)skb-data;
+   len = opth-hdrlen * 8 + 8;
+
+   if (unlikely(!pskb_may_pull(skb, len)))
+   goto unlock;
+
+   proto = opth-nexthdr;
+   __skb_pull(skb, len);
+   }
+
+   skb-h.raw = skb-data;
+   if (likely(ops-gso_segment))
+   segs = ops-gso_segment(skb, features);
+
+unlock:
+   rcu_read_unlock();
+
+   if (unlikely(IS_ERR(segs)))
+   goto out;
+
+   for (skb = segs; skb; skb = skb-next) {
+   ipv6h = skb-nh.ipv6h;
+   ipv6h-payload_len = htons(skb-len - skb-mac_len);
+   }
+
+out:
+   return segs;
+}
+
 static struct packet_type ipv6_packet_type = {
.type = __constant_htons(ETH_P_IPV6), 
.func = ipv6_rcv,
+   .gso_segment = ipv6_gso_segment,
 };
 
 struct ip6_ra_chain *ip6_ra_chain;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1605,6 +1605,7 @@ struct proto tcpv6_prot = {
 static struct inet6_protocol tcpv6_protocol = {
.handler=   tcp_v6_rcv,
.err_handler=   tcp_v6_err,
+   .gso_segment=   tcp_tso_segment,
.flags  =   INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
 };
 
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[4/4] [IPV6]: Added GSO support for TCPv6

2006-06-29 Thread Herbert Xu

Hi:

[IPV6]: Added GSO support for TCPv6

This patch adds GSO support for IPv6 and TCPv6.  This is based on a patch
by Ananda Raju [EMAIL PROTECTED].  His original description is:

This patch enables TSO over IPv6. Currently Linux network stacks
restricts TSO over IPv6 by clearing of the NETIF_F_TSO bit from
dev->features. This patch will remove this restriction.

This patch will introduce a new flag NETIF_F_TSO6 which will be used
to check whether device supports TSO over IPv6. If device support TSO
over IPv6 then we don't clear of NETIF_F_TSO and which will make the
TCP layer to create TSO packets. Any device supporting TSO over IPv6
will set NETIF_F_TSO6 flag in dev->features along with NETIF_F_TSO.

In case when user disables TSO using ethtool, NETIF_F_TSO will get
cleared from dev->features. So even if we have NETIF_F_TSO6 we don't
get TSO packets created by TCP layer.

SKB_GSO_TCPV4 renamed to SKB_GSO_TCP to make it generic GSO packet.
SKB_GSO_UDPV4 renamed to SKB_GSO_UDP as UFO is not a IPv4 feature.
UFO is supported over IPv6 also

The following table shows there is significant improvement in
throughput with normal frames and CPU usage for both normal and jumbo.

--
|  | 1500|  9600 |
|  --|---|
|  | thru CPU|  thru CPU |
--
| TSO OFF  | 2.00   5.5% id  |  5.66   20.0% id  |
--
| TSO ON   | 2.63   78.0 id  |  5.67   39.0% id  |
--

Signed-off-by: Herbert Xu [EMAIL PROTECTED]

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/drivers/net/s2io.c b/drivers/net/s2io.c
--- a/drivers/net/s2io.c
+++ b/drivers/net/s2io.c
@@ -3960,7 +3960,7 @@ static int s2io_xmit(struct sk_buff *skb
txdp-Control_2 = 0;
 #ifdef NETIF_F_TSO
mss = skb_shinfo(skb)-gso_size;
-   if (skb_shinfo(skb)-gso_type == SKB_GSO_TCPV4) {
+   if (skb_shinfo(skb)-gso_type  (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)) {
txdp-Control_1 |= TXD_TCP_LSO_EN;
txdp-Control_1 |= TXD_TCP_LSO_MSS(mss);
}
@@ -3980,7 +3980,7 @@ static int s2io_xmit(struct sk_buff *skb
}
 
frg_len = skb-len - skb-data_len;
-   if (skb_shinfo(skb)-gso_type == SKB_GSO_UDPV4) {
+   if (skb_shinfo(skb)-gso_type == SKB_GSO_UDP) {
int ufo_size;
 
ufo_size = skb_shinfo(skb)-gso_size;
@@ -4009,7 +4009,7 @@ static int s2io_xmit(struct sk_buff *skb
txdp-Host_Control = (unsigned long) skb;
txdp-Control_1 |= TXD_BUFFER0_SIZE(frg_len);
 
-   if (skb_shinfo(skb)-gso_type == SKB_GSO_UDPV4)
+   if (skb_shinfo(skb)-gso_type == SKB_GSO_UDP)
txdp-Control_1 |= TXD_UFO_EN;
 
frg_cnt = skb_shinfo(skb)-nr_frags;
@@ -4024,12 +4024,12 @@ static int s2io_xmit(struct sk_buff *skb
(sp-pdev, frag-page, frag-page_offset,
 frag-size, PCI_DMA_TODEVICE);
txdp-Control_1 = TXD_BUFFER0_SIZE(frag-size);
-   if (skb_shinfo(skb)-gso_type == SKB_GSO_UDPV4)
+   if (skb_shinfo(skb)-gso_type == SKB_GSO_UDP)
txdp-Control_1 |= TXD_UFO_EN;
}
txdp-Control_1 |= TXD_GATHER_CODE_LAST;
 
-   if (skb_shinfo(skb)-gso_type == SKB_GSO_UDPV4)
+   if (skb_shinfo(skb)-gso_type == SKB_GSO_UDP)
frg_cnt++; /* as Txd0 was used for inband header */
 
tx_fifo = mac_control-tx_FIFO_start[queue];
@@ -4043,7 +4043,7 @@ static int s2io_xmit(struct sk_buff *skb
if (mss)
val64 |= TX_FIFO_SPECIAL_FUNC;
 #endif
-   if (skb_shinfo(skb)-gso_type == SKB_GSO_UDPV4)
+   if (skb_shinfo(skb)-gso_type == SKB_GSO_UDP)
val64 |= TX_FIFO_SPECIAL_FUNC;
writeq(val64, tx_fifo-List_Control);
 
@@ -7021,6 +7021,9 @@ s2io_init_nic(struct pci_dev *pdev, cons
 #ifdef NETIF_F_TSO
dev-features |= NETIF_F_TSO;
 #endif
+#ifdef NETIF_F_TSO6
+   dev-features |= NETIF_F_TSO6;
+#endif
if (sp-device_type  XFRAME_II_DEVICE) {
dev-features |= NETIF_F_UFO;
dev-features |= NETIF_F_HW_CSUM;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -315,9 +315,10 @@ struct net_device
 #define NETIF_F_GSO_SHIFT  16
 #define NETIF_F_GSO_MASK   0x
 #define NETIF_F_TSO(SKB_GSO_TCPV4  NETIF_F_GSO_SHIFT)
-#define NETIF_F_UFO

[3/4] [NET]: Generalise TSO-specific bits from skb_setup_caps

2006-06-29 Thread Herbert Xu

Hi:

[NET]: Generalise TSO-specific bits from skb_setup_caps

This patch generalises the TSO-specific bits from sk_setup_caps by adding
the sk_gso_type member to struct sock.  This makes sk_setup_caps generic
so that it can be used by TCPv6 or UFO.

The only catch is that whoever uses this must provide a GSO implementation
for their protocol which I think is a fair deal :) For now UFO continues to
live without a GSO implementation which is OK since it doesn't use the sock
caps field at the moment.

Signed-off-by: Herbert Xu [EMAIL PROTECTED]

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -313,6 +313,7 @@ struct net_device
 
/* Segmentation offload features */
 #define NETIF_F_GSO_SHIFT  16
+#define NETIF_F_GSO_MASK   0x
 #define NETIF_F_TSO(SKB_GSO_TCPV4  NETIF_F_GSO_SHIFT)
 #define NETIF_F_UFO(SKB_GSO_UDPV4  NETIF_F_GSO_SHIFT)
 #define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY  NETIF_F_GSO_SHIFT)
@@ -991,13 +992,18 @@ extern void dev_seq_stop(struct seq_file
 
 extern void linkwatch_run_queue(void);
 
-static inline int skb_gso_ok(struct sk_buff *skb, int features)
+static inline int net_gso_ok(int features, int gso_type)
 {
-   int feature = skb_shinfo(skb)-gso_size ?
- skb_shinfo(skb)-gso_type  NETIF_F_GSO_SHIFT : 0;
+   int feature = gso_type  NETIF_F_GSO_SHIFT;
return (features  feature) == feature;
 }
 
+static inline int skb_gso_ok(struct sk_buff *skb, int features)
+{
+   return net_gso_ok(features, skb_shinfo(skb)-gso_size ?
+   skb_shinfo(skb)-gso_type : 0);
+}
+
 static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb)
 {
return !skb_gso_ok(skb, dev-features);
diff --git a/include/net/sock.h b/include/net/sock.h
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -140,6 +140,7 @@ struct sock_common {
   *@sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, 
%SO_OOBINLINE settings
   *@sk_no_check: %SO_NO_CHECK setting, wether or not checkup packets
   *@sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
+  *@sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
   *@sk_lingertime: %SO_LINGER l_linger setting
   *@sk_backlog: always used with the per-socket spinlock held
   *@sk_callback_lock: used with the callbacks in the end of this struct
@@ -211,6 +212,7 @@ struct sock {
gfp_t   sk_allocation;
int sk_sndbuf;
int sk_route_caps;
+   int sk_gso_type;
int sk_rcvlowat;
unsigned long   sk_flags;
unsigned long   sk_lingertime;
@@ -1025,15 +1027,20 @@ extern struct dst_entry *__sk_dst_check(
 
 extern struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie);
 
+static inline int sk_can_gso(const struct sock *sk)
+{
+   return net_gso_ok(sk-sk_route_caps, sk-sk_gso_type);
+}
+
 static inline void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 {
__sk_dst_set(sk, dst);
sk-sk_route_caps = dst-dev-features;
if (sk-sk_route_caps  NETIF_F_GSO)
-   sk-sk_route_caps |= NETIF_F_TSO;
-   if (sk-sk_route_caps  NETIF_F_TSO) {
+   sk-sk_route_caps |= NETIF_F_GSO_MASK;
+   if (sk_can_gso(sk)) {
if (dst-header_len)
-   sk-sk_route_caps = ~NETIF_F_TSO;
+   sk-sk_route_caps = ~NETIF_F_GSO_MASK;
else 
sk-sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
}
diff --git a/include/net/tcp.h b/include/net/tcp.h
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -751,7 +751,7 @@ static inline int tcp_is_cwnd_limited(co
if (in_flight = tp-snd_cwnd)
return 1;
 
-   if (!(sk-sk_route_caps  NETIF_F_TSO))
+   if (!sk_can_gso(sk))
return 0;
 
left = tp-snd_cwnd - in_flight;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -643,7 +643,7 @@ static inline int select_size(struct soc
int tmp = tp-mss_cache;
 
if (sk-sk_route_caps  NETIF_F_SG) {
-   if (sk-sk_route_caps  NETIF_F_TSO)
+   if (sk_can_gso(sk))
tmp = 0;
else {
int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -242,6 +242,7 @@ int tcp_v4_connect(struct sock *sk, stru
goto failure;
 
/* OK, now commit destination to socket.  */
+   sk-sk_gso_type

[5/4] [NET]: Verify gso_type too in gso_segment

2006-06-30 Thread Herbert Xu

Hi:

[NET]: Verify gso_type too in gso_segment

We don't want nasty Xen guests to pass a TCPv6 packet in with gso_type set
to TCPv4 or even UDP (or a packet that's both TCP and UDP).

Signed-off-by: Herbert Xu [EMAIL PROTECTED]

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8d15715..318d467 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1106,7 +1106,15 @@ static struct sk_buff *inet_gso_segment(
int ihl;
int id;
 
-   if (!pskb_may_pull(skb, sizeof(*iph)))
+   if (unlikely(skb_shinfo(skb)-gso_type 
+~(SKB_GSO_TCPV4 |
+  SKB_GSO_UDP |
+  SKB_GSO_DODGY |
+  SKB_GSO_TCP_ECN |
+  0)))
+   goto out;
+
+   if (unlikely(!pskb_may_pull(skb, sizeof(*iph
goto out;
 
iph = skb-nh.iph;
@@ -1114,7 +1122,7 @@ static struct sk_buff *inet_gso_segment(
if (ihl  sizeof(*iph))
goto out;
 
-   if (!pskb_may_pull(skb, ihl))
+   if (unlikely(!pskb_may_pull(skb, ihl)))
goto out;
 
skb-h.raw = __skb_pull(skb, ihl);
@@ -1125,7 +1133,7 @@ static struct sk_buff *inet_gso_segment(
 
rcu_read_lock();
ops = rcu_dereference(inet_protos[proto]);
-   if (ops  ops-gso_segment)
+   if (likely(ops  ops-gso_segment))
segs = ops-gso_segment(skb, features);
rcu_read_unlock();
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 59e30ba..2f81374 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2155,6 +2155,14 @@ struct sk_buff *tcp_tso_segment(struct s
unsigned int oldlen;
unsigned int len;
 
+   if (unlikely(skb_shinfo(skb)-gso_type 
+~(SKB_GSO_TCPV4 |
+  SKB_GSO_DODGY |
+  SKB_GSO_TCP_ECN |
+  SKB_GSO_TCPV6 |
+  0)))
+   goto out;
+
if (!pskb_may_pull(skb, sizeof(*th)))
goto out;
 
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 25f8bf8..03b65aa 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -65,6 +65,14 @@ static struct sk_buff *ipv6_gso_segment(
struct inet6_protocol *ops;
int proto;
 
+   if (unlikely(skb_shinfo(skb)-gso_type 
+~(SKB_GSO_UDP |
+  SKB_GSO_DODGY |
+  SKB_GSO_TCP_ECN |
+  SKB_GSO_TCPV6 |
+  0)))
+   goto out;
+
if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h
goto out;
 
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [5/4] [NET]: Verify gso_type too in gso_segment

2006-06-30 Thread Herbert Xu

On Fri, Jun 30, 2006 at 04:13:50PM +1000, herbert wrote:
 
 [NET]: Verify gso_type too in gso_segment

Here is a better version that ensures at least one of TCPV4 and TCPV6
is set in tcp_tso_segment.

[NET]: Verify gso_type too in gso_segment

We don't want nasty Xen guests to pass a TCPv6 packet in with gso_type set
to TCPv4 or even UDP (or a packet that's both TCP and UDP).

Signed-off-by: Herbert Xu [EMAIL PROTECTED]

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
8cbe620474c9ba82ab868e1b2e887ff91808470f
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8d15715..318d467 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1106,7 +1106,15 @@ static struct sk_buff *inet_gso_segment(
int ihl;
int id;
 
-   if (!pskb_may_pull(skb, sizeof(*iph)))
+   if (unlikely(skb_shinfo(skb)-gso_type 
+~(SKB_GSO_TCPV4 |
+  SKB_GSO_UDP |
+  SKB_GSO_DODGY |
+  SKB_GSO_TCP_ECN |
+  0)))
+   goto out;
+
+   if (unlikely(!pskb_may_pull(skb, sizeof(*iph
goto out;
 
iph = skb-nh.iph;
@@ -1114,7 +1122,7 @@ static struct sk_buff *inet_gso_segment(
if (ihl  sizeof(*iph))
goto out;
 
-   if (!pskb_may_pull(skb, ihl))
+   if (unlikely(!pskb_may_pull(skb, ihl)))
goto out;
 
skb-h.raw = __skb_pull(skb, ihl);
@@ -1125,7 +1133,7 @@ static struct sk_buff *inet_gso_segment(
 
rcu_read_lock();
ops = rcu_dereference(inet_protos[proto]);
-   if (ops  ops-gso_segment)
+   if (likely(ops  ops-gso_segment))
segs = ops-gso_segment(skb, features);
rcu_read_unlock();
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 59e30ba..9b5c54e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2171,8 +2171,19 @@ struct sk_buff *tcp_tso_segment(struct s
 
if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
/* Packet is from an untrusted source, reset gso_segs. */
-   int mss = skb_shinfo(skb)-gso_size;
+   int type = skb_shinfo(skb)-gso_type;
+   int mss;
+
+   if (unlikely(type 
+~(SKB_GSO_TCPV4 |
+  SKB_GSO_DODGY |
+  SKB_GSO_TCP_ECN |
+  SKB_GSO_TCPV6 |
+  0) ||
+!(type  (SKB_GSO_TCPV4 | SKB_GSO_TCPV6
+   goto out;
 
+   mss = skb_shinfo(skb)-gso_size;
skb_shinfo(skb)-gso_segs = (skb-len + mss - 1) / mss;
 
segs = NULL;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 25f8bf8..03b65aa 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -65,6 +65,14 @@ static struct sk_buff *ipv6_gso_segment(
struct inet6_protocol *ops;
int proto;
 
+   if (unlikely(skb_shinfo(skb)-gso_type 
+~(SKB_GSO_UDP |
+  SKB_GSO_DODGY |
+  SKB_GSO_TCP_ECN |
+  SKB_GSO_TCPV6 |
+  0)))
+   goto out;
+
if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h
goto out;
 
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: jumbo frames and memory fragmentation

2006-06-30 Thread Herbert Xu

Chris Friesen [EMAIL PROTECTED] wrote:
 
 Anyone have any suggestions on how to improve this?  Upgrading kernels 
 isn't an option.  I could port back the copybreak stuff fairly easily.

Either upgrade your kernel or backport the page-splitting code in the
current tree.  That's really the only sane solution for jumbo packets.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [1/4] [IPV6]: Remove redundant length check on input

2006-06-30 Thread Herbert Xu

On Fri, Jun 30, 2006 at 07:44:49PM -0400, Ananda Raju wrote:
 
 I tested the patch, and TSO over ipv6 is working fine. But TSO disable
 not working for IPv6. 
 
 I tried the from tree /pub/scm/linux/kernel/git/davem/net-2.6

I think we need some new ethtool helper functions that sets/clears both
TSO/TSO6.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 2.6.17] support for TSO over IPv6

2006-06-30 Thread Herbert Xu

Hi Leonid:

On Fri, Jun 30, 2006 at 04:46:56PM -0400, Leonid Grossman wrote:
 
 If ECE == 1, then set it to one for all datagrams.
 If CWR == 1, then set it to one for the first datagram, and set it to
 zero for the rest?

Exactly.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] NET: Fix ipv6 GSO payload length

2006-06-30 Thread Herbert Xu

On Fri, Jun 30, 2006 at 03:56:47PM -0700, Michael Chan wrote:
 Fix ipv6 GSO payload length calculation.
 
 The ipv6 payload length excludes the ipv6 base header length and so
 must be subtracted.
 
 Signed-off-by: Michael Chan [EMAIL PROTECTED]

Looks good to me.  Thanks for catching this!
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [1/4] [IPV6]: Remove redundant length check on input

2006-07-03 Thread Herbert Xu

On Mon, Jul 03, 2006 at 07:44:06PM -0700, David Miller wrote:

  I think we need some new ethtool helper functions that sets/clears both
  TSO/TSO6.
 
 Do you really want to semantically separate TSO and TSO6?
 
 I would think that real users who want to disable TSO, wish
 to do so unilaterally.

That's what I meant.  I meant ethtool helper functions that
clear and set both TSO/TSO6 flags at the same time.

Alternatively, we can add a new features bit which gives the inherent
features of a device.  That can then be used to derive the actual
features in use.  That way we won't need to invent a new ethtool
helper function for this.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [1/4] [IPV6]: Remove redundant length check on input

2006-07-03 Thread Herbert Xu

On Tue, Jul 04, 2006 at 12:45:27PM +1000, herbert wrote:
 
 That's what I meant.  I meant ethtool helper functions that
 clear and set both TSO/TSO6 flags at the same time.

I think I was a bit ambiguous here.  To expand on my argument, what
I'm saying is that we can't just change the existing ethtool helper
functions to set TSO6 since that'd break NICs which do not support
TSO6.

Instead of adding an ad-hoc ethtool function in the neterion driver,
we should either add a new ethtool function which sets both TSO/TSO6,
or do the following.

 Alternatively, we can add a new features bit which gives the inherent
 features of a device.  That can then be used to derive the actual
 features in use.  That way we won't need to invent a new ethtool
 helper function for this.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: RDMA will be reverted

2006-07-06 Thread Herbert Xu

Tom Tucker [EMAIL PROTECTED] wrote:
 
 All that said, the proposed patch helps not only iWARP, but other
 transports (iSCSI, IB) as well. It is not large, invasive,

Care to explain on how it helps those other technologies?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: RDMA will be reverted

2006-07-06 Thread Herbert Xu

On Thu, Jul 06, 2006 at 12:36:24PM -0500, Tom Tucker wrote:
 
 The RDMA CMA uses IP addresses and port numbers to create a uniform
 addressing scheme across all transport types. For IB, it is necessary to
 resolve IP addresses to IB GIDs. The ARP protocol is used to do this and
 a netfilter rule is installed to snoop the incoming ARP replies. This
 would not be necessary if ARP events were provided as in the patch. 

Well the concerns we have do not apply to just iWARP, but RDMA/IP in
general so this isn't really another technology.

In fact, it seems that we now have IP-specific knowledge living in
drivers/infiniband/core/cma.c which is suboptimal.

 Unified wire iSCSI adapters have the same issue as iWARP wrt to managing
 IP addresses and ports.

If by Unified wire iSCSI you mean something that presents a SCSI interface
together with an Ethernet interface where the two share the same MAC and
IP address, then we have the same concerns with it as we do with iWARP or
TOE.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

What is RDMA (was: RDMA will be reverted)

2006-07-07 Thread Herbert Xu

On Fri, Jul 07, 2006 at 06:53:20AM +, David Miller wrote:
 
 What I am saying, however, is that we need to understand the
 technology and the hooks you guys want before we put any of it in.

Yes indeed.

Here is what I've understood so far so let's see if we can start building
a consensus.

1) RDMA over straight Infiniband is not contentious.  In this case no
   IP networking is involved.

2) RDMA over TCP/IP (or SCTP) can theoretically run on any network that
   supported IP, including Infiniband and Ethernet.

3) When RDMA over TCP is completely done in hardware, i.e., it has its
   own IP address, MAC address, and simply presents an RDMA interface
   (whatever that may be) to Linux, we're OK with it.

   This is similar to how some iSCSI adapters work.

4) When RDMA over TCP is done completely in the Linux networking stack,
   we don't have a problem because the existing TCP stack is still in
   charge.  However, this is pretty pointless.

5) RDMA over TCP on the receive side is offloaded into the NIC.  This
   allows the NIC to directly place data into the application's buffer.  

   We're starting to have a little bit of a problem because it means that
   part of the incoming IP traffic is now being directly processed by the
   NIC, with no input from the Linux TCP/IP stack.

   However, as long as the connection establishment/acks are still
   controlled/seen by Linux we can probably live with it.

6) RDMA over TCP on the transmit side is offloaded into the NIC.  This
   is starting to look very worrying.

   The reason is that we lose all control over crucial aspects of TCP like
   congestion control.  It is now completely up to the NIC to do that.
   For straight RDMA over Infiniband this isn't an issue because the
   traffic is not likely to travel across the Internet.

   However, for RDMA over TCP, one of their goals is to support sending
   traffic over the Internet so this is a concern.  Incidentally, this is
   why they need to know about things like MAC/route/MTU changing.

7) RDMA over TCP is completely offloaded into the NIC, however, they still
   use Linux's IP address, MAC address, and rely on us to tell it about
   events such as MTU updates or MAC changes.

   In addition to the problems we have in 5) and 6), we now have a portion
   of TCP port space which has suddenly become invisible to Linux.  What's
   more, we lose control (e.g., netfilter) over what connections may or
   may not be established.

So to my mind, RDMA over TCP is most problematic when it shares the same
IP/MAC address as the Linux host, and when the transmit side and/or the
connection establishment (case 6 and 7) is offloaded into the NIC.  This
also happens to be the only scenario where they need the notification
patch that started all this discussion.

BTW, this URL gives an interesting perspective on RDMA over TCP
(particularly Q14/Q15):

http://www.rdmaconsortium.org/home/FAQs_Apr25.htm

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Xen-devel] kernel BUG at net/core/dev.c:1133!

2006-07-07 Thread Herbert Xu

Petersson, Mats [EMAIL PROTECTED] wrote:
 Looks like the GSO is involved?

It's certainly what crashed your machine :) It's probably not the
guilty party though.  Someone is passing through a TSO packet with
checksum set to something other than CHECKSUM_HW.

I bet it's netfilter and we just never noticed before because real
NICS would simply corrupt the checksum silently.

Could you confirm that you have netfilter rules (in particular NAT
rules) and that this goes away if you flush all your netfilter tables?

Patrick, do we really have to zap the checksum on outbound NAT? Could
we update it instead?

 I got this while running Dom0 only (no guests), with a
 BOINC/[EMAIL PROTECTED] application running on all 4 cores. 
 
 changeset:   10649:8e55c5c11475
 
 Build: x86_32p (pae). 
 
 [ cut here ]
 kernel BUG at net/core/dev.c:1133!
 invalid opcode:  [#1]
 SMP 
 CPU:0
 EIP:0061:[c04dceb0]Not tainted VLI
 EFLAGS: 00210297   (2.6.16.13-xen #12) 
 EIP is at skb_gso_segment+0xf0/0x110
 eax:    ebx: 0003   ecx: 0002   edx: c06e2e00
 esi: 0008   edi: cd9e32e0   ebp: c63a7900   esp: c0de5ad0
 ds: 007b   es: 007b   ss: 0069
 Process rosetta_5.25_i6 (pid: 8826, threadinfo=c0de4000 task=cb019560)
 Stack: 0c8f69060  ffa3 0003 cd9e32e0 0002 c63a7900
 c04dcfb0 
   cd9e32e0 0003  cd9e32e0 cf8e3000 cf8e3140 c04dd07e
 cd9e32e0 
   cf8e3000  cd9e32e0 cf8e3000 c04ec07e cd9e32e0 cf8e3000
 c0895140 
 Call Trace:
 [c04dcfb0] dev_gso_segment+0x30/0xb0
 [c04dd07e] dev_hard_start_xmit+0x4e/0x110
 [c04ec07e] __qdisc_run+0xbe/0x280
 [c04dd4b9] dev_queue_xmit+0x379/0x380
 [c05bbe44] br_dev_queue_push_xmit+0xa4/0x140
 [c05c2402] br_nf_post_routing+0x102/0x1d0
 [c05c22b0] br_nf_dev_queue_xmit+0x0/0x50
 [c05bbda0] br_dev_queue_push_xmit+0x0/0x140
 [c04f0eab] nf_iterate+0x6b/0xa0
 [c05bbda0] br_dev_queue_push_xmit+0x0/0x140
 [c05bbda0] br_dev_queue_push_xmit+0x0/0x140
 [c04f0f4e] nf_hook_slow+0x6e/0x120
 [c05bbda0] br_dev_queue_push_xmit+0x0/0x140
 [c05bbf40] br_forward_finish+0x60/0x70
 [c05bbda0] br_dev_queue_push_xmit+0x0/0x140
 [c05c1b71] br_nf_forward_finish+0x71/0x130
 [c05bbee0] br_forward_finish+0x0/0x70
 [c05c1d20] br_nf_forward_ip+0xf0/0x1a0
 [c05c1b00] br_nf_forward_finish+0x0/0x130
 [c05bbee0] br_forward_finish+0x0/0x70
 [c04f0eab] nf_iterate+0x6b/0xa0
 [c05bbee0] br_forward_finish+0x0/0x70
 [c05bbee0] br_forward_finish+0x0/0x70
 [c04f0f4e] nf_hook_slow+0x6e/0x120
 [c05bbee0] br_forward_finish+0x0/0x70
 [c05bc044] __br_forward+0x74/0x80
 [c05bbee0] br_forward_finish+0x0/0x70
 [c05bceb1] br_handle_frame_finish+0xd1/0x160
 [c05bcde0] br_handle_frame_finish+0x0/0x160
 [c05c0e0b] br_nf_pre_routing_finish+0xfb/0x480
 [c05bcde0] br_handle_frame_finish+0x0/0x160
 [c05c0d10] br_nf_pre_routing_finish+0x0/0x480
 [c054fe13] ip_nat_in+0x43/0xc0
 [c05c0d10] br_nf_pre_routing_finish+0x0/0x480
 [c04f0eab] nf_iterate+0x6b/0xa0
 [c05c0d10] br_nf_pre_routing_finish+0x0/0x480
 [c05c0d10] br_nf_pre_routing_finish+0x0/0x480
 [c04f0f4e] nf_hook_slow+0x6e/0x120
 [c05c0d10] br_nf_pre_routing_finish+0x0/0x480
 [c05c1914] br_nf_pre_routing+0x404/0x580
 [c05c0d10] br_nf_pre_routing_finish+0x0/0x480
 [c04f0eab] nf_iterate+0x6b/0xa0
 [c05bcde0] br_handle_frame_finish+0x0/0x160
 [c05bcde0] br_handle_frame_finish+0x0/0x160
 [c04f0f4e] nf_hook_slow+0x6e/0x120
 [c05bcde0] br_handle_frame_finish+0x0/0x160
 [c05bd124] br_handle_frame+0x1e4/0x250
 [c05bcde0] br_handle_frame_finish+0x0/0x160
 [c04ddae5] netif_receive_skb+0x165/0x2a0
 [c04ddcdf] process_backlog+0xbf/0x180
 [c04ddebf] net_rx_action+0x11f/0x1d0
 [c01262e6] __do_softirq+0x86/0x120
 [c01263f5] do_softirq+0x75/0x90
 [c0106cef] do_IRQ+0x1f/0x30
 [c04271d0] evtchn_do_upcall+0x90/0x100
 [c0105315] hypervisor_callback+0x3d/0x48
 Code: c2 2b 57 24 29 d0 8d 14 2a 89 87 94 00 00 00 89 57 60 8b 44 24 08
 83 c4 0c 5b 5e 5f 5d c3 0f 0
 b 69 03 fe 8c 66 c0 e9 69 ff ff ff 0f 0b 6d 04 e8 ab 6c c0 e9 3a ff ff
 ff 0f 0b 6c 04 e8 ab 6c c0 
 0Kernel panic - not syncing: Fatal exception in interrupt

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Xen-devel] kernel BUG at net/core/dev.c:1133!

2006-07-07 Thread Herbert Xu

On Fri, Jul 07, 2006 at 05:03:36PM +0200, Petersson, Mats wrote:
 
 So, nothing going on there... I certainly haven't got NAT on my machine,
 as my machine is within the AMD network, and doesn't need NAT. AMD
 probably uses NAT as part of it's external communications, but I doubt
 it's used at all internally. 

Actually, just having it loaded is enough to break TSO.  So for all this
time anyone who had ip_nat loaded were silently corrupting all their TSO
checksums!

I'll send a patch soon once I've tested it.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: starting mc triggers lockdep

2006-07-07 Thread Herbert Xu

Arjan van de Ven [EMAIL PROTECTED] wrote:
 
 i_mutex is taken within rtln_mutex like this:
   [8030f4a0] create_dir+0x2c/0x1e2
   [8030fa5b] sysfs_create_dir+0x59/0x78
   [8034d2e2] kobject_add+0x114/0x1d8
   [803bb1e7] class_device_add+0xb5/0x49d
   [804300b1] netdev_register_sysfs+0x98/0xa2
   [80426f58] register_netdevice+0x28c/0x376
   [8042709c] register_netdev+0x5a/0x69
 creating the AB dependency

This is a sysfs inode.

 now for the third part, which involves the nfs client:
 stat on an nfs file, which ends up taken the i_mutex of a directory in
 the path (obvious), and then does 
   [8022800b] tcp_sendmsg+0x1e/0xb1a
   [80248f4b] inet_sendmsg+0x45/0x53
   [80259d25] sock_sendmsg+0x110/0x130
   [8041f462] kernel_sendmsg+0x3c/0x52
   [885399e9] xs_tcp_send_request+0x117/0x320 [sunrpc]
   [885388d5] xprt_transmit+0x105/0x21e [sunrpc]
   [8853771e] call_transmit+0x1f4/0x239 [sunrpc]
   [8853c06e] __rpc_execute+0x9b/0x1e6 [sunrpc]
   [8853c1de] rpc_execute+0x1a/0x1d [sunrpc]
   [885364ad] rpc_call_sync+0x87/0xb9 [sunrpc]
   [885a2587] nfs3_rpc_wrapper+0x2e/0x74 [nfs]
   [885a2a14] nfs3_proc_lookup+0xe0/0x163 [nfs]
 where tcp_sendmsg calls lock_sock. So this is the BC dependency.

This is an nfs inode.

Did I miss something?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: auro deadlock (was Re: e100 lockdep irq lock inversion.)

2006-07-07 Thread Herbert Xu

Arjan van de Ven [EMAIL PROTECTED] wrote:
 
 Act 1
 
 Enter the mpi_start_xmit() function, which is airo's xmit function.
 This function takes the aux_lock first, with irq's off, then calls
 skb_queue_tail(). skb_queue_tail takes the sk_receive_queue.lock (with
 irqsave as well).

Nope, make that ai->txq.

 Act 2
 
 Enter the ipcalc program. This program calls an ioctl, which ends up
 calling udp_ioctl. udp_ioctl does
   spin_lock_bh(&sk->sk_receive_queue.lock);

Different queue.

So no deadlock.  Better luck next time :)
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 2/2] correct dev_alloc_skb kerneldoc

2006-07-07 Thread Herbert Xu

David Miller [EMAIL PROTECTED] wrote:
 What is the point of dev_alloc_skb anyway? all it does is add header space.
 
 In stone-age times it actually had specific semantics, but yes today
 it is just a synonym.

Does anyone still need those 16 bytes of header space?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[1/2] [NET] gso: Add skb_is_gso

2006-07-07 Thread Herbert Xu

Hi Dave:

These two patches fix the netfilter/checksum/TSO problem where netfilter
destroys the partial checksum which breaks TSO.

[NET] gso: Add skb_is_gso

This patch adds the wrapper function skb_is_gso which can be used instead
of directly testing skb_shinfo(skb)->gso_size.  This makes things a little
nicer and allows us to change the primary key for indicating whether an skb
is GSO (if we ever want to do that).

Signed-off-by: Herbert Xu [EMAIL PROTECTED]

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -1639,7 +1639,7 @@ bnx2_tx_int(struct bnx2 *bp)
skb = tx_buf-skb;
 #ifdef BCM_TSO 
/* partial BD completions possible with TSO packets */
-   if (skb_shinfo(skb)-gso_size) {
+   if (skb_is_gso(skb)) {
u16 last_idx, last_ring_idx;
 
last_idx = sw_cons +
diff --git a/drivers/net/chelsio/sge.c b/drivers/net/chelsio/sge.c
--- a/drivers/net/chelsio/sge.c
+++ b/drivers/net/chelsio/sge.c
@@ -1417,7 +1417,7 @@ int t1_start_xmit(struct sk_buff *skb, s
struct cpl_tx_pkt *cpl;
 
 #ifdef NETIF_F_TSO
-   if (skb_shinfo(skb)-gso_size) {
+   if (skb_is_gso(skb)) {
int eth_type;
struct cpl_tx_pkt_lso *hdr;
 
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -2394,7 +2394,7 @@ e1000_tso(struct e1000_adapter *adapter,
uint8_t ipcss, ipcso, tucss, tucso, hdr_len;
int err;
 
-   if (skb_shinfo(skb)-gso_size) {
+   if (skb_is_gso(skb)) {
if (skb_header_cloned(skb)) {
err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
if (err)
@@ -2519,7 +2519,7 @@ e1000_tx_map(struct e1000_adapter *adapt
 * tso gets written back prematurely before the data is fully
 * DMA'd to the controller */
if (!skb-data_len  tx_ring-last_tx_tso 
-   !skb_shinfo(skb)-gso_size) {
+   !skb_is_gso(skb)) {
tx_ring-last_tx_tso = 0;
size -= 4;
}
@@ -2806,8 +2806,7 @@ e1000_xmit_frame(struct sk_buff *skb, st
 
 #ifdef NETIF_F_TSO
/* Controller Erratum workaround */
-   if (!skb-data_len  tx_ring-last_tx_tso 
-   !skb_shinfo(skb)-gso_size)
+   if (!skb-data_len  tx_ring-last_tx_tso  !skb_is_gso(skb))
count++;
 #endif
 
diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
--- a/drivers/net/forcedeth.c
+++ b/drivers/net/forcedeth.c
@@ -1495,7 +1495,7 @@ static int nv_start_xmit(struct sk_buff 
np-tx_skbuff[nr] = skb;
 
 #ifdef NETIF_F_TSO
-   if (skb_shinfo(skb)-gso_size)
+   if (skb_is_gso(skb))
tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)-gso_size  
NV_TX2_TSO_SHIFT);
else
 #endif
diff --git a/drivers/net/ixgb/ixgb_main.c b/drivers/net/ixgb/ixgb_main.c
--- a/drivers/net/ixgb/ixgb_main.c
+++ b/drivers/net/ixgb/ixgb_main.c
@@ -1173,7 +1173,7 @@ ixgb_tso(struct ixgb_adapter *adapter, s
uint16_t ipcse, tucse, mss;
int err;
 
-   if(likely(skb_shinfo(skb)-gso_size)) {
+   if (likely(skb_is_gso(skb))) {
if (skb_header_cloned(skb)) {
err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
if (err)
diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -139,7 +139,7 @@ static int loopback_xmit(struct sk_buff 
 #endif
 
 #ifdef LOOPBACK_TSO
-   if (skb_shinfo(skb)-gso_size) {
+   if (skb_is_gso(skb)) {
BUG_ON(skb-protocol != htons(ETH_P_IP));
BUG_ON(skb-nh.iph-protocol != IPPROTO_TCP);
 
diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c
--- a/drivers/net/myri10ge/myri10ge.c
+++ b/drivers/net/myri10ge/myri10ge.c
@@ -2116,7 +2116,7 @@ abort_linearize:
}
idx = (idx + 1)  tx-mask;
} while (idx != last_idx);
-   if (skb_shinfo(skb)-gso_size) {
+   if (skb_is_gso(skb)) {
printk(KERN_ERR
   myri10ge: %s: TSO but wanted to linearize?!?!?\n,
   mgp-dev-name);
diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -1159,7 +1159,7 @@ static unsigned tx_le_req(const struct s
count = sizeof(dma_addr_t) / sizeof(u32);
count += skb_shinfo(skb)-nr_frags * count;
 
-   if (skb_shinfo(skb)-gso_size)
+   if (skb_is_gso(skb))
++count;
 
if (skb

[2/2] [NET] gso: Fix up GSO packets with broken checksums

2006-07-07 Thread Herbert Xu

Hi:

[NET] gso: Fix up GSO packets with broken checksums

Certain subsystems in the stack (e.g., netfilter) can break the partial
checksum on GSO packets.  Until they're fixed, this patch allows this to
work by recomputing the partial checksums through the GSO mechanism.

Once they've all been converted to update the partial checksum instead of
clearing it, this workaround can be removed.

Signed-off-by: Herbert Xu [EMAIL PROTECTED]

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -549,6 +549,7 @@ struct packet_type {
 struct net_device *);
struct sk_buff  *(*gso_segment)(struct sk_buff *skb,
int features);
+   int (*gso_send_check)(struct sk_buff *skb);
void*af_packet_priv;
struct list_headlist;
 };
@@ -1001,13 +1002,14 @@ static inline int net_gso_ok(int feature
 
 static inline int skb_gso_ok(struct sk_buff *skb, int features)
 {
-   return net_gso_ok(features, skb_is_gso(skb) ?
-   skb_shinfo(skb)-gso_type : 0);
+   return net_gso_ok(features, skb_shinfo(skb)-gso_type);
 }
 
 static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb)
 {
-   return !skb_gso_ok(skb, dev-features);
+   return skb_is_gso(skb) 
+  (!skb_gso_ok(skb, dev-features) ||
+   unlikely(skb-ip_summed != CHECKSUM_HW));
 }
 
 #endif /* __KERNEL__ */
diff --git a/include/net/protocol.h b/include/net/protocol.h
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -36,6 +36,7 @@
 struct net_protocol {
int (*handler)(struct sk_buff *skb);
void(*err_handler)(struct sk_buff *skb, u32 info);
+   int (*gso_send_check)(struct sk_buff *skb);
struct sk_buff *(*gso_segment)(struct sk_buff *skb,
   int features);
int no_policy;
@@ -51,6 +52,7 @@ struct inet6_protocol 
   int type, int code, int offset,
   __u32 info);
 
+   int (*gso_send_check)(struct sk_buff *skb);
struct sk_buff *(*gso_segment)(struct sk_buff *skb,
   int features);
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1086,6 +1086,7 @@ extern struct request_sock_ops tcp_reque
 
 extern int tcp_v4_destroy_sock(struct sock *sk);
 
+extern int tcp_v4_gso_send_check(struct sk_buff *skb);
 extern struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features);
 
 #ifdef CONFIG_PROC_FS
diff --git a/net/core/dev.c b/net/core/dev.c
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1162,9 +1162,17 @@ int skb_checksum_help(struct sk_buff *sk
unsigned int csum;
int ret = 0, offset = skb-h.raw - skb-data;
 
-   if (inward) {
-   skb-ip_summed = CHECKSUM_NONE;
-   goto out;
+   if (inward)
+   goto out_set_summed;
+
+   if (unlikely(skb_shinfo(skb)-gso_size)) {
+   static int warned;
+
+   WARN_ON(!warned);
+   warned = 1;
+
+   /* Let GSO fix up the checksum. */
+   goto out_set_summed;
}
 
if (skb_cloned(skb)) {
@@ -1181,6 +1189,8 @@ int skb_checksum_help(struct sk_buff *sk
BUG_ON(skb-csum + 2  offset);
 
*(u16*)(skb-h.raw + skb-csum) = csum_fold(csum);
+
+out_set_summed:
skb-ip_summed = CHECKSUM_NONE;
 out:   
return ret;
@@ -1201,17 +1211,35 @@ struct sk_buff *skb_gso_segment(struct s
struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
struct packet_type *ptype;
int type = skb-protocol;
+   int err;
 
BUG_ON(skb_shinfo(skb)-frag_list);
-   BUG_ON(skb-ip_summed != CHECKSUM_HW);
 
skb-mac.raw = skb-data;
skb-mac_len = skb-nh.raw - skb-data;
__skb_pull(skb, skb-mac_len);
 
+   if (unlikely(skb-ip_summed != CHECKSUM_HW)) {
+   static int warned;
+
+   WARN_ON(!warned);
+   warned = 1;
+
+   if (skb_header_cloned(skb) 
+   (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+   return ERR_PTR(err);
+   }
+
rcu_read_lock();
list_for_each_entry_rcu(ptype, ptype_base[ntohs(type)  15], list) {
if (ptype-type == type  !ptype-dev  ptype-gso_segment) {
+   if (unlikely(skb-ip_summed != CHECKSUM_HW)) {
+   err = ptype

Re: airo maybe should select crypto_aes

2006-07-07 Thread Herbert Xu

Robert Schulze [EMAIL PROTECTED] wrote:
 
 I first wrote to the linux-pcmcia ML, but they said it wasn't the right 
 address for my issue. The driver airo (for Cisco Wlan-Cards) complains 
 about failed to load transform for AES, when it is loaded and 
 CRYPTO_AES is not selected in Kconfig.
 I've got a patch for that, maybe it's worth it.

First off, your patch is space-damaged.  Please make sure that the
original tabs are preserved and resend.

Also, wireless patches should be sent to this list with a cc to
John W. Linville [EMAIL PROTECTED].

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: starting mc triggers lockdep

2006-07-08 Thread Herbert Xu

On Sat, Jul 08, 2006 at 11:53:20AM +0200, Arjan van de Ven wrote:

   now for the third part, which involves the nfs client:
   stat on an nfs file, which ends up taken the i_mutex of a directory in
   the path (obvious), and then does 
 [8022800b] tcp_sendmsg+0x1e/0xb1a
 [80248f4b] inet_sendmsg+0x45/0x53
 [80259d25] sock_sendmsg+0x110/0x130
 [8041f462] kernel_sendmsg+0x3c/0x52
 [885399e9] xs_tcp_send_request+0x117/0x320 [sunrpc]
 [885388d5] xprt_transmit+0x105/0x21e [sunrpc]
 [8853771e] call_transmit+0x1f4/0x239 [sunrpc]
 [8853c06e] __rpc_execute+0x9b/0x1e6 [sunrpc]
 [8853c1de] rpc_execute+0x1a/0x1d [sunrpc]
 [885364ad] rpc_call_sync+0x87/0xb9 [sunrpc]
 [885a2587] nfs3_rpc_wrapper+0x2e/0x74 [nfs]
 [885a2a14] nfs3_proc_lookup+0xe0/0x163 [nfs]
   where tcp_sendmsg calls lock_sock. So this is the BC dependency.
  
  This is an nfs inode.
  
  Did I miss something?
 
 is it not possible to nfs export /sys, and then mount it over loopback?

Possibly, but not with the backtrace above.  You'd need an nfs server
backtrace to get the real sysfs inode.

In any case, the sock lock from the other backtrace that you had
(udp setsockopt) cannot be held by the kernel nfs client or server
since the kernel nfs sockets are not visible to user space.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: 2.6.17-mm3

2006-07-09 Thread Herbert Xu

Michal Piotrowski [EMAIL PROTECTED] wrote:
 
 It was moved, sorry.

I fail to spot any relevant backtraces for skge or indeed any part
of the networking stack.  Ingo/Arjan, perhaps you guys can figure
out what's wrong here.

In future perhaps you should consider posting the dmesg to the list
directly.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[IPV4] inetpeer: Get rid of volatile from peer_total

2006-07-09 Thread Herbert Xu

Hi Dave:

Just joining in on the volatile fun :)

[IPV4] inetpeer: Get rid of volatile from peer_total

The variable peer_total is protected by a lock.  The volatile marker
makes no sense.  This shaves off 20 bytes on i386.

Signed-off-by: Herbert Xu [EMAIL PROTECTED]

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index 2160874..03ff62e 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -86,7 +86,7 @@ static struct inet_peer *peer_root = pee
 static DEFINE_RWLOCK(peer_pool_lock);
 #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
 
-static volatile int peer_total;
+static int peer_total;
 /* Exported for sysctl_net_ipv4.  */
 int inet_peer_threshold = 65536 + 128; /* start to throw entries more
 * aggressively at this stage */
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: ipsec tunnel asymmetrical mtu

2006-07-11 Thread Herbert Xu

Hi Marco:

On Mon, Apr 24, 2006 at 09:23:00AM +, Marco Berizzi wrote:
 
 What should I do? Mangling MSS with iptables  --set-mss ?
 Altering MSS to 1440 did the trick. See:
 http://marc.theaimsgroup.com/?l=linux-netdevm=114373067423528w=2

Yes that's enough, although proper PMTU would be better.
 
 and here is snmp when the sapgui client has told me that the
 connections has been reset:
 
 [EMAIL PROTECTED]:/var/log# cat SNMP-CONN-RESET
 Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams 
 InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes 
 ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates
 Ip: 1 64 79257 0 31 48139 0 0 38799 56650 2 0 2 182 90 2 90 0 124

OK, the number of reassemblies equals the number of FragOKs.  So it does
not look like there is a problem within mimosa, i.e., Linux.

I've looked at your packet dumps again and in fact there is no qualitative
difference between WITHTCPDUMP and WITHOUTTCPDUMP.  What is different is
that the latter seems to have experienced a higher packet loss rate at an
early stage and therefore the sender has already backed off to a very long
retry period.

The fact that tcpdump makes a difference could simply be that it changes
the timing of the fragment transmissions on mimosa which has an impact on
the loss rate between mimosa and pleiadi.

We can say these things for certain:

1) The path between mimosa and pleiadi has a packet loss problem.  A small
   burst of 10 or so fragments is enough to cause at least half of them to
   be lost.  This problem may be specific to IPsec traffic (ISPs often
   discriminate against traffic with protocols other than TCP/UDP).

2) Fragmentation exacerbates the packet loss problem because it increases
   the number of packets and a packet is lost if only one of its fragments
   is lost.

3) The fact that the TCP connections are not using PMTU causes fragmentation
   in the presence of IPsec.

From what I've seen, there does not appear to be a bug in Linux that could
explain the behaviour change that is seen when you run tcpdump.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: ipsec tunnel asymmetrical mtu

2006-07-11 Thread Herbert Xu

On Mon, May 08, 2006 at 08:28:32AM +, Marco Berizzi wrote:

 [EMAIL PROTECTED]:~# ping 10.49.59.23
 PING 10.49.59.23 (10.49.59.23) 56(84) bytes of data.
 64 bytes from 10.49.59.23: icmp_seq=1 ttl=247 time=91.9 ms
 64 bytes from 10.49.59.23: icmp_seq=2 ttl=247 time=49.3 ms
 64 bytes from 10.49.59.23: icmp_seq=3 ttl=247 time=106 ms
 64 bytes from 10.49.59.23: icmp_seq=4 ttl=247 time=74.3 ms
 
 --- 10.49.59.23 ping statistics ---
 4 packets transmitted, 4 received, 0% packet loss, time 2998ms
 rtt min/avg/max/mdev = 49.316/80.460/106.257/21.241 ms
 [EMAIL PROTECTED]:~# cd /tmp/
 [EMAIL PROTECTED]:/tmp# tcpdump -v -p -n ip host 10.49.59.23  
 /tmp/NULL-10.49.59.23 
 [1] 18981
 [EMAIL PROTECTED]:/tmp# tcpdump: listening on eth0, link-type EN10MB 
 (Ethernet), 
 capture size 96 bytes
 
 [EMAIL PROTECTED]:/tmp# ping 10.49.59.23
 PING 10.49.59.23 (10.49.59.23) 56(84) bytes of data.
 
 --- 10.49.59.23 ping statistics ---
 8 packets transmitted, 0 received, 100% packet loss, time 6999ms
 
 [EMAIL PROTECTED]:/tmp# fg
 tcpdump -v -p -n ip host 10.49.59.23 /tmp/NULL-10.49.59.23
 101 packets captured
 101 packets received by filter
 0 packets dropped by kernel

Yes this is really weird.  The only thing I can think of is that it
somehow managed to put some bogus entry into the conntrack table.
What happens if you do

grep 10.49.59.23 /proc/net/ip_conntrack

before and after the tcpdump?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: What is RDMA (was: RDMA will be reverted)

2006-07-11 Thread Herbert Xu

On Fri, Jul 07, 2006 at 01:25:44PM -0500, Steve Wise wrote:
 
 Some IP networking is involved for this.  IP addresses and port numbers
 are used by the RDMA Connection Manager.  The motivation for this was
 two-fold, I think:
 
 1) to simplify the connection setup model.  The IB CM model was very
 complex.
 
 2) to allow ULPs to be transport independent.  Thus a single code base
 for NFSoRDMA, for example, can run over Infiniband and RDMA/TCP
 transports without code changes or knowing about transport-specific
 addressing.
 
 The routing table is also consulted to determine which rdma device
 should be used for connection setup.  Each rdma device also installs a
 netdev device for native stack traffic.  The RDMA CM maintains an
 association between the netdev device and the rdma device.  
 
 And the Infiniband subsystem uses ARP over IPoIB to map IP addresses to
 GID/QPN info.  This is done by calling arp_send() directly, and snooping
 all ARP packets to discover when the arp entry is completed.

This sounds interesting.

Since this is going to be IB-neutral, what about moving high-level logic
like this out of drivers/infiniband and into net?

That way the rest of the networking community can add input into how
things are done.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] cancel_rearming_delayed_work infinite loop fix

2006-07-11 Thread Herbert Xu

Michael Buesch [EMAIL PROTECTED] wrote:
 cancel_rearming_delayed_work{queue} is broken, because it is
 possible to enter an infinite loop if:
 We call the function on a work that is currently not executing or pending.

Why are you calling it on a work that was never scheduled? Sounds like
a bug to me.

 void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
   struct work_struct *work)
 {
 -   while (!cancel_delayed_work(work))
 +   do {
 +   cancel_delayed_work(work);
flush_workqueue(wq);
 +   } while (test_bit(0, work-pending));

This is broken.  If the work just starts running before your test_bit
you'd exit without cancelling it properly.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: ipsec tunnel asymmetrical mtu

2006-07-11 Thread Herbert Xu

On Tue, Jul 11, 2006 at 11:22:18AM +0200, Marco Berizzi wrote:

 I'm able to connect to a sap server connected to the milano network
 from a sapgui client connected to the venezia network. No problem.
 If packet loss is a problem it should be also a problem with this tunnel.
 Am I wrong?

It depends.  A mild packet loss problem can become a big one when it
is exacerbated by fragmentation, e.g., a 20% rate can become 40%.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Patch 6/6] per task delay accounting taskstats interface: fix clone skbs for each listener

2006-07-11 Thread Herbert Xu

Andrew Morton [EMAIL PROTECTED] wrote:
 On Tue, 11 Jul 2006 00:36:39 -0400
 Shailabh Nagar [EMAIL PROTECTED] wrote:
 
   down_write(listeners-sem);
   list_for_each_entry_safe(s, tmp, listeners-list, list) {
 - ret = genlmsg_unicast(skb, s-pid);
 + skb_next = NULL;
 + if (!list_islast(s-list, listeners-list)) {
 + skb_next = skb_clone(skb_cur, GFP_KERNEL);
 
 If we do a GFP_KERNEL allocation with this semaphore held, and the
 oom-killer tries to kill something to satisfy the allocation, and the
 killed task gets stuck on that semaphore, I wonder of the box locks up.

We do GFP_KERNEL inside semaphores/mutexes in lots of places.  So if this
can deadlock with the oom-killer we probably should fix that, preferably
by having GFP_KERNEL fail in that case.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Patch 6/6] per task delay accounting taskstats interface: fix clone skbs for each listener

2006-07-11 Thread Herbert Xu

On Tue, Jul 11, 2006 at 03:57:31AM -0700, Andrew Morton wrote:

 down_write(listeners-sem);
 list_for_each_entry_safe(s, tmp, listeners-list, list) {
   - ret = genlmsg_unicast(skb, s-pid);
   + skb_next = NULL;
   + if (!list_islast(s-list, listeners-list)) {
   + skb_next = skb_clone(skb_cur, GFP_KERNEL);
   
   If we do a GFP_KERNEL allocation with this semaphore held, and the
   oom-killer tries to kill something to satisfy the allocation, and the
   killed task gets stuck on that semaphore, I wonder of the box locks up.
  
  We do GFP_KERNEL inside semaphores/mutexes in lots of places.  So if this
  can deadlock with the oom-killer we probably should fix that, preferably
  by having GFP_KERNEL fail in that case.
 
 This lock is special, in that it's taken on the exit() path (I think).  So
 it can block tasks which are trying to exit.

Sorry, missed the context.

If there is a deadlock then it's not just this allocation that you
need to worry about.  There is also an allocation within genlmsg_unicast
that would be GFP_KERNEL.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: ipsec tunnel asymmetrical mtu

2006-07-11 Thread Herbert Xu

On Tue, Jul 11, 2006 at 11:31:33AM +0200, Marco Berizzi wrote:
 
 Me again. After a while here is:
 
 [EMAIL PROTECTED]:/tmp# ping 10.49.59.23
 PING 10.49.59.23 (10.49.59.23) 56(84) bytes of data.
 
 --- 10.49.59.23 ping statistics ---
 4 packets transmitted, 0 received, 100% packet loss, time 3010ms

Please check using ip -s x p to make sure that the packet is hitting
the right policy.  Unfortunately we don't update the byte/packet counters
so you'll have to look at the `use' time stamp.

If it's passing through IPsec, then you should trace through your
iptables rules using the LOG target to see if it's hitting them.

We need to know if it's being dropped before, in, or after netfilter.

Please also do ip r g 10.49.59.23 to make sure that it says something
sane.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[IPCOMP]: Fix truesize after decompression

2006-07-11 Thread Herbert Xu

On Thu, Jul 06, 2006 at 12:53:45PM +, Beschorner Daniel wrote:
 Does it harm?
 
 SKB BUG: Invalid truesize (380) len=1383, sizeof(sk_buff)=156
 SKB BUG: Invalid truesize (316) len=1383, sizeof(sk_buff)=156
 SKB BUG: Invalid truesize (348) len=1383, sizeof(sk_buff)=156
 SKB BUG: Invalid truesize (316) len=1383, sizeof(sk_buff)=156
 SKB BUG: Invalid truesize (380) len=1383, sizeof(sk_buff)=156
 
 I found it in the log of a 2.6.17 box using IPSEC tunnels.

It's not fatal, but it does stuff up socket accounting.  Unfortunately
getting totally accurate truesizes is not easy due to the large numbers
of pskb_expand_head calls scattered around the stack.

[IPCOMP]: Fix truesize after decompression

The truesize check has uncovered the fact that we forgot to update truesize
after pskb_expand_head.  Unfortunately pskb_expand_head can't update it for
us because it's used in all sorts of different contexts, some of which would
not allow truesize to be updated by itself.

So the solution for now is to simply update it in IPComp.

This patch also changes skb_put to __skb_put since we've just expanded
tailroom by exactly that amount so we know it's there (but gcc does not).

Signed-off-by: Herbert Xu [EMAIL PROTECTED]

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 8e03748..8a8b5cf 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -70,7 +70,8 @@ static int ipcomp_decompress(struct xfrm
if (err)
goto out;

-   skb_put(skb, dlen - plen);
+   skb-truesize += dlen - plen;
+   __skb_put(skb, dlen - plen);
memcpy(skb-data, scratch, dlen);
 out:   
put_cpu();
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index b285b03..7e4d1c1 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -109,7 +109,8 @@ static int ipcomp6_input(struct xfrm_sta
goto out_put_cpu;
}
 
-   skb_put(skb, dlen - plen);
+   skb-truesize += dlen - plen;
+   __skb_put(skb, dlen - plen);
memcpy(skb-data, scratch, dlen);
err = ipch-nexthdr;
 
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: ipsec tunnel asymmetrical mtu

2006-07-11 Thread Herbert Xu

On Tue, Jul 11, 2006 at 12:32:45PM +0200, Marco Berizzi wrote:
 Running this on mimosa 'mitigates' the problem:
 
 ip addr add 172.29.128.1/28 dev eth2
 
 Connections are pretty slow but they aren't
 reseted anymore.

Hmm, I thought 172.29.128.1 was already a local address? What does

ip addr

show?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Problems with xfrm (IPSec) and multicast

2006-07-11 Thread Herbert Xu

On Wed, Jun 14, 2006 at 01:09:59PM +, Roar Bj?rgum Rotvik wrote:
 
 So I cannot make encrypted multicast traffic to flow both ways at the same 
 time, and has no clue as to why the first packets after changing direction 
 is dropped somewhere.

Sounds like conntrack.  Check /proc/net/ip_conntrack when this happens.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [IPCOMP]: Fix truesize after decompression

2006-07-11 Thread Herbert Xu

On Tue, Jul 11, 2006 at 01:55:53PM -0700, David Miller wrote:
 
 I think it is possible cover a certain class of these situations
 from within pskb_expand_head.  For example, if skb-sk is NULL
 we can prove that updating skb-truesize is safe since no
 socket's buffer accounting can possible depend upon the truesize
 value of this skb.

Yes that's certainly possible.  However, we'll need to audit the few
spots (e.g., ATM) that use truesize without setting skb-sk.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [IPCOMP]: Fix truesize after decompression

2006-07-11 Thread Herbert Xu

On Tue, Jul 11, 2006 at 04:22:17PM -0700, David Miller wrote:
 
 What ATM is doing here is charging the SKB to the virtual circuit
 sockets.  At least in the few cases I've looked at just now, the
 skb is some private ATM level signalling message, and not part of
 a normal transmit/receive packet from the normal networking stack.

Indeed, at least they do have an sk to charge things to :)
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [BUG] Two BUG warnings in net/core/dev.c

2006-07-11 Thread Herbert Xu

john stultz [EMAIL PROTECTED] wrote:
 Both of these were seen on my laptop w/ the current (as of this writing)
 -git tree using the e1000 driver after a suspend/resume cycle.

It's just a reminder that we need to fix NAT to update checksums
incrementally.  You'll only see it once per boot.

 BUG: warning at net/core/dev.c:1171/skb_checksum_help()
 [c0103d69] show_trace_log_lvl+0x149/0x170
 [c01052bb] show_trace+0x1b/0x20
 [c01052e4] dump_stack+0x24/0x30
 [c03c7523] skb_checksum_help+0x163/0x170
 [c0439c15] ip_nat_fn+0x1a5/0x210

Of course, if anyone sees it with a backtrace that does not contain
ip_nat_fn, please let us know.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Unnecessary check in __sk_stream_mem_reclaim?

2006-07-11 Thread Herbert Xu

Ian McDonald [EMAIL PROTECTED] wrote:
 
 It looks to me like this check here in net/core/stream.c for
 __sk_stream_mem_reclaim:
if (sk-sk_forward_alloc = SK_STREAM_MEM_QUANTUM) {
 
 is unnecessary.

It's needed after skb's have been freed which can push sk_forward_alloc
above a quantum.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Unnecessary check in __sk_stream_mem_reclaim?

2006-07-11 Thread Herbert Xu

On Wed, Jul 12, 2006 at 03:17:43PM +1200, Ian McDonald wrote:

 I'm not saying the check is unneeded - just saying doing it twice is 
 unneeded.

Right, got you this time.

I don't think we need to worry about people who use __sk_stream_mem_reclaim
when there is a perfectly good sk_stream_mem_reclaim around.

Besides, this function has only been exported since 2004 so it's highly
unlikely for there to be stuff out there using it.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] bonding: enhance the IP address check of arp_ip_target

2006-07-11 Thread Herbert Xu

Stephen Hemminger [EMAIL PROTECTED] wrote:
 
 Why not just use sscanf?

Better yet, use a better interface like netlink rather than module
parameters.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Bug 6688] Memory allocation problem

2006-07-12 Thread Herbert Xu

Andrew Morton [EMAIL PROTECTED] wrote:
 On Mon, 19 Jun 2006 23:46:08 -0700
 [EMAIL PROTECTED] wrote:
 
 http://bugzilla.kernel.org/show_bug.cgi?id=6688
 
 This is looking like a net memory leak in 2.6.16.  1/3rd is in ip_fib_alias
 and 2/3rds is in size-64.  I've asked the reporter to apply the leak
 detector patch so we can find out who is using the size-64 part.

I had a look at fib_trie.c and found a bug.  This is probably not the
cause of the leak since it looks more likely to cause a crash than
anything.  However, please retest with this applied just to be sure.

[IPV4]: Fix error handling for fib_insert_node call

The error handling around fib_insert_node was broken because we always
zeroed the error before checking it.

Signed-off-by: Herbert Xu [EMAIL PROTECTED]

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [Bug 6688] Memory allocation problem

2006-07-12 Thread Herbert Xu

On Wed, Jul 12, 2006 at 09:44:46PM +1000, Herbert Xu wrote:
 
 [IPV4]: Fix error handling for fib_insert_node call

Doh, forgot the patch.

[IPV4]: Fix error handling for fib_insert_node call

The error handling around fib_insert_node was broken because we always
zeroed the error before checking it.

Signed-off-by: Herbert Xu [EMAIL PROTECTED]

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 1cb6530..23fb9d9 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1252,8 +1252,8 @@ fn_trie_insert(struct fib_table *tb, str
 */
 
if (!fa_head) {
-   fa_head = fib_insert_node(t, err, key, plen);
err = 0;
+   fa_head = fib_insert_node(t, err, key, plen);
if (err)
goto out_free_new_fa;
}
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH 1/1] net: fix __sk_stream_mem_reclaim

2006-07-12 Thread Herbert Xu

Ian McDonald [EMAIL PROTECTED] wrote:
 __sk_stream_mem_reclaim is only called by sk_stream_mem_reclaim.
 
 As such the check on sk->sk_forward_alloc is not needed and can be removed.
 
 Signed-off-by: Ian McDonald [EMAIL PROTECTED]

Acked-by: Herbert Xu [EMAIL PROTECTED]

What's more, even if the check does turn out to be false for some renegade
caller, the function will still work since all the operations turn out to
be no-ops.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[NET]: Update frag_list in pskb_trim

2006-07-13 Thread Herbert Xu

Hi Dave:

This needs to go into stable as well.  In fact, there is another unrelated
bug with exactly the same symptoms which was inadvertently fixed by the
GSO patches.  So I'll send a simpler fix for that to stable.

[NET]: Update frag_list in pskb_trim

When pskb_trim has to defer to ___pskb_trim to trim the frag_list part of
the packet, the frag_list is not updated to reflect the trimming.  This
will usually work fine until you hit something that uses the packet length
or tail from the frag_list.

Examples include esp_output and ip_fragment.

Another problem caused by this is that you can end up with a linear packet
with a frag_list attached.

It is possible to get away with this if we audit everything to make sure
that they always consult skb->len before going down onto frag_list.  In
fact we can do the same thing for the paged part as well to avoid copying
the data area of the skb.  For now though, let's do the conservative fix
and update frag_list.

Many thanks to Marco Berizzi for helping me to track down this bug.

This 4-year old bug took 3 months to track down.  Marco was very patient
indeed :)

Signed-off-by: Herbert Xu [EMAIL PROTECTED]

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 44f6a18..476aa39 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -257,11 +257,11 @@ nodata:
 }
 
 
-static void skb_drop_fraglist(struct sk_buff *skb)
+static void skb_drop_list(struct sk_buff **listp)
 {
-   struct sk_buff *list = skb_shinfo(skb)-frag_list;
+   struct sk_buff *list = *listp;
 
-   skb_shinfo(skb)-frag_list = NULL;
+   *listp = NULL;
 
do {
struct sk_buff *this = list;
@@ -270,6 +270,11 @@ static void skb_drop_fraglist(struct sk_
} while (list);
 }
 
+static inline void skb_drop_fraglist(struct sk_buff *skb)
+{
+   skb_drop_list(skb_shinfo(skb)-frag_list);
+}
+
 static void skb_clone_fraglist(struct sk_buff *skb)
 {
struct sk_buff *list;
@@ -830,41 +835,75 @@ free_skb:
 
 int ___pskb_trim(struct sk_buff *skb, unsigned int len)
 {
+   struct sk_buff **fragp;
+   struct sk_buff *frag;
int offset = skb_headlen(skb);
int nfrags = skb_shinfo(skb)-nr_frags;
int i;
+   int err;
+
+   if (skb_cloned(skb) 
+   unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC
+   return err;
 
for (i = 0; i  nfrags; i++) {
int end = offset + skb_shinfo(skb)-frags[i].size;
-   if (end  len) {
-   if (skb_cloned(skb)) {
-   if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
-   return -ENOMEM;
-   }
-   if (len = offset) {
-   put_page(skb_shinfo(skb)-frags[i].page);
-   skb_shinfo(skb)-nr_frags--;
-   } else {
-   skb_shinfo(skb)-frags[i].size = len - offset;
-   }
+
+   if (end  len) {
+   offset = end;
+   continue;
}
-   offset = end;
+
+   if (len  offset)
+   skb_shinfo(skb)-frags[i++].size = len - offset;
+
+   skb_shinfo(skb)-nr_frags = i;
+
+   for (; i  nfrags; i++)
+   put_page(skb_shinfo(skb)-frags[i].page);
+
+   if (skb_shinfo(skb)-frag_list)
+   skb_drop_fraglist(skb);
+   break;
}
 
-   if (offset  len) {
+   for (fragp = skb_shinfo(skb)-frag_list; (frag = *fragp);
+fragp = frag-next) {
+   int end = offset + frag-len;
+
+   if (skb_shared(frag)) {
+   struct sk_buff *nfrag;
+
+   nfrag = skb_clone(frag, GFP_ATOMIC);
+   if (unlikely(!nfrag))
+   return -ENOMEM;
+
+   nfrag-next = frag-next;
+   frag = nfrag;
+   *fragp = frag;
+   }
+
+   if (end  len) {
+   offset = end;
+   continue;
+   }
+
+   if (end  len 
+   unlikely((err = pskb_trim(frag, len - offset
+   return err;
+
+   if (frag-next)
+   skb_drop_list(frag-next);
+   break;
+   }
+
+   if (len  skb_headlen(skb)) {
skb-data_len -= skb-len - len;
skb-len   = len;
} else {
-   if (len = skb_headlen(skb)) {
-   skb-len  = len;
-   skb-data_len = 0

[NET]: Add missing UFO initialisations

2006-07-13 Thread Herbert Xu

Hi:

This is only needed for 2.6.17-stable.

[NET]: Add missing UFO initialisations

This bug was unknowingly fixed by the GSO patches (or rather, its effect was
unknown at the time).

Thanks to Marco Berizzi's persistence which is documented in the thread
"ipsec tunnel asymmetrical mtu", we now know that it can have highly
non-obvious symptoms.

What happens is that uninitialised ufo_size fields can cause packets to
be incorrectly identified as UFO, which means that it does not get
fragmented even if it's over the MTU.

The fix is simple enough.

Signed-off-by: Herbert Xu [EMAIL PROTECTED]

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/net/core/dev.c b/net/core/dev.c
index 4fba549..7d472ed 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1246,6 +1246,7 @@ int __skb_linearize(struct sk_buff *skb,
atomic_set(ninfo-dataref, 1);
ninfo-tso_size = skb_shinfo(skb)-tso_size;
ninfo-tso_segs = skb_shinfo(skb)-tso_segs;
+   ninfo-ufo_size = skb_shinfo(skb)-ufo_size;
ninfo-nr_frags = 0;
ninfo-frag_list = NULL;
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fb3770f..0280535 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -240,6 +240,7 @@ struct sk_buff *alloc_skb_from_cache(kme
skb_shinfo(skb)-nr_frags  = 0;
skb_shinfo(skb)-tso_size = 0;
skb_shinfo(skb)-tso_segs = 0;
+   skb_shinfo(skb)-ufo_size = 0;
skb_shinfo(skb)-frag_list = NULL;
 out:
return skb;
@@ -529,6 +530,7 @@ #endif
atomic_set(new-users, 1);
skb_shinfo(new)-tso_size = skb_shinfo(old)-tso_size;
skb_shinfo(new)-tso_segs = skb_shinfo(old)-tso_segs;
+   skb_shinfo(new)-ufo_size = skb_shinfo(old)-ufo_size;
 }
 
 /**
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: another networking lockdep bug

2006-07-13 Thread Herbert Xu

Dave Jones [EMAIL PROTECTED] wrote:
 Not sure if this one got reported/fixed yet, as I was running
 a kernel from sometime last week..

I think we've seen a couple of similar ones, this one is more
elaborate though :)
 
 - #1 (rtnl_mutex){--..}:
   [802ab792] lock_acquire+0x4a/0x69
   [802691c2] __mutex_lock_slowpath+0xeb/0x29f
   [8026939f] mutex_lock+0x29/0x2e
   [8042d973] rtnl_lock+0xf/0x12
   [8045c18a] ip_mc_leave_group+0x1e/0xae
   [80446087] do_ip_setsockopt+0x6ad/0x9b2
   [8044643a] ip_setsockopt+0x2a/0x84
   [80454328] udp_setsockopt+0xd/0x1c
   [8041f094] sock_common_setsockopt+0xe/0x11
   [8041e20f] sys_setsockopt+0x8e/0xb4
   [80262fd9] tracesys+0xd0/0xdb
 
 - #0 (sk_lock-AF_INET){--..}:
   [802ab792] lock_acquire+0x4a/0x69
   [8023726c] lock_sock+0xd4/0xe7
   [80228061] tcp_sendmsg+0x1e/0xb1a
   [80248ff8] inet_sendmsg+0x45/0x53
   [80259dd3] sock_sendmsg+0x110/0x130
   [8041ed0c] kernel_sendmsg+0x3c/0x52
   [8853c9e9] xs_tcp_send_request+0x117/0x320 [sunrpc]
   [8853b8d5] xprt_transmit+0x105/0x21e [sunrpc]
   [8853a71e] call_transmit+0x1f4/0x239 [sunrpc]
   [8853f06e] __rpc_execute+0x9b/0x1e6 [sunrpc]
   [8853f1de] rpc_execute+0x1a/0x1d [sunrpc]
   [885394ad] rpc_call_sync+0x87/0xb9 [sunrpc]
   [885a5587] nfs3_rpc_wrapper+0x2e/0x74 [nfs]
   [885a5870] nfs3_proc_setattr+0x9b/0xd3 [nfs]
   [8859bffb] nfs_setattr+0xe9/0x11e [nfs]
   [8022f7b4] notify_change+0x154/0x2f7
   [802e00c7] do_truncate+0x52/0x72
   [80212d17] may_open+0x1d5/0x231
   [8021c270] open_namei+0x290/0x6b4
   [80229974] do_filp_open+0x27/0x46
   [8021acb7] do_sys_open+0x4e/0xcd
   [80234b2a] sys_open+0x1a/0x1d
   [80262fd9] tracesys+0xd0/0xdb

We know this is a false positive because the NFS sockets are not
exported to user-space and therefore #1 can't happen.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: IPSec + large packets being corrupted

2006-07-13 Thread Herbert Xu

Chris Audley [EMAIL PROTECTED] wrote:
 
 Large packet (eg. 1600 byte ping) received by VPN server A.
 Packet encrypted and fragmented then sent from Server A to Server B.
 Packet received by network subsystem on B and frag_list created
 ah_input() strips the AH header -- frag sizes are not changed!
 esp_input() decrypts data
 ip_fragment() uses existing frag_list sizes from before the AH
  header being stripped, and sends too much data (16 bytes extra). This
  breaks the checksum and packets get dropped by destination host.

Aha, this sounds exactly like the bug I fixed today for Marco Berizzi.
The following patch should fix the problem for you.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
[NET]: Update frag_list in pskb_trim

When pskb_trim has to defer to ___pskb_trim to trim the frag_list part of
the packet, the frag_list is not updated to reflect the trimming.  This
will usually work fine until you hit something that uses the packet length
or tail from the frag_list.

Examples include esp_output and ip_fragment.

Another problem caused by this is that you can end up with a linear packet
with a frag_list attached.

It is possible to get away with this if we audit everything to make sure
that they always consult skb->len before going down onto frag_list.  In
fact we can do the same thing for the paged part as well to avoid copying
the data area of the skb.  For now though, let's do the conservative fix
and update frag_list.

Many thanks to Marco Berizzi for helping me to track down this bug.

This 4-year old bug took 3 months to track down.  Marco was very patient
indeed :)

Signed-off-by: Herbert Xu [EMAIL PROTECTED]

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 44f6a18..476aa39 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -257,11 +257,11 @@ nodata:
 }
 
 
-static void skb_drop_fraglist(struct sk_buff *skb)
+static void skb_drop_list(struct sk_buff **listp)
 {
-   struct sk_buff *list = skb_shinfo(skb)-frag_list;
+   struct sk_buff *list = *listp;
 
-   skb_shinfo(skb)-frag_list = NULL;
+   *listp = NULL;
 
do {
struct sk_buff *this = list;
@@ -270,6 +270,11 @@ static void skb_drop_fraglist(struct sk_
} while (list);
 }
 
+static inline void skb_drop_fraglist(struct sk_buff *skb)
+{
+   skb_drop_list(skb_shinfo(skb)-frag_list);
+}
+
 static void skb_clone_fraglist(struct sk_buff *skb)
 {
struct sk_buff *list;
@@ -830,41 +835,75 @@ free_skb:
 
 int ___pskb_trim(struct sk_buff *skb, unsigned int len)
 {
+   struct sk_buff **fragp;
+   struct sk_buff *frag;
int offset = skb_headlen(skb);
int nfrags = skb_shinfo(skb)-nr_frags;
int i;
+   int err;
+
+   if (skb_cloned(skb) 
+   unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC
+   return err;
 
for (i = 0; i  nfrags; i++) {
int end = offset + skb_shinfo(skb)-frags[i].size;
-   if (end  len) {
-   if (skb_cloned(skb)) {
-   if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
-   return -ENOMEM;
-   }
-   if (len = offset) {
-   put_page(skb_shinfo(skb)-frags[i].page);
-   skb_shinfo(skb)-nr_frags--;
-   } else {
-   skb_shinfo(skb)-frags[i].size = len - offset;
-   }
+
+   if (end  len) {
+   offset = end;
+   continue;
}
-   offset = end;
+
+   if (len  offset)
+   skb_shinfo(skb)-frags[i++].size = len - offset;
+
+   skb_shinfo(skb)-nr_frags = i;
+
+   for (; i  nfrags; i++)
+   put_page(skb_shinfo(skb)-frags[i].page);
+
+   if (skb_shinfo(skb)-frag_list)
+   skb_drop_fraglist(skb);
+   break;
}
 
-   if (offset  len) {
+   for (fragp = skb_shinfo(skb)-frag_list; (frag = *fragp);
+fragp = frag-next) {
+   int end = offset + frag-len;
+
+   if (skb_shared(frag)) {
+   struct sk_buff *nfrag;
+
+   nfrag = skb_clone(frag, GFP_ATOMIC);
+   if (unlikely(!nfrag))
+   return -ENOMEM;
+
+   nfrag-next = frag-next;
+   frag = nfrag;
+   *fragp = frag;
+   }
+
+   if (end  len) {
+   offset = end;
+   continue;
+   }
+
+   if (end  len 
+   unlikely((err = pskb_trim(frag, len - offset

Re: [Bugme-new] [Bug 6430] New: ipsec tunnel : reply is not forwarded

2006-07-13 Thread Herbert Xu

Raphael Astier [EMAIL PROTECTED] wrote:
  
 On GW1 :
 #setkey -f 
 flush;
 spdflush;
 add 192.168.1.1 192.168.1.2 esp 1000 -m tunnel -E des-cbc 12345678;
 spdadd 10.0.0.0/24 11.0.0.0/24 any -P out ipsec
 esp/tunnel/192.168.1.1-192.168.1.2/require;
 
 On GW2 : (only need to have SPI to decrypt packets coming from GW1) 
 #!/usr/local/sbin/setkey -f
 flush;
 spdflush;
 add -n 192.168.1.1 192.168.1.2 esp 1000 -m tunnel -E des-cbc 12345678;

This can't possibly work since inbound policies are required for
tunnel-mode SAs (otherwise people can send packets with arbitrary
source addresses once they have a tunnel-mode SA with you).

So you need at least 1 more policy on GW1 and 2 policies on GW2
for this to have a chance of working.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: jumbo frames and memory fragmentation

2006-07-13 Thread Herbert Xu

Chris Friesen [EMAIL PROTECTED] wrote:
 Herbert Xu wrote:
 
 Either upgrade your kernel or backport the page-splitting code in the
 current tree.  That's really the only sane solution for jumbo packets.
 
 Looking at the page-splitting code, it says 82571 and greater support 
 packet-split  We're running the 82546GB device.  Looks like it 
 won't help me.

Well, time to fork out for a new card then :)
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: possible dos / wsize affected frozen connection length

2006-07-13 Thread Herbert Xu

CaT [EMAIL PROTECTED] wrote:
 
 I'm just wondering if connections hanging around this long are normal.
 The above has now been running for 6 days. netstat is still reporting an
 established session. netcat has not timed out. It's all just sitting
 there doing nothing.

TCP connections without keepalives can sit there for all eternity,
if your machine lasts that long :)
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: 2.6.18-rc1-mm2

2006-07-14 Thread Herbert Xu

On Fri, Jul 14, 2006 at 12:05:51AM -0700, Andrew Morton wrote:

  Call Trace:
[8026963e] show_trace+0xae/0x265
[8026980a] dump_stack+0x15/0x1b
[8043ba7b] skb_checksum_help+0x61/0x126
[8802f35f] :iptable_nat:ip_nat_fn+0x5f/0x1d2

This is telling you that there is a bug in ip_nat_fn in that it completes
the partial checksum even for TSO packets which will cause them to go
out with bogus checksums.

The warning also indicates that the system has detected this and has
worked around it by recomputing the partial checksum after NAT.

The warning is here so someone can fix NAT to not trash the partial
checksum.  It would also tell us if anyone else breaks checksums in
this way.

I've already made the warning appear only once per-boot so I'd really
like to keep it in until

1) NAT is fixed.
2) We're reasonably sure there's nothing else doing this.

Prior to this change your TSO packets would've gone out with corrupted
checksums silently.  Essentially TSO would only slow your machine down
since every transmission it makes has to be retransmitted as non-TSO.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [NET]: Update frag_list in pskb_trim

2006-07-14 Thread Herbert Xu

On Thu, Jul 13, 2006 at 10:05:26PM -0700, David Miller wrote:
 
 As I noted already, this is in my tree and will go off to
 Linus soon.
 
 Please toss this over to -stable under separate cover, if
 you haven't done so already.  Please add my signoff:
 
 Signed-off-by: David S. Miller [EMAIL PROTECTED]

Great.  Here we go:

[NET]: Update frag_list in pskb_trim

When pskb_trim has to defer to ___pskb_trim to trim the frag_list part of
the packet, the frag_list is not updated to reflect the trimming.  This
will usually work fine until you hit something that uses the packet length
or tail from the frag_list.

Examples include esp_output and ip_fragment.

Another problem caused by this is that you can end up with a linear packet
with a frag_list attached.

It is possible to get away with this if we audit everything to make sure
that they always consult skb->len before going down onto frag_list.  In
fact we can do the same thing for the paged part as well to avoid copying
the data area of the skb.  For now though, let's do the conservative fix
and update frag_list.

Many thanks to Marco Berizzi for helping me to track down this bug.

This 4-year old bug took 3 months to track down.  Marco was very patient
indeed :)

Signed-off-by: Herbert Xu [EMAIL PROTECTED]
Signed-off-by: David S. Miller [EMAIL PROTECTED]

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 44f6a18..476aa39 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -257,11 +257,11 @@ nodata:
 }
 
 
-static void skb_drop_fraglist(struct sk_buff *skb)
+static void skb_drop_list(struct sk_buff **listp)
 {
-   struct sk_buff *list = skb_shinfo(skb)-frag_list;
+   struct sk_buff *list = *listp;
 
-   skb_shinfo(skb)-frag_list = NULL;
+   *listp = NULL;
 
do {
struct sk_buff *this = list;
@@ -270,6 +270,11 @@ static void skb_drop_fraglist(struct sk_
} while (list);
 }
 
+static inline void skb_drop_fraglist(struct sk_buff *skb)
+{
+   skb_drop_list(skb_shinfo(skb)-frag_list);
+}
+
 static void skb_clone_fraglist(struct sk_buff *skb)
 {
struct sk_buff *list;
@@ -830,41 +835,75 @@ free_skb:
 
 int ___pskb_trim(struct sk_buff *skb, unsigned int len)
 {
+   struct sk_buff **fragp;
+   struct sk_buff *frag;
int offset = skb_headlen(skb);
int nfrags = skb_shinfo(skb)-nr_frags;
int i;
+   int err;
+
+   if (skb_cloned(skb) 
+   unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC
+   return err;
 
for (i = 0; i  nfrags; i++) {
int end = offset + skb_shinfo(skb)-frags[i].size;
-   if (end  len) {
-   if (skb_cloned(skb)) {
-   if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
-   return -ENOMEM;
-   }
-   if (len = offset) {
-   put_page(skb_shinfo(skb)-frags[i].page);
-   skb_shinfo(skb)-nr_frags--;
-   } else {
-   skb_shinfo(skb)-frags[i].size = len - offset;
-   }
+
+   if (end  len) {
+   offset = end;
+   continue;
}
-   offset = end;
+
+   if (len  offset)
+   skb_shinfo(skb)-frags[i++].size = len - offset;
+
+   skb_shinfo(skb)-nr_frags = i;
+
+   for (; i  nfrags; i++)
+   put_page(skb_shinfo(skb)-frags[i].page);
+
+   if (skb_shinfo(skb)-frag_list)
+   skb_drop_fraglist(skb);
+   break;
}
 
-   if (offset  len) {
+   for (fragp = skb_shinfo(skb)-frag_list; (frag = *fragp);
+fragp = frag-next) {
+   int end = offset + frag-len;
+
+   if (skb_shared(frag)) {
+   struct sk_buff *nfrag;
+
+   nfrag = skb_clone(frag, GFP_ATOMIC);
+   if (unlikely(!nfrag))
+   return -ENOMEM;
+
+   nfrag-next = frag-next;
+   frag = nfrag;
+   *fragp = frag;
+   }
+
+   if (end  len) {
+   offset = end;
+   continue;
+   }
+
+   if (end  len 
+   unlikely((err = pskb_trim(frag, len - offset
+   return err;
+
+   if (frag-next)
+   skb_drop_list(frag-next);
+   break;
+   }
+
+   if (len  skb_headlen(skb)) {
skb-data_len -= skb-len - len;
skb-len   = len

Re: [PATCH 08/10] MLSXFRM: Add security context to acquire messages using PF_KEY

2006-07-14 Thread Herbert Xu

On Fri, Jul 14, 2006 at 09:54:59AM -0400, James Morris wrote:
 
 Herbert, any review from you on this would be greatly appreciated.

OK, I'll try to have a look tomorrow (I'm GMT-4 at the moment).

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] clear skb cb on IP input

2006-07-15 Thread Herbert Xu

David Miller [EMAIL PROTECTED] wrote:
 
 Thank goodness this thing is only 3-words in size, this is going to
 run on every single IPv4 packet received by the system. :-/

At least this lets us get rid of a few other memsets :)

[IPV4]: Get rid of redundant IPCB->opts initialisation

Now that we always zero the IPCB->opts in ip_rcv, it is no longer
necessary to do so before calling netif_rx for tunneled packets.

Signed-off-by: Herbert Xu [EMAIL PROTECTED]

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 6ff9b10..0f9b3a3 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -617,7 +617,6 @@ static int ipgre_rcv(struct sk_buff *skb
skb-mac.raw = skb-nh.raw;
skb-nh.raw = __pskb_pull(skb, offset);
skb_postpull_rcsum(skb, skb-h.raw, offset);
-   memset((IPCB(skb)-opt), 0, sizeof(struct ip_options));
skb-pkt_type = PACKET_HOST;
 #ifdef CONFIG_NET_IPGRE_BROADCAST
if (MULTICAST(iph-daddr)) {
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index cbcae65..406056e 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -256,7 +256,6 @@ int ip_options_compile(struct ip_options
 
if (!opt) {
opt = (IPCB(skb)-opt);
-   memset(opt, 0, sizeof(struct ip_options));
iph = skb-nh.raw;
opt-optlen = ((struct iphdr *)iph)-ihl*4 - sizeof(struct 
iphdr);
optptr = iph + sizeof(struct iphdr);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 3291d51..76ab50b 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -487,7 +487,6 @@ static int ipip_rcv(struct sk_buff *skb)
 
skb-mac.raw = skb-nh.raw;
skb-nh.raw = skb-data;
-   memset((IPCB(skb)-opt), 0, sizeof(struct ip_options));
skb-protocol = htons(ETH_P_IP);
skb-pkt_type = PACKET_HOST;
 
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index ba33f86..9ccacf5 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1461,7 +1461,6 @@ int pim_rcv_v1(struct sk_buff * skb)
skb_pull(skb, (u8*)encap - skb-data);
skb-nh.iph = (struct iphdr *)skb-data;
skb-dev = reg_dev;
-   memset((IPCB(skb)-opt), 0, sizeof(struct ip_options));
skb-protocol = htons(ETH_P_IP);
skb-ip_summed = 0;
skb-pkt_type = PACKET_HOST;
@@ -1517,7 +1516,6 @@ static int pim_rcv(struct sk_buff * skb)
skb_pull(skb, (u8*)encap - skb-data);
skb-nh.iph = (struct iphdr *)skb-data;
skb-dev = reg_dev;
-   memset((IPCB(skb)-opt), 0, sizeof(struct ip_options));
skb-protocol = htons(ETH_P_IP);
skb-ip_summed = 0;
skb-pkt_type = PACKET_HOST;
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index f8d880b..13cafbe 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -92,7 +92,6 @@ static int xfrm4_tunnel_input(struct xfr
skb-mac.raw = memmove(skb-data - skb-mac_len,
   skb-mac.raw, skb-mac_len);
skb-nh.raw = skb-data;
-   memset((IPCB(skb)-opt), 0, sizeof(struct ip_options));
err = 0;
 
 out:
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index c56aeec..836eecd 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -380,7 +380,6 @@ static int ipip6_rcv(struct sk_buff *skb
secpath_reset(skb);
skb-mac.raw = skb-nh.raw;
skb-nh.raw = skb-data;
-   memset((IPCB(skb)-opt), 0, sizeof(struct ip_options));
IPCB(skb)-flags = 0;
skb-protocol = htons(ETH_P_IPV6);
skb-pkt_type = PACKET_HOST;
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCHv2 2.6.18-rc1-mm2 1/3] net: UDP-Lite generic support

2006-07-15 Thread Herbert Xu

Gerrit Renker [EMAIL PROTECTED] wrote:

 diff -Nurp  a/net/core/sock.c b/net/core/sock.c
 --- a/net/core/sock.c   2006-07-06 09:08:24.0 +0100
 +++ b/net/core/sock.c   2006-07-14 10:17:50.0 +0100
 @@ -479,7 +479,12 @@ set_rcvbuf:
break;
 
case SO_NO_CHECK:
 -   sk-sk_no_check = valbool;
 +   /* UDP-Lite (RFC 3828) mandates checksumming,
 +* hence user must not enable this option.   */
 +   if (sk-sk_protocol == IPPROTO_UDPLITE)
 +   ret = -EOPNOTSUPP;
 +   else
 +   sk-sk_no_check = valbool;

Please don't add protocol-specific stuff to generic functions.  In this
case why don't you just ignore sk_no_check for UDPLITE as we do for TCP?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] clear skb cb on IP input

2006-07-16 Thread Herbert Xu

On Sat, Jul 15, 2006 at 06:12:22PM -0700, David Miller wrote:
 
 But I'm beginning to think that the onus of this may in fact fall upon
 the devices, in fact.  Loopback is one of the few devices where the
 control block might not be cleared out, due to uses in the output
 path.  Devices predominantly provide a zero'd out control block in the
 skb on packet receive.

The thing is qdiscs using cb means that this method of clearing cb
before netif_rx doesn't work anymore.

In particular, even if loopback clears cb before calling netif_rx,
some qdisc could come along between netif_rx and ip_rcv and put
stuff in the cb.

The same thing can happen to any NIC in fact, as long as we allow
qdiscs to use the cb area without clearing it, ip_rcv needs to
clear it itself.

With a little bit of effort we should be able to get away with
clearing just optlen.  Whether this effort is worthwhile I don't
know :)

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [stable] [NET]: Update frag_list in pskb_trim

2006-07-17 Thread Herbert Xu

On Mon, Jul 17, 2006 at 08:22:44AM -0700, Greg KH wrote:

 Ick, this doesn't apply to 2.6.17, care to rediff it?  I don't trust
 myself to get it correct :)

Oops, I thought I rediffed against 2.6.17, but it must've been
something else.

Here is a second attempt:

[NET]: Update frag_list in pskb_trim

When pskb_trim has to defer to ___pskb_trim to trim the frag_list part of
the packet, the frag_list is not updated to reflect the trimming.  This
will usually work fine until you hit something that uses the packet length
or tail from the frag_list.

Examples include esp_output and ip_fragment.

Another problem caused by this is that you can end up with a linear packet
with a frag_list attached.

It is possible to get away with this if we audit everything to make sure
that they always consult skb->len before going down onto frag_list.  In
fact we can do the same thing for the paged part as well to avoid copying
the data area of the skb.  For now though, let's do the conservative fix
and update frag_list.

Many thanks to Marco Berizzi for helping me to track down this bug.

This 4-year old bug took 3 months to track down.  Marco was very patient
indeed :)

Signed-off-by: Herbert Xu [EMAIL PROTECTED]
Signed-off-by: David S. Miller [EMAIL PROTECTED]

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f8f2347..2c31bb0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -967,15 +967,16 @@ #ifndef NET_SKB_PAD
 #define NET_SKB_PAD16
 #endif
 
-extern int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc);
+extern int ___pskb_trim(struct sk_buff *skb, unsigned int len);
 
 static inline void __skb_trim(struct sk_buff *skb, unsigned int len)
 {
-   if (!skb-data_len) {
-   skb-len  = len;
-   skb-tail = skb-data + len;
-   } else
-   ___pskb_trim(skb, len, 0);
+   if (unlikely(skb-data_len)) {
+   WARN_ON(1);
+   return;
+   }
+   skb-len  = len;
+   skb-tail = skb-data + len;
 }
 
 /**
@@ -985,6 +986,7 @@ static inline void __skb_trim(struct sk_
  *
  * Cut the length of a buffer down by removing data from the tail. If
  * the buffer is already under the length specified it is not modified.
+ * The skb must be linear.
  */
 static inline void skb_trim(struct sk_buff *skb, unsigned int len)
 {
@@ -995,12 +997,10 @@ static inline void skb_trim(struct sk_bu
 
 static inline int __pskb_trim(struct sk_buff *skb, unsigned int len)
 {
-   if (!skb-data_len) {
-   skb-len  = len;
-   skb-tail = skb-data+len;
-   return 0;
-   }
-   return ___pskb_trim(skb, len, 1);
+   if (skb-data_len)
+   return ___pskb_trim(skb, len);
+   __skb_trim(skb, len);
+   return 0;
 }
 
 static inline int pskb_trim(struct sk_buff *skb, unsigned int len)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fb3770f..40f108e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -250,11 +250,11 @@ nodata:
 }
 
 
-static void skb_drop_fraglist(struct sk_buff *skb)
+static void skb_drop_list(struct sk_buff **listp)
 {
-   struct sk_buff *list = skb_shinfo(skb)-frag_list;
+   struct sk_buff *list = *listp;
 
-   skb_shinfo(skb)-frag_list = NULL;
+   *listp = NULL;
 
do {
struct sk_buff *this = list;
@@ -263,6 +263,11 @@ static void skb_drop_fraglist(struct sk_
} while (list);
 }
 
+static inline void skb_drop_fraglist(struct sk_buff *skb)
+{
+   skb_drop_list(skb_shinfo(skb)-frag_list);
+}
+
 static void skb_clone_fraglist(struct sk_buff *skb)
 {
struct sk_buff *list;
@@ -800,49 +805,80 @@ struct sk_buff *skb_pad(struct sk_buff *
return nskb;
 }  
  
-/* Trims skb to length len. It can change skb pointers, if realloc is 1.
- * If realloc==0 and trimming is impossible without change of data,
- * it is BUG().
+/* Trims skb to length len. It can change skb pointers.
  */
 
-int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc)
+int ___pskb_trim(struct sk_buff *skb, unsigned int len)
 {
+   struct sk_buff **fragp;
+   struct sk_buff *frag;
int offset = skb_headlen(skb);
int nfrags = skb_shinfo(skb)-nr_frags;
int i;
+   int err;
+
+   if (skb_cloned(skb) 
+   unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC
+   return err;
 
for (i = 0; i  nfrags; i++) {
int end = offset + skb_shinfo(skb)-frags[i].size;
-   if (end  len) {
-   if (skb_cloned(skb)) {
-   BUG_ON(!realloc);
-   if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
-   return

Re: [PATCH] Bug in pskb_trim_rcsum()

2006-07-18 Thread Herbert Xu

On Tue, Jul 18, 2006 at 09:09:34AM +0800, Wei Yongjun wrote:

 And in my test, UDP under IPv4 maybe do that.
 My UDP packet is:
 
 packet1:
  ___
 | Source Port   | Dest Port   |
 |_|_|
 | Length = 16   | Checksum(*1)  |
 |_|_|
 |  payload24  |
 |__|

The whole point of CHECKSUM_UNNECESSARY is that the hardware parses
the protocol header for us.  So in this case it must calculate the
checksum for only the first 8 bytes of the payload.

If it does this incorrectly, then it doesn't support RX checksums at
all.

Which NIC is doing this BTW?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] Bug in pskb_trim_rcsum()

2006-07-18 Thread Herbert Xu

On Tue, Jul 18, 2006 at 04:54:39PM +0400, Alexey Kuznetsov wrote:

 I preferred optimistic approach: if the checksum comes out correct,
 we do not really care, how device calculated it. Probably, it calculated
 checksum over wrong data, but got a good checksum. So what? It is
 not a crypto digest yet. And if device found wrong checksum, we will
 recalculate it anyway.

Agreed.

 I would like to add that CHECKSUM_UNNECESSARY can be used, when
 checksum is really wrong (on loopback), that's why it is not cleared,
 when trimming. CHECKSUM_HW can always fall back to CHECKSUM_NONE,
 but CHECKSUM_UNNECESSARY cannot. Probably, this was bad idea, but
 it still means that if some generic function starts to clear it,
 all the code using it should be reverified.

Actually, I plan to differentiate between RX CHECKSUM_HW and TX
CHECKSUM_HW.  Now that we have things like Xen it is possible for
RX packets to have partial checksums too.

When this is done loopback can send TX CHECKSUM_HW packets instead
of CHECKSUM_UNNECESSARY (I'm currently calling this CHECKSUM_PARTIAL).

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC PATCH 32/33] Add the Xen virtual network device driver.

2006-07-18 Thread Herbert Xu

jamal [EMAIL PROTECTED] wrote:

 I dont think the ifup/ifconfig provide operational status (i.e link
 up/down) - or do they? If they can be made to invoke scripts in such
 a case then we are set.

In fact, that's a very good reason why this shouldn't be in netfront.
Indeed, it shouldn't be in the guest at all.  The reason is that the
guest has no idea whether the physical carrier is present.

It's much better for the host to send the ARP packet on behalf of the
guest since the host knows the carrier status and the guest's MAC
address.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC PATCH 32/33] Add the Xen virtual network device driver.

2006-07-18 Thread Herbert Xu

John Haller [EMAIL PROTECTED] wrote:

 But sending ARPs is not the right thing if the guest is expecting
 to use IPv6 networking, in which case unsolicited neighbor
 advertisements are the right thing to do.  The driver just
 doesn't seem to be the right place to do this, as it doesn't/
 shouldn't need to know the difference between IPv4/IPv6.

In this case it doesn't really matter because AFAIK they're
trying to get switches to notice that the MAC has moved.  So
all you need is some packet that the switches can grok.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] clear skb cb on IP input

2006-07-18 Thread Herbert Xu

On Tue, Jul 18, 2006 at 08:19:34PM +0200, Guillaume Chazarain wrote:

 Why not clearing the whole IPCB(skb) instead of
 just IPCB(skb)->opts? that would also clear
 IPCB(skb)->flags.

I agree, we should clear the whole IPCB.

 And, does not ipv6 need the same treatment with
 IP6CB?

Probably.  Patches are welcome :)

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [RFC PATCH 32/33] Add the Xen virtual network device driver.

2006-07-18 Thread Herbert Xu

Stephen Hemminger [EMAIL PROTECTED] wrote:
 
 diff -r eadc12b20f35 drivers/xen/netfront/netfront.c
 --- /dev/null Thu Jan 01 00:00:00 1970 +
 +++ b/drivers/xen/netfront/netfront.c Fri Jun 09 15:03:12 2006 -0400
 @@ -0,0 +1,1584 @@
 
 +static inline void init_skb_shinfo(struct sk_buff *skb)
 +{
 + atomic_set((skb_shinfo(skb)-dataref), 1);
 + skb_shinfo(skb)-nr_frags = 0;
 + skb_shinfo(skb)-frag_list = NULL;
 +}
 
 Shouldn't this move to skbuff.h?

If and when my dom0=>domU GSO patches are applied, this will simply
disappear.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: michael_mic in crypto api?

2006-07-19 Thread Herbert Xu

Michael Wu [EMAIL PROTECTED] wrote:

 Simplicity and consistency. Whereas the relatively simple mic part of the 
 TKIP 
 algorithm is in crypto API, the (more important, more complicated) key mixing 
 part is not in crypto api. It is unlikely that either the mic or key mixing 
 part would be used separately or even outside of TKIP/802.11i code, and we 
 don't want to encourage people anyways since they're just bandaids for 
 problems associated with using rc4.

Sure, I don't mind either way.  I think Jouni wrote this originally,
maybe he can share his thoughts with us?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: michael_mic in crypto api?

2006-07-19 Thread Herbert Xu

Jouni Malinen [EMAIL PROTECTED] wrote:

 However, at least for some time, there are two different TKIP
 implementations (net/ieee80211 and net/d80211) so this would mean
 duplicating Michael MIC implementation and I would rather not do that.

Good point, let's keep it for now.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: A question about linux/net/ipv4/ipcomp.c

2006-07-21 Thread Herbert Xu

Igor V. Liferenko [EMAIL PROTECTED] wrote:

 Would you please say why it's 60, and not 52?

The header length / 4 must fit within a single hexadecimal
digit.  Therefore the maximum is 15 * 4 = 60.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [XFRM]: Fix protocol field value for outgoing IPv6 GSO packets

2006-07-24 Thread Herbert Xu

On Tue, Jul 25, 2006 at 02:09:26AM +0200, Patrick McHardy wrote:
 This appears to be a mistake, but I didn't follow the GSO stuff
 very closely, so there could be some non-obvious reason.

Yes it definitely was a mistake! Thanks for picking this up Patrick.

 [XFRM]: Fix protocol field value for outgoing IPv6 GSO packets
 
 Signed-off-by: Patrick McHardy [EMAIL PROTECTED]

Acked-by: Herbert Xu [EMAIL PROTECTED]

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] ip multicast route bug fix

2006-07-25 Thread Herbert Xu

Stephen Hemminger [EMAIL PROTECTED] wrote:

 @@ -1593,12 +1594,19 @@ int ipmr_get_route(struct sk_buff *skb, 
read_unlock(mrt_lock);
return -ENODEV;
}
 -   skb-nh.raw = skb_push(skb, sizeof(struct iphdr));
 -   skb-nh.iph-ihl = sizeof(struct iphdr)2;
 -   skb-nh.iph-saddr = rt-rt_src;
 -   skb-nh.iph-daddr = rt-rt_dst;
 -   skb-nh.iph-version = 0;
 -   err = ipmr_cache_unresolved(vif, skb);
 +   
 +   iskb = alloc_skb(sizeof(struct iphdr), GFP_KERNEL);
 +   if (!iskb) {
 +   read_unlock(mrt_lock);
 +   return -ENOMEM;
 +   }
 +   memset(iskb-data, 0, sizeof(struct iphdr));
 +   iskb-nh.raw = iskb-data;
 +   iskb-nh.iph-ihl = sizeof(struct iphdr)2;
 +   iskb-nh.iph-saddr = rt-rt_src;
 +   iskb-nh.iph-daddr = rt-rt_dst;
 +
 +   err = ipmr_cache_unresolved(vif, iskb);

I'm afraid this is still broken in a different way.

If ipmr_cache_unresolved queues the skb onto the unresolved list, then
it's going to try to use the skb as a netlink skb instead :)

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [IPROUTE]: Add support for multipath route realms

2006-07-25 Thread Herbert Xu

Patrick McHardy [EMAIL PROTECTED] wrote:
 
 [IPROUTE]: Add support for multipath route realms
 
 Routing realms exist per nexthop, but iproute currently only allows to send
 a single route realm, which is refused by the kernel for multipath routes.
 Add support for specifying per nexthop realms. Old kernels only return the
 first realm back to userspace when dumping, so the others can't be displayed,
 besides that it will also behave correctly on old kernels.
 
 old kernel:
 
 1.2.3.4 realm 1
nexthop dev dummy0 weight 1
nexthop dev dummy1 weight 1
nexthop dev dummy2 weight 1
nexthop dev dummy3 weight 1
 
 new kernel:
 
 1.2.3.4
nexthop realm 1 dev dummy0 weight 1
nexthop realm 2 dev dummy1 weight 1
nexthop realm 3 dev dummy2 weight 1
nexthop realm 4 dev dummy3 weight 1

This really looks like papering over fundamental brokenness of
IP_ROUTE_MULTIPATH_CACHED since you wouldn't otherwise get these
entries in the routing cache.

This reminds me that I better revisit the reasons that people gave
for actually using IP_ROUTE_MULTIPATH_CACHED the last time we tried
to get rid of it.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [IPROUTE]: Add support for multipath route realms

2006-07-25 Thread Herbert Xu

On Tue, Jul 25, 2006 at 06:19:33PM +1000, Herbert Xu wrote:

  new kernel:
  
  1.2.3.4
 nexthop realm 1 dev dummy0 weight 1
 nexthop realm 2 dev dummy1 weight 1
 nexthop realm 3 dev dummy2 weight 1
 nexthop realm 4 dev dummy3 weight 1
 
 This really looks like papering over fundamental brokenness of
 IP_ROUTE_MULTIPATH_CACHED since you wouldn't otherwise get these
 entries in the routing cache.

Nevermind, I misread your changelog.  Your patch is obviously
not related to IP_ROUTE_MULTIPATH_CACHED :)
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH] ip multicast route bug fix

2006-07-25 Thread Herbert Xu

Alexey Kuznetsov [EMAIL PROTECTED] wrote:
 
 I think you mean this.
 
 Note, it is real skb_clone(), not alloc_skb(). Equeued skb contains
 the whole half-prepared netlink message plus room for the rest.
 It could be also skb_copy(), if we want to be puristic about mangling
 cloned data, but original copy is really not going to be used.  

I like this.  However, since the cloned skb is either discarded in case
of error, or queued in which case the caller discards its reference right
away, wouldn't it be simpler to just do this?

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index ba33f86..0a2af08 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1593,6 +1593,7 @@ int ipmr_get_route(struct sk_buff *skb, 
read_unlock(mrt_lock);
return -ENODEV;
}
+   skb_get(skb);
skb-nh.raw = skb_push(skb, sizeof(struct iphdr));
skb-nh.iph-ihl = sizeof(struct iphdr)2;
skb-nh.iph-saddr = rt-rt_src;
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: BUGs in skb_checksum_help() and skb_gso_segment() in 2.6.18-rc2

2006-07-25 Thread Herbert Xu

Hi Patrick:

On Wed, Jul 26, 2006 at 05:38:07AM +0200, Patrick McHardy wrote:
 
 I have a patch which changes netfilter to do incremental checksumming.
 The hook number is passed to all functions doing this so they know
 how to update the checksum. Could you explain how
 CHECKSUM_COMPLETE/CHECKSUM_PARTIAL are going to be used? I assume
 they're meant to avoid passing hook numbers around everywhere?

Yes the hook number is another way to solve the same problem.  However,
it can only be used within netfilter.  CHECKSUM_COMPLETE/CHECKSUM_PARTIAL
on the other hand are valid throughout the stack.  With Xen feeding Linux
packets into the stack the netfilter hook is also no longer sufficient to
distinguish between these two cases as partial checksum packets can now
appear on receive.

The problem is that you need to do different incremental updates depending
on whether the checksum is complete (i.e., CHECKSUM_HW on receive), or
partial (i.e., CHECKSUM_HW on transmit).

With complete checksums the current update code in netfilter can be used
as is.  With partial checksums you need to exclude bits which weren't
used when computing the partial checksums (e.g., TCP port numbers need
to be excluded, but the IP address needs to be included for NAT).

I have a patch that adds CHECKSUM_COMPLETE/CHECKSUM_PARTIAL if you want
something to work from.  Let me know if you want this and I'll bounce it
to you.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH Round 4 2/3] Core network changes to support network event notification.

2006-07-25 Thread Herbert Xu

On Tue, Jul 25, 2006 at 10:05:40AM -0500, Steve Wise wrote:
 
 But they really are seeing a delete followed by an add.  That's what the
 kernel is doing.

Actually that's the other thing I don't really like.  The user-space
monitor may perceive that a route was actually deleted and replaced
by a new one even though this isn't what's happening at all.

In fact the problem here is that you're sending route notifications
when it's really the dst_entry that's changing.  User-space as it
stands only gets notifications about fib changes which is quite different
from changes to the transient dst_entry objects which only exist in the
route cache.

Is anyone actually going to use the user-space interface of this? If not
perhaps we should wait until someone really needs it before adding the
netlink part of the patch.

We can change the kernel interface at will so if we make a mistake with
netevent it can be easily corrected.  For user-space though the rules
are totally different.  I'd really hate to be stuck with an interface
which turns out to not be the one that people actually want to have.

 The rdma driver needs to update all established rdma connections that
 are using the next-hop information of the existing route and make them
 use the next-hop information of the new route.  In addition, the rdma
 driver might have a reference to the old dst entry.  So it can release
 that ref and add a ref to the new dst entry.

Do you really need the old route for the user-space part of your patch?

 I have to admit I'm a little fuzzy on the routing stuff.  The main
 netevents I've utilized in the rdma driver I'm writing is the
 neighbour update event and the redirect event.  Route add/del was added
 for completeness of routing netevents.   

So you mean you aren't going to use the route notifications? In that case
we should probably just drop them and add them when someone actually needs
it.  At that point they can tell us what semantics they want from it :)

 Can you expand further or point me to code where the IP stack flushes
 its tables when routes are changed?

Grep for rt_cache_flush in net/ipv4/fib_hash.c.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: BUGs in skb_checksum_help() and skb_gso_segment() in 2.6.18-rc2

2006-07-25 Thread Herbert Xu

On Wed, Jul 26, 2006 at 06:01:40AM +0200, Patrick McHardy wrote:
 
 Please send it, I'll update my patch based on that. Thanks.

Here it is, it sits on top of

commit ca6bb5d7ab22ac79f608fe6cbc6b12de6a5a19f0
Author: David Woodhouse [EMAIL PROTECTED]
Date:   Thu Jun 22 16:07:52 2006 -0700

[NET]: Require CAP_NET_ADMIN to create tuntap devices.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
61a015eb86469404587e910e9b852fc35ce436b8
diff --git a/drivers/atm/he.c b/drivers/atm/he.c
index fde9334..601e7ee 100644
--- a/drivers/atm/he.c
+++ b/drivers/atm/he.c
@@ -1913,7 +1913,7 @@ #endif
skb-tail = skb-data + skb-len;
 #ifdef USE_CHECKSUM_HW
if (vcc-vpi == 0  vcc-vci = 
ATM_NOT_RSV_VCI) {
-   skb-ip_summed = CHECKSUM_HW;
+   skb-ip_summed = CHECKSUM_COMPLETE;
skb-csum = TCP_CKSUM(skb-data,
he_vcc-pdu_len);
}
diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c
index e277789..15dcd4e 100644
--- a/drivers/net/3c59x.c
+++ b/drivers/net/3c59x.c
@@ -2243,7 +2243,7 @@ boomerang_start_xmit(struct sk_buff *skb
 
vp-tx_ring[entry].next = 0;
 #if DO_ZEROCOPY
-   if (skb-ip_summed != CHECKSUM_HW)
+   if (skb-ip_summed != CHECKSUM_PARTIAL)
vp-tx_ring[entry].status = cpu_to_le32(skb-len | 
TxIntrUploaded);
else
vp-tx_ring[entry].status = cpu_to_le32(skb-len | 
TxIntrUploaded | AddTCPChksum | AddUDPChksum);
diff --git a/drivers/net/8139cp.c b/drivers/net/8139cp.c
index ad0c8c3..4f566d8 100644
--- a/drivers/net/8139cp.c
+++ b/drivers/net/8139cp.c
@@ -809,7 +809,7 @@ #endif
 
if (mss)
flags |= LargeSend | ((mss  MSSMask)  MSSShift);
-   else if (skb-ip_summed == CHECKSUM_HW) {
+   else if (skb-ip_summed == CHECKSUM_PARTIAL) {
const struct iphdr *ip = skb-nh.iph;
if (ip-protocol == IPPROTO_TCP)
flags |= IPCS | TCPCS;
@@ -863,7 +863,7 @@ #endif
if (mss)
ctrl |= LargeSend |
((mss  MSSMask)  MSSShift);
-   else if (skb-ip_summed == CHECKSUM_HW) {
+   else if (skb-ip_summed == CHECKSUM_PARTIAL) {
if (ip-protocol == IPPROTO_TCP)
ctrl |= IPCS | TCPCS;
else if (ip-protocol == IPPROTO_UDP)
@@ -894,7 +894,7 @@ #endif
txd-addr = cpu_to_le64(first_mapping);
wmb();
 
-   if (skb-ip_summed == CHECKSUM_HW) {
+   if (skb-ip_summed == CHECKSUM_PARTIAL) {
if (ip-protocol == IPPROTO_TCP)
txd-opts1 = cpu_to_le32(first_eor | first_len |
 FirstFrag | DescOwn |
diff --git a/drivers/net/acenic.c b/drivers/net/acenic.c
index 23ff22b..3ab0e76 100644
--- a/drivers/net/acenic.c
+++ b/drivers/net/acenic.c
@@ -2041,7 +2041,7 @@ static void ace_rx_int(struct net_device
 */
if (bd_flags  BD_FLG_TCP_UDP_SUM) {
skb-csum = htons(csum);
-   skb-ip_summed = CHECKSUM_HW;
+   skb-ip_summed = CHECKSUM_COMPLETE;
} else {
skb-ip_summed = CHECKSUM_NONE;
}
@@ -2512,7 +2512,7 @@ restart:
 
mapping = ace_map_tx_skb(ap, skb, skb, idx);
flagsize = (skb-len  16) | (BD_FLG_END);
-   if (skb-ip_summed == CHECKSUM_HW)
+   if (skb-ip_summed == CHECKSUM_PARTIAL)
flagsize |= BD_FLG_TCP_UDP_SUM;
 #if ACENIC_DO_VLAN
if (vlan_tx_tag_present(skb)) {
@@ -2535,7 +2535,7 @@ #endif
 
mapping = ace_map_tx_skb(ap, skb, NULL, idx);
flagsize = (skb_headlen(skb)  16);
-   if (skb-ip_summed == CHECKSUM_HW)
+   if (skb-ip_summed == CHECKSUM_PARTIAL)
flagsize |= BD_FLG_TCP_UDP_SUM;
 #if ACENIC_DO_VLAN
if (vlan_tx_tag_present(skb)) {
@@ -2561,7 +2561,7 @@ #endif
   PCI_DMA_TODEVICE);
 
flagsize = (frag-size  16);
-   if (skb-ip_summed == CHECKSUM_HW)
+   if (skb-ip_summed == CHECKSUM_PARTIAL)
flagsize |= BD_FLG_TCP_UDP_SUM;
idx = (idx + 1) % ACE_TX_RING_ENTRIES(ap

Re: ipsec tunnel policy vs routing table

2006-07-27 Thread Herbert Xu

Marco Berizzi [EMAIL PROTECTED] wrote:
 
 172.16.0.0/23 dev eth2  proto kernel  scope link  src 172.16.1.1
 10.180.0.0/16 via 172.16.1.253 dev eth2
 10.0.0.0/8 via pub_ip dev eth0
 127.0.0.0/8 dev lo  scope link
 
 I have noticed that packets for 10.180.0.0/16 network
 are eaten by the ipsec tunnel because the policy allow
 them. Is there a way to deliver packets for 10.180.0.0
 network to the 172.16.1.253 router (because the route
 to 10.180.0.0 is more specific than 10.0.0.0/8)?

You need an IPsec pass action.  With Openswan you can do it with
something like

conn pass
left=%defaultroute
# This should be the leftsubnet of your 10.0.0.0/8 connection.
leftsubnet=0.0.0.0/0
# This field doesn't really matter.
right=172.16.1.253
rightsubnet=10.180.0.0/16
type=passthrough
authby=never
auto=route

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: ipsec tunnel policy vs routing table

2006-07-27 Thread Herbert Xu

On Thu, Jul 27, 2006 at 04:06:44PM +0200, Marco Berizzi wrote:
 
 conn pass
   left=172.16.1.1
   leftsubnet=172.16.0.0/23
   right=172.16.1.253
   rightsubnet=10.180.0./16
   type=passthrough
   authby=never
   auto=route
 
 After running 'ipsec auto --add pass  ipsec auto
 --route pass' openswan has eaten my static route
 inserted by hand:
 route add -net 10.180.0.0/16 gw 172.16.1.253
 Here is 'ip r s' output after 'ipsec auto --route
 pass':
 172.16.0.0/23 dev eth2  proto kernel  scope link  src 172.16.1.1
 10.180.0.0/16 dev eth2  scope link

Oh yeah, forgot about that :)

You can set the gateway using rightnexthop=172.16.1.253.

 All is fine now. It isn't even needed anymore to
 insert the static route now, as it is placed by
 openswan. My question is how linux understand that
 it should send packets for 10.180.0.0/24 to the
 172.16.1.253 router.

It doesn't really.  However your router might be proxy arping.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Regarding offloading IPv6 addrconf and ndisc

2006-07-27 Thread Herbert Xu

Kazunori Miyazawa [EMAIL PROTECTED] wrote:
 
 I'm interested in the approach. And I have a couple of comments.
 I think DAD and ND are time critical operations.
 Can the daemons process with confirming to the specs.
 even if it were swapped out?
 Can we prevent the oom killer from killing the daemons?

These are valid concerns.  However, if we can have things like ntpd
live in user-space without causing nuisance, then addrconf should be
fine as well.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Regarding offloading IPv6 addrconf and ndisc

2006-07-27 Thread Herbert Xu

On Thu, Jul 27, 2006 at 06:34:15PM -0700, David Miller wrote:
 
 I have severe doubts actually in this area.  And I have practical
 experience to back up these doubts in this specific case.

OK.

 Just moving the ipv6 address add/delete out of software interrupt
 context broke the TAHI and other ipv6 testsuites.
 
 The reason was simple.  Consider a simple test case that emits an
 NDISC packet that should cause an interface address to be added, and
 then it sends a packet which makes sure that host responds to that
 address.  We have those two packets in our queue, as packet A and
 B.

I'd like to know more about this test.  On the face of it this test seems
to be broken.  What if packet A was lost? Surely this shouldn't be used
as an indication that the target IPv6 stack is out-of-spec.

If we're really going to guarantee that NDISC processing is always going
to be synchronous, this imposes fairly nasty restrictions on what we can
do in future.  For instance, this would rule out having the NIC distribute
flows across CPUs as this would break the synchronicity of NDISC processing
vs. TCP processing.

 As a secondary reason not to even consider this, it's in the kernel
 already and therefore it is totally impractical to try and remove it.
 When considering new protocols or features, the user vs.  kernel
 argument is something to validly consider.  But when it's already
 there, it will have to live there basically for eternity.  It is not
 like some arbitrary internal kernel module symbol or interface we
 can deprecate over a 6 month period or something like that.

Fair enough.  I suppose another case in point is IPv4 autoconf which
is *still* in the kernel after all these years.

However, to draw an analogy we're kind of stuck in a bog here.  So
while we can't extricate ourselves easily, we should attempt to come
up with ways of eventually lifting us out.  We should also try to
avoid any actions that'll cause us to sink deeper :)

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
-
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[IPV6]: Audit all ip6_dst_lookup/ip6_dst_store calls

2006-07-28 Thread Herbert Xu

On Fri, Jul 28, 2006 at 07:45:31PM +, Matt Domsch wrote:
 Triggered on Fedora rawhide kernel-2.6.17-1.2462.fc6 x86_64 which is
 based on 2.6.18rc2-git6.  IPv6 was in use at the time.
 
 =
 [ INFO: inconsistent lock state ]
 -
 inconsistent {softirq-on-W} - {in-softirq-R} usage.
 swapper/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
  (sk-sk_dst_lock){---?}, at: [80418ef3]
  sk_dst_check+0x26/0x12b
 {softirq-on-W} state was registered at:
   [802a874d] lock_acquire+0x4a/0x69
   [802672a1] _write_lock+0x24/0x31
   [8044a26b] ip4_datagram_connect+0x2e1/0x350
   [80451214] inet_dgram_connect+0x57/0x65
   [8041652a] sys_connect+0x7d/0xa4
   [8025ff0d] system_call+0x7d/0x83

Thanks for the report.  This is actually a false positive because
these two paths can't intersect since one is UDP while the other
is TCP.

However, here is a patch which should shut up the validator as well
as removing unnecessary locking from most callers of ip6_dst_lookup.

[IPV6]: Audit all ip6_dst_lookup/ip6_dst_store calls

The current users of ip6_dst_lookup can be divided into two classes:

1) The caller holds no locks and is in user-context (UDP).
2) The caller does not want to lookup the dst cache at all.

The second class covers everyone except UDP because most people do
the cache lookup directly before calling ip6_dst_lookup.  This patch
adds ip6_sk_dst_lookup for the first class.

Similarly ip6_dst_store users can be divided into those that need to
take the socket dst lock and those that don't.  This patch adds
__ip6_dst_store for those (everyone except UDP/datagram) that don't
need an extra lock.

Signed-off-by: Herbert Xu [EMAIL PROTECTED]

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED]
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
--
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index ab29daf..96b0e66 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -139,16 +139,22 @@ extern rwlock_t rt6_lock;
 /*
  * Store a destination cache entry in a socket
  */
-static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst,
-struct in6_addr *daddr)
+static inline void __ip6_dst_store(struct sock *sk, struct dst_entry *dst,
+  struct in6_addr *daddr)
 {
struct ipv6_pinfo *np = inet6_sk(sk);
struct rt6_info *rt = (struct rt6_info *) dst;
 
-   write_lock(&sk->sk_dst_lock);
sk_setup_caps(sk, dst);
np->daddr_cache = daddr;
np->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+}
+
+static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst,
+struct in6_addr *daddr)
+{
+   write_lock(&sk->sk_dst_lock);
+   __ip6_dst_store(sk, dst, daddr);
write_unlock(&sk->sk_dst_lock);
 }
 
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index a8fdf79..ece7e8a 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -468,6 +468,9 @@ extern void ip6_flush_pending_frames(s
 extern int ip6_dst_lookup(struct sock *sk,
   struct dst_entry **dst,
   struct flowi *fl);
+extern int ip6_sk_dst_lookup(struct sock *sk,
+ struct dst_entry **dst,
+ struct flowi *fl);
 
 /*
  * skb processing functions
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 9f3d4d7..610c722 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -230,7 +230,7 @@ static int dccp_v6_connect(struct sock *
ipv6_addr_copy(&np->saddr, saddr);
inet->rcv_saddr = LOOPBACK4_IPV6;
 
-   ip6_dst_store(sk, dst, NULL);
+   __ip6_dst_store(sk, dst, NULL);
 
icsk->icsk_ext_hdr_len = 0;
if (np->opt != NULL)
@@ -863,7 +863,7 @@ static struct sock *dccp_v6_request_recv
 * comment in that function for the gory details. -acme
 */
 
-   ip6_dst_store(newsk, dst, NULL);
+   __ip6_dst_store(newsk, dst, NULL);
newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM |
  NETIF_F_TSO);
newdp6 = (struct dccp6_sock *)newsk;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 5a0ba58..ac85e9c 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -658,7 +658,7 @@ int inet6_sk_rebuild_header(struct sock 
return err;
}
 
-   ip6_dst_store(sk, dst, NULL);
+   __ip6_dst_store(sk, dst, NULL);
}
 
return 0;
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 5c950cc..bf49107 100644

< 3 4 5 6 7 8 9 10 11 12 >

701 - 800 of 2197 matches

Mail list logo