[NET]: Added GSO header verification
Hi Dave: This feature is only needed by Xen but most of the code here is useful for other things like TCPv4 ECN support. [NET]: Added GSO header verification When GSO packets come from an untrusted source (e.g., a Xen guest domain), we need to verify the header integrity before passing it to the hardware. Since the first step in GSO is to verify the header, we can reuse that code by adding a new bit to gso_type: SKB_GSO_DODGY. Packets with this bit set can only be fed directly to devices with the corresponding bit NETIF_F_GSO_ROBUST. If the device doesn't have that bit, then the skb is fed to the GSO engine which will allow the packet to be sent to the hardware if it passes the header check. This patch changes the sg flag to a full features flag. The same method can be used to implement TSO ECN support. We simply have to mark packets with CWR set with SKB_GSO_ECN so that only hardware with a corresponding NETIF_F_TSO_ECN can accept them. The GSO engine can either fully segment the packet, or segment the first MTU and pass the rest to the hardware for further segmentation. 
Signed-off-by: Herbert Xu [EMAIL PROTECTED] Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -315,6 +315,7 @@ struct net_device #define NETIF_F_GSO_SHIFT 16 #define NETIF_F_TSO(SKB_GSO_TCPV4 NETIF_F_GSO_SHIFT) #define NETIF_F_UFO(SKB_GSO_UDPV4 NETIF_F_GSO_SHIFT) +#define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY NETIF_F_GSO_SHIFT) #define NETIF_F_GEN_CSUM (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM) #define NETIF_F_ALL_CSUM (NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM) @@ -543,7 +544,8 @@ struct packet_type { struct net_device *, struct packet_type *, struct net_device *); - struct sk_buff *(*gso_segment)(struct sk_buff *skb, int sg); + struct sk_buff *(*gso_segment)(struct sk_buff *skb, + int features); void*af_packet_priv; struct list_headlist; }; @@ -968,7 +970,7 @@ extern int netdev_max_backlog; extern int weight_p; extern int netdev_set_master(struct net_device *dev, struct net_device *master); extern int skb_checksum_help(struct sk_buff *skb, int inward); -extern struct sk_buff *skb_gso_segment(struct sk_buff *skb, int sg); +extern struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features); #ifdef CONFIG_BUG extern void netdev_rx_csum_fault(struct net_device *dev); #else @@ -988,11 +990,16 @@ extern void dev_seq_stop(struct seq_file extern void linkwatch_run_queue(void); +static inline int skb_gso_ok(struct sk_buff *skb, int features) +{ + int feature = skb_shinfo(skb)-gso_size ? 
+ skb_shinfo(skb)-gso_type NETIF_F_GSO_SHIFT : 0; + return (features feature) != feature; +} + static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb) { - int feature = skb_shinfo(skb)-gso_type NETIF_F_GSO_SHIFT; - return skb_shinfo(skb)-gso_size - (dev-features feature) != feature; + return skb_gso_ok(skb, dev-features); } #endif /* __KERNEL__ */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -172,6 +172,9 @@ enum { enum { SKB_GSO_TCPV4 = 1 0, SKB_GSO_UDPV4 = 1 1, + + /* This indicates the skb is from an untrusted source. */ + SKB_GSO_DODGY = 1 2, }; /** @@ -1299,7 +1302,7 @@ extern void skb_split(struct sk_b struct sk_buff *skb1, const u32 len); extern void skb_release_data(struct sk_buff *skb); -extern struct sk_buff *skb_segment(struct sk_buff *skb, int sg); +extern struct sk_buff *skb_segment(struct sk_buff *skb, int features); static inline void *skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer) diff --git a/include/net/protocol.h b/include/net/protocol.h --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -36,7 +36,8 @@ struct net_protocol { int (*handler)(struct sk_buff *skb); void(*err_handler)(struct sk_buff *skb, u32 info); - struct sk_buff *(*gso_segment)(struct sk_buff *skb, int sg); + struct sk_buff *(*gso_segment)(struct sk_buff *skb, + int features); int no_policy; }; diff --git a/include/net/tcp.h b/include/net/tcp.h
Re: [PATCH REPOST 0/2][RFC] Network Event Notifier Mechanism
Steve Wise [EMAIL PROTECTED] wrote: This patch implements a mechanism that allows interested clients to register for notification of certain network events. The intended use is to allow RDMA devices (linux/drivers/infiniband) to be notified of neighbour updates, ICMP redirects, path MTU changes, and route changes. The reason these devices need update events is because they typically cache this information in hardware and need to be notified when this information has been updated. The key events of interest are: - neighbour mac address change - routing redirect (the next hop neighbour changes for a dst_entry) - path mtu change (the path mtu for a dst_entry changes). - route add/deletes I'd like to know more about what the RDMA device is going to do with this information. I thought RDMA was for receiving packets? Most of the info here pertains to transmission. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [NET]: Added GSO header verification
On Tue, Jun 27, 2006 at 01:46:35PM -0700, Michael Chan wrote: On Tue, 2006-06-27 at 22:07 +1000, Herbert Xu wrote: [NET]: Added GSO header verification @@ -2166,10 +2166,14 @@ struct sk_buff *tcp_tso_segment(struct s if (!pskb_may_pull(skb, thlen)) goto out; + segs = NULL; + if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) + goto out; + This logic doesn't look right to me. Perhaps it's backwards and should be: if (!skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) Oops, you're absolutely right. Here is the fix. [NET]: Fix logical error in skb_gso_ok The test in skb_gso_ok is backwards. Noticed by Michael Chan [EMAIL PROTECTED]. Signed-off-by: Herbert Xu [EMAIL PROTECTED] Thanks, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 84b0f0d..efd1e2a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -994,12 +994,12 @@ static inline int skb_gso_ok(struct sk_b { int feature = skb_shinfo(skb)->gso_size ? skb_shinfo(skb)->gso_type << NETIF_F_GSO_SHIFT : 0; - return (features & feature) != feature; + return (features & feature) == feature; } static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb) { - return skb_gso_ok(skb, dev->features); + return !skb_gso_ok(skb, dev->features); } #endif /* __KERNEL__ */ - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH REPOST 0/2][RFC] Network Event Notifier Mechanism
On Tue, Jun 27, 2006 at 09:31:57AM -0500, Steve Wise wrote: I'd like to know more about what the RDMA device is going to do with this information. I thought RDMA was for receiving packets? Most of the info here pertains to transmission. RDMA Ethernet devices adhere to a set of protocols defined by the IETF. See the RDDP WG (http://www.ietf.org/html.charters/rddp-charter.html) for the Internet Drafts that define the protocols. Would it be possible for you to give us a quick summary of the relevant points? Thanks, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH Round 3 0/2][RFC] Network Event Notifier Mechanism
Steve Wise [EMAIL PROTECTED] wrote: The reason these devices need update events is because they typically cache this information in hardware and need to be notified when this information has been updated. For information on RDMA protocols, see: http://www.ietf.org/html.charters/rddp-charter.html. Please give more specific reasons for needing these events because it is certainly far from obvious from reading those documents. Without reasons these invasive changes may turn out to be completely inappropriate. Thanks, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH Round 3 0/2][RFC] Network Event Notifier Mechanism
On Wed, Jun 28, 2006 at 12:54:10PM +1000, Herbert Xu wrote: Please give more specific reasons for needing these events because it is certainly far from obvious from reading those documents. Never mind, I've found your earlier messages on the list which explain your reasons more clearly. It would be nice if you could include those explanations in your patch description. BTW, does this mean that we're now comfortable with full TOE? Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH]NET: Add ECN support for TSO
On Tue, Jun 27, 2006 at 08:06:47PM -0700, Michael Chan wrote: diff --git a/include/net/sock.h b/include/net/sock.h index 2d8d6ad..2c75172 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1033,7 +1033,8 @@ static inline void sk_setup_caps(struct if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_TSO; if (sk->sk_route_caps & NETIF_F_TSO) { - if (sock_flag(sk, SOCK_NO_LARGESEND) || dst->header_len) + if ((sock_flag(sk, SOCK_NO_LARGESEND) && + !tso_ecn_capable(sk->sk_route_caps)) || dst->header_len) sk->sk_route_caps &= ~NETIF_F_TSO; Why turn it off? With GSO in place the stack will handle it just fine (even your description says so :) We should instead remove all code that turns off TSO/ECN when the other is present. Otherwise the patch looks good. Thanks, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH Round 3 0/2][RFC] Network Event Notifier Mechanism
On Tue, Jun 27, 2006 at 11:24:25PM -0400, Jeff Garzik wrote: I don't see how that position has changed? http://linux-net.osdl.org/index.php/TOE Well I must say that RDMA over TCP smells very much like TOE. They've got an ARP table, a routing table, and presumably a TCP stack. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH]NET: Add ECN support for TSO
On Tue, Jun 27, 2006 at 08:40:34PM -0700, Michael Chan wrote: We need to turn off NETIF_F_TSO for a connection that has negotiated to turn on ECN if the output device cannot handle TSO and ECN. In other words, if the output device does not have either GSO or TSO_ECN feature set. I think you're mixing up GSO the mechanism with GSO the flag. The GSO flag simply tells the TCP stack whether TSO should be used or not, even if the hardware does not support TSO at all. The GSO mechanism on the other hand is ALWAYS present. So regardless of the presence of the GSO flag, you can always rely on the GSO mechanism to pick up the pieces (or rather generate the pieces as the case may be :) Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: TOE, etc. (was Re: [PATCH Round 3 0/2][RFC] Network Event Notifier Mechanism)
On Wed, Jun 28, 2006 at 12:18:25AM -0400, Jeff Garzik wrote: A PCI device that presents itself as a SCSI controller, but under the hood is really iSCSI-over-TCP smells like TOE. Running a virtualized Linux guest on top of a proprietary stack [which provides networking services to guests] also smells like TOE. :) Agreed. However, when they start adding hooks to the ARP table, the routing table, and PMTU management, it begs the question what more is there to add for TOE (well, user-space driven TOE at least)? Unfortunately I don't have more details, so you just get a generalized rant :) OK, the patch under discussion here adds hooks to all the stuff in the previous paragraph for the purpose of RDMA over TCP (well I must say that the exact RDMA application/hardware has never been clearly given but this is what I can gather from the previous posts). Put it another way, I think the dividing line between TOE and iSCSI or virtualisation is exactly the interface between them and the Linux kernel. If the interface is an existing one such as SCSI or standard IP then it's OK. However, when it starts poking in the guts of the Linux stack I'd say that it has crossed the line. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH]NET: Add ECN support for TSO
On Tue, Jun 27, 2006 at 09:37:01PM -0700, Michael Chan wrote: Signed-off-by: Michael Chan [EMAIL PROTECTED] Looks good to me too! @@ -56,6 +55,9 @@ static inline void TCP_ECN_send(struct s if (tp->ecn_flags&TCP_ECN_QUEUE_CWR) { tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; skb->h.th->cwr = 1; + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) + skb_shinfo(skb)->gso_type |= + SKB_GSO_TCPV4_ECN; As a byte-pincher I must suggest that you turn this check into something like if (skb_shinfo(skb)->gso_type) or even if (skb_shinfo(skb)->gso_size) :) Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH]NET: Add ECN support for TSO
On Tue, Jun 27, 2006 at 09:54:39PM -0700, Michael Chan wrote: Assuming that we'll later have GSO_TCPV6, isn't it better to check for TCPV4 explicitly now? Or just change it later when necessary. Good point, I suppose you never know whether a V6 TSO-capable card is going to handle ECN correctly in both cases. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: TOE, etc.
On Tue, Jun 27, 2006 at 09:43:23PM -0700, David Miller wrote: Socket state, and that is one thing I don't see them doing yet. I wonder what happens when the Linux TCP stack attempts to open a connection to a remote host when that connection is already open in the RDMA NIC? For that matter what happens if a Linux application decides to listen on a TCP port already listened on by the RDMA NIC? The only saving grace is that they're only doing RDMA rather than arbitrary TCP. However, exactly the same infrastructure can be used to do arbitrary TCP should they wish to. But we have to realize they've already been given 95% of the interfaces they need to speak IP using our routes and our neighbour entries. Right? Yes, however I think the same argument could be applied to TOE. With their RDMA NIC, we'll have TCP/SCTP connections that bypass netfilter, tc, IPsec, AF_PACKET/tcpdump and the rest of our stack while at the same time it is using the same IP address as us and deciding what packets we will or won't see. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2.6.17] support for TSO over IPv6
Ananda Raju [EMAIL PROTECTED] wrote: This patch enables TSO over IPv6. Currently Linux network stacks restricts TSO over IPv6 by clearing of the NETIF_F_TSO bit from dev->features. This patch will remove this restriction. Thanks, looks good overall. SKB_GSO_TCPV4 renamed to SKB_GSO_TCP to make it generic GSO packet. SKB_GSO_UDPV4 renamed to SKB_GSO_UDP as UFO is not a IPv4 feature. UFO is supported over IPv6 also This bit is wrong though. TCPv4 and TCPv6 packets can't share the same GSO feature bit. The reason is that GSO/TSO is no longer just activated by dev->features. Bridges can forward GSO/TSO packets through at any time. This is why it is crucial that each packet specifies exactly the features that it requires from the hardware. For UFO, it's OK to have just SKB_GSO_UDP since IIRC your card is the only one that supports it and it supports both protocols anyway. diff -upNr netdev.org/include/linux/skbuff.h netdev.ipv6_tso/include/linux/skbuff.h --- netdev.org/include/linux/skbuff.h 2006-06-27 07:30:36.0 -0700 +++ netdev.ipv6_tso/include/linux/skbuff.h 2006-06-27 07:38:48.0 -0700 @@ -170,8 +170,9 @@ enum { }; enum { - SKB_GSO_TCPV4 = 1 << 0, - SKB_GSO_UDPV4 = 1 << 1, + SKB_GSO_TCP = 1 << 0, + SKB_GSO_UDP = 1 << 1, + SKB_GSO_TCPV6 = 1 << 2, }; BTW, you should rediff against Dave's current tree which has a few extra bits there. You should also leave TCPV4 as is and just add the TCPV6 bit. diff -upNr netdev.org/net/ipv4/tcp_output.c netdev.ipv6_tso/net/ipv4/tcp_output.c --- netdev.org/net/ipv4/tcp_output.c 2006-06-27 07:30:36.0 -0700 +++ netdev.ipv6_tso/net/ipv4/tcp_output.c 2006-06-27 07:38:48.0 -0700 @@ -525,7 +525,7 @@ static void tcp_set_skb_tso_segs(struct factor /= mss_now; skb_shinfo(skb)->gso_segs = factor; skb_shinfo(skb)->gso_size = mss_now; - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; + skb_shinfo(skb)->gso_type = SKB_GSO_TCP; You need to set SKB_GSO_TCPV6 for IPv6 packets here. 
Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2.6.17] support for TSO over IPv6
On Fri, Jun 30, 2006 at 09:32:44AM +1000, Herbert Xu wrote: diff -upNr netdev.org/include/linux/skbuff.h netdev.ipv6_tso/include/linux/skbuff.h --- netdev.org/include/linux/skbuff.h 2006-06-27 07:30:36.0 -0700 +++ netdev.ipv6_tso/include/linux/skbuff.h 2006-06-27 07:38:48.0 -0700 @@ -170,8 +170,9 @@ enum { }; enum { - SKB_GSO_TCPV4 = 1 0, - SKB_GSO_UDPV4 = 1 1, + SKB_GSO_TCP = 1 0, + SKB_GSO_UDP = 1 1, + SKB_GSO_TCPV6 = 1 2, }; BTW, you should rediff against Dave's current tree which has a few extra bits there. You should also leave TCPV4 as is and just add the TCPV6 bit. BTW, does your card handle ECN correctly? If not then we should change the new ECN bit to apply to both TCPv4 and TCPv6 since 1) We now have a piece of hardware that handles TSO6 and it doesn't do ECN. 2) It's quite likely that if the NIC can handle ECN in TCPv4 then it can do it in TCPv6. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[TCP]: Reset gso_segs if packet is dodgy
Hi Dave: I forgot to verify gso_segs on packets from untrusted sources. In fact looking around it seems that gso_segs is used by exactly one driver outside of the TCP stack. In fact it also happens to be a virtual driver: s390/qeth. Since the only other GSO user we have at the moment -- UFO, doesn't even set gso_segs, I'd like to move it to skb-cb and get rid of this. However, for now let's simply reset it in tcp_tso_segment. [TCP]: Reset gso_segs if packet is dodgy I wasn't paranoid enough in verifying GSO information. A bogus gso_segs could upset drivers as much as a bogus header would. Let's reset it in the per-protocol gso_segment functions. I didn't verify gso_size because that can be verified by the source of the dodgy packets. Signed-off-by: Herbert Xu [EMAIL PROTECTED] Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0336422..0bb0ac9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2166,13 +2166,19 @@ struct sk_buff *tcp_tso_segment(struct s if (!pskb_may_pull(skb, thlen)) goto out; - segs = NULL; - if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) - goto out; - oldlen = (u16)~skb-len; __skb_pull(skb, thlen); + if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { + /* Packet is from an untrusted source, reset gso_segs. */ + int mss = skb_shinfo(skb)-gso_size; + + skb_shinfo(skb)-gso_segs = (skb-len + mss - 1) / mss; + + segs = NULL; + goto out; + } + segs = skb_segment(skb, features); if (IS_ERR(segs)) goto out; - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2.6.17] support for TSO over IPv6
On Thu, Jun 29, 2006 at 07:06:51PM -0700, Michael Chan wrote: Don't we have a bigger problem if it doesn't support ECN with ipv6 TSO? We either have to disable ECN when TSO is enabled like we used to for ipv4, or provide the gso tcp segmentation for ipv6. Right? Good point. In that case we should also add GSO for IPv6. It shouldn't be too bad because all we need to do is verify the integrity of the extension headers in terms of their length, not their content. I wonder if this chip handles all the extension headers or not. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.17-mm3 -- BUG: illegal lock usage -- illegal {softirq-on-W} -> {in-softirq-R} usage.
Andrew Morton [EMAIL PROTECTED] wrote: inet_bind() ->sk_dst_get ->read_lock(&sk->sk_dst_lock) We are still holding the sock lock when doing sk_dst_get. 1 lock held by java_vm/4418: #0: (af_family_keys + (sk)->sk_family#4){-+..}, at: [f93c9281] tcp_v6_rcv+0x308/0x7b7 [ipv6] softirq ->ip6_dst_lookup ->sk_dst_check ->sk_dst_reset ->write_lock(&sk->sk_dst_lock); The sock lock prevents this path from being entered. Instead the received TCP packet is queued and replayed when the sock lock is released. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[1/4] [IPV6]: Remove redundant length check on input
Hi Dave: I've added GSO for TCPv6 and updated Ananda's patch. Please note that the following patches have only been compile-tested. [IPV6]: Remove redundant length check on input We don't need to check skb-len when we're just about to call pskb_may_pull since that checks it for us. Signed-off-by: Herbert Xu [EMAIL PROTECTED] Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -84,14 +84,9 @@ int ipv6_rcv(struct sk_buff *skb, struct */ IP6CB(skb)-iif = skb-dst ? ((struct rt6_info *)skb-dst)-rt6i_idev-dev-ifindex : dev-ifindex; - if (skb-len sizeof(struct ipv6hdr)) + if (unlikely(!pskb_may_pull(skb, sizeof(*hdr goto err; - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) { - IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); - goto drop; - } - hdr = skb-nh.ipv6h; if (hdr-version != 6) - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[2/4] [IPV6]: Added GSO support for TCPv6
Hi: [IPV6]: Added GSO support for TCPv6 This patch adds GSO support for IPv6 and TCPv6. Signed-off-by: Herbert Xu [EMAIL PROTECTED] Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- diff --git a/include/net/protocol.h b/include/net/protocol.h --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -50,11 +50,17 @@ struct inet6_protocol struct inet6_skb_parm *opt, int type, int code, int offset, __u32 info); + + struct sk_buff *(*gso_segment)(struct sk_buff *skb, + int features); + unsigned intflags; /* INET6_PROTO_xxx */ }; #define INET6_PROTO_NOPOLICY 0x1 #define INET6_PROTO_FINAL 0x2 +/* This should be set for any extension header which is compatible with GSO. */ +#define INET6_PROTO_GSO_EXTHDR 0x4 #endif /* This is used to register socket interfaces for IP protocols. */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2215,6 +2215,7 @@ struct sk_buff *tcp_tso_segment(struct s out: return segs; } +EXPORT_SYMBOL(tcp_tso_segment); extern void __skb_cb_too_small_for_tcp(int, int); extern struct tcp_congestion_ops tcp_reno; diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -179,7 +179,7 @@ static int ipv6_destopt_rcv(struct sk_bu static struct inet6_protocol destopt_protocol = { .handler= ipv6_destopt_rcv, - .flags = INET6_PROTO_NOPOLICY, + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_GSO_EXTHDR, }; void __init ipv6_destopt_init(void) @@ -340,7 +340,7 @@ looped_back: static struct inet6_protocol rthdr_protocol = { .handler= ipv6_rthdr_rcv, - .flags = INET6_PROTO_NOPOLICY, + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_GSO_EXTHDR, }; void __init ipv6_rthdr_init(void) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -58,9 +58,71 @@ 
DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics) __read_mostly; +static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, int features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct ipv6hdr *ipv6h; + struct inet6_protocol *ops; + int proto; + + if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h + goto out; + + ipv6h = skb-nh.ipv6h; + proto = ipv6h-nexthdr; + __skb_pull(skb, sizeof(*ipv6h)); + + rcu_read_lock(); + for (;;) { + struct ipv6_opt_hdr *opth; + int len; + + if (proto != NEXTHDR_HOP) { + ops = rcu_dereference(inet6_protos[proto]); + + if (unlikely(!ops)) + goto unlock; + + if (!(ops-flags INET6_PROTO_GSO_EXTHDR)) + break; + } + + if (unlikely(!pskb_may_pull(skb, 8))) + goto unlock; + + opth = (void *)skb-data; + len = opth-hdrlen * 8 + 8; + + if (unlikely(!pskb_may_pull(skb, len))) + goto unlock; + + proto = opth-nexthdr; + __skb_pull(skb, len); + } + + skb-h.raw = skb-data; + if (likely(ops-gso_segment)) + segs = ops-gso_segment(skb, features); + +unlock: + rcu_read_unlock(); + + if (unlikely(IS_ERR(segs))) + goto out; + + for (skb = segs; skb; skb = skb-next) { + ipv6h = skb-nh.ipv6h; + ipv6h-payload_len = htons(skb-len - skb-mac_len); + } + +out: + return segs; +} + static struct packet_type ipv6_packet_type = { .type = __constant_htons(ETH_P_IPV6), .func = ipv6_rcv, + .gso_segment = ipv6_gso_segment, }; struct ip6_ra_chain *ip6_ra_chain; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1605,6 +1605,7 @@ struct proto tcpv6_prot = { static struct inet6_protocol tcpv6_protocol = { .handler= tcp_v6_rcv, .err_handler= tcp_v6_err, + .gso_segment= tcp_tso_segment, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[4/4] [IPV6]: Added GSO support for TCPv6
Hi: [IPV6]: Added GSO support for TCPv6 This patch adds GSO support for IPv6 and TCPv6. This is based on a patch by Ananda Raju [EMAIL PROTECTED]. His original description is: This patch enables TSO over IPv6. Currently the Linux network stack restricts TSO over IPv6 by clearing the NETIF_F_TSO bit from dev->features. This patch will remove this restriction. This patch will introduce a new flag NETIF_F_TSO6 which will be used to check whether a device supports TSO over IPv6. If a device supports TSO over IPv6 then we don't clear NETIF_F_TSO, which will make the TCP layer create TSO packets. Any device supporting TSO over IPv6 will set the NETIF_F_TSO6 flag in dev->features along with NETIF_F_TSO. When the user disables TSO using ethtool, NETIF_F_TSO will get cleared from dev->features. So even if we have NETIF_F_TSO6 we don't get TSO packets created by the TCP layer. SKB_GSO_TCPV4 renamed to SKB_GSO_TCP to make it a generic GSO packet. SKB_GSO_UDPV4 renamed to SKB_GSO_UDP as UFO is not an IPv4 feature. UFO is supported over IPv6 also. The following table shows there is significant improvement in throughput with normal frames and CPU usage for both normal and jumbo. 
-- | | 1500| 9600 | | --|---| | | thru CPU| thru CPU | -- | TSO OFF | 2.00 5.5% id | 5.66 20.0% id | -- | TSO ON | 2.63 78.0 id | 5.67 39.0% id | -- Signed-off-by: Herbert Xu [EMAIL PROTECTED] Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- diff --git a/drivers/net/s2io.c b/drivers/net/s2io.c --- a/drivers/net/s2io.c +++ b/drivers/net/s2io.c @@ -3960,7 +3960,7 @@ static int s2io_xmit(struct sk_buff *skb txdp-Control_2 = 0; #ifdef NETIF_F_TSO mss = skb_shinfo(skb)-gso_size; - if (skb_shinfo(skb)-gso_type == SKB_GSO_TCPV4) { + if (skb_shinfo(skb)-gso_type (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)) { txdp-Control_1 |= TXD_TCP_LSO_EN; txdp-Control_1 |= TXD_TCP_LSO_MSS(mss); } @@ -3980,7 +3980,7 @@ static int s2io_xmit(struct sk_buff *skb } frg_len = skb-len - skb-data_len; - if (skb_shinfo(skb)-gso_type == SKB_GSO_UDPV4) { + if (skb_shinfo(skb)-gso_type == SKB_GSO_UDP) { int ufo_size; ufo_size = skb_shinfo(skb)-gso_size; @@ -4009,7 +4009,7 @@ static int s2io_xmit(struct sk_buff *skb txdp-Host_Control = (unsigned long) skb; txdp-Control_1 |= TXD_BUFFER0_SIZE(frg_len); - if (skb_shinfo(skb)-gso_type == SKB_GSO_UDPV4) + if (skb_shinfo(skb)-gso_type == SKB_GSO_UDP) txdp-Control_1 |= TXD_UFO_EN; frg_cnt = skb_shinfo(skb)-nr_frags; @@ -4024,12 +4024,12 @@ static int s2io_xmit(struct sk_buff *skb (sp-pdev, frag-page, frag-page_offset, frag-size, PCI_DMA_TODEVICE); txdp-Control_1 = TXD_BUFFER0_SIZE(frag-size); - if (skb_shinfo(skb)-gso_type == SKB_GSO_UDPV4) + if (skb_shinfo(skb)-gso_type == SKB_GSO_UDP) txdp-Control_1 |= TXD_UFO_EN; } txdp-Control_1 |= TXD_GATHER_CODE_LAST; - if (skb_shinfo(skb)-gso_type == SKB_GSO_UDPV4) + if (skb_shinfo(skb)-gso_type == SKB_GSO_UDP) frg_cnt++; /* as Txd0 was used for inband header */ tx_fifo = mac_control-tx_FIFO_start[queue]; @@ -4043,7 +4043,7 @@ static int s2io_xmit(struct sk_buff *skb if 
(mss) val64 |= TX_FIFO_SPECIAL_FUNC; #endif - if (skb_shinfo(skb)-gso_type == SKB_GSO_UDPV4) + if (skb_shinfo(skb)-gso_type == SKB_GSO_UDP) val64 |= TX_FIFO_SPECIAL_FUNC; writeq(val64, tx_fifo-List_Control); @@ -7021,6 +7021,9 @@ s2io_init_nic(struct pci_dev *pdev, cons #ifdef NETIF_F_TSO dev-features |= NETIF_F_TSO; #endif +#ifdef NETIF_F_TSO6 + dev-features |= NETIF_F_TSO6; +#endif if (sp-device_type XFRAME_II_DEVICE) { dev-features |= NETIF_F_UFO; dev-features |= NETIF_F_HW_CSUM; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -315,9 +315,10 @@ struct net_device #define NETIF_F_GSO_SHIFT 16 #define NETIF_F_GSO_MASK 0x #define NETIF_F_TSO(SKB_GSO_TCPV4 NETIF_F_GSO_SHIFT) -#define NETIF_F_UFO
[3/4] [NET]: Generalise TSO-specific bits from skb_setup_caps
Hi: [NET]: Generalise TSO-specific bits from skb_setup_caps This patch generalises the TSO-specific bits from sk_setup_caps by adding the sk_gso_type member to struct sock. This makes sk_setup_caps generic so that it can be used by TCPv6 or UFO. The only catch is that whoever uses this must provide a GSO implementation for their protocol which I think is a fair deal :) For now UFO continues to live without a GSO implementation which is OK since it doesn't use the sock caps field at the moment. Signed-off-by: Herbert Xu [EMAIL PROTECTED] Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -313,6 +313,7 @@ struct net_device /* Segmentation offload features */ #define NETIF_F_GSO_SHIFT 16 +#define NETIF_F_GSO_MASK 0x #define NETIF_F_TSO(SKB_GSO_TCPV4 NETIF_F_GSO_SHIFT) #define NETIF_F_UFO(SKB_GSO_UDPV4 NETIF_F_GSO_SHIFT) #define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY NETIF_F_GSO_SHIFT) @@ -991,13 +992,18 @@ extern void dev_seq_stop(struct seq_file extern void linkwatch_run_queue(void); -static inline int skb_gso_ok(struct sk_buff *skb, int features) +static inline int net_gso_ok(int features, int gso_type) { - int feature = skb_shinfo(skb)-gso_size ? - skb_shinfo(skb)-gso_type NETIF_F_GSO_SHIFT : 0; + int feature = gso_type NETIF_F_GSO_SHIFT; return (features feature) == feature; } +static inline int skb_gso_ok(struct sk_buff *skb, int features) +{ + return net_gso_ok(features, skb_shinfo(skb)-gso_size ? 
+ skb_shinfo(skb)-gso_type : 0); +} + static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb) { return !skb_gso_ok(skb, dev-features); diff --git a/include/net/sock.h b/include/net/sock.h --- a/include/net/sock.h +++ b/include/net/sock.h @@ -140,6 +140,7 @@ struct sock_common { *@sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, %SO_OOBINLINE settings *@sk_no_check: %SO_NO_CHECK setting, wether or not checkup packets *@sk_route_caps: route capabilities (e.g. %NETIF_F_TSO) + *@sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) *@sk_lingertime: %SO_LINGER l_linger setting *@sk_backlog: always used with the per-socket spinlock held *@sk_callback_lock: used with the callbacks in the end of this struct @@ -211,6 +212,7 @@ struct sock { gfp_t sk_allocation; int sk_sndbuf; int sk_route_caps; + int sk_gso_type; int sk_rcvlowat; unsigned long sk_flags; unsigned long sk_lingertime; @@ -1025,15 +1027,20 @@ extern struct dst_entry *__sk_dst_check( extern struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie); +static inline int sk_can_gso(const struct sock *sk) +{ + return net_gso_ok(sk-sk_route_caps, sk-sk_gso_type); +} + static inline void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { __sk_dst_set(sk, dst); sk-sk_route_caps = dst-dev-features; if (sk-sk_route_caps NETIF_F_GSO) - sk-sk_route_caps |= NETIF_F_TSO; - if (sk-sk_route_caps NETIF_F_TSO) { + sk-sk_route_caps |= NETIF_F_GSO_MASK; + if (sk_can_gso(sk)) { if (dst-header_len) - sk-sk_route_caps = ~NETIF_F_TSO; + sk-sk_route_caps = ~NETIF_F_GSO_MASK; else sk-sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; } diff --git a/include/net/tcp.h b/include/net/tcp.h --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -751,7 +751,7 @@ static inline int tcp_is_cwnd_limited(co if (in_flight = tp-snd_cwnd) return 1; - if (!(sk-sk_route_caps NETIF_F_TSO)) + if (!sk_can_gso(sk)) return 0; left = tp-snd_cwnd - in_flight; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c --- a/net/ipv4/tcp.c +++ 
b/net/ipv4/tcp.c @@ -643,7 +643,7 @@ static inline int select_size(struct soc int tmp = tp-mss_cache; if (sk-sk_route_caps NETIF_F_SG) { - if (sk-sk_route_caps NETIF_F_TSO) + if (sk_can_gso(sk)) tmp = 0; else { int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -242,6 +242,7 @@ int tcp_v4_connect(struct sock *sk, stru goto failure; /* OK, now commit destination to socket. */ + sk-sk_gso_type
[5/4] [NET]: Verify gso_type too in gso_segment
Hi: [NET]: Verify gso_type too in gso_segment We don't want nasty Xen guests to pass a TCPv6 packet in with gso_type set to TCPv4 or even UDP (or a packet that's both TCP and UDP). Signed-off-by: Herbert Xu [EMAIL PROTECTED] Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8d15715..318d467 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1106,7 +1106,15 @@ static struct sk_buff *inet_gso_segment( int ihl; int id; - if (!pskb_may_pull(skb, sizeof(*iph))) + if (unlikely(skb_shinfo(skb)-gso_type +~(SKB_GSO_TCPV4 | + SKB_GSO_UDP | + SKB_GSO_DODGY | + SKB_GSO_TCP_ECN | + 0))) + goto out; + + if (unlikely(!pskb_may_pull(skb, sizeof(*iph goto out; iph = skb-nh.iph; @@ -1114,7 +1122,7 @@ static struct sk_buff *inet_gso_segment( if (ihl sizeof(*iph)) goto out; - if (!pskb_may_pull(skb, ihl)) + if (unlikely(!pskb_may_pull(skb, ihl))) goto out; skb-h.raw = __skb_pull(skb, ihl); @@ -1125,7 +1133,7 @@ static struct sk_buff *inet_gso_segment( rcu_read_lock(); ops = rcu_dereference(inet_protos[proto]); - if (ops ops-gso_segment) + if (likely(ops ops-gso_segment)) segs = ops-gso_segment(skb, features); rcu_read_unlock(); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 59e30ba..2f81374 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2155,6 +2155,14 @@ struct sk_buff *tcp_tso_segment(struct s unsigned int oldlen; unsigned int len; + if (unlikely(skb_shinfo(skb)-gso_type +~(SKB_GSO_TCPV4 | + SKB_GSO_DODGY | + SKB_GSO_TCP_ECN | + SKB_GSO_TCPV6 | + 0))) + goto out; + if (!pskb_may_pull(skb, sizeof(*th))) goto out; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 25f8bf8..03b65aa 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -65,6 +65,14 @@ static struct sk_buff *ipv6_gso_segment( struct inet6_protocol 
*ops; int proto; + if (unlikely(skb_shinfo(skb)-gso_type +~(SKB_GSO_UDP | + SKB_GSO_DODGY | + SKB_GSO_TCP_ECN | + SKB_GSO_TCPV6 | + 0))) + goto out; + if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h goto out; - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [5/4] [NET]: Verify gso_type too in gso_segment
On Fri, Jun 30, 2006 at 04:13:50PM +1000, herbert wrote: [NET]: Verify gso_type too in gso_segment Here is a better version that ensures at least one of TCPV4 and TCPV6 is set in tcp_tso_segment. [NET]: Verify gso_type too in gso_segment We don't want nasty Xen guests to pass a TCPv6 packet in with gso_type set to TCPv4 or even UDP (or a packet that's both TCP and UDP). Signed-off-by: Herbert Xu [EMAIL PROTECTED] Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- 8cbe620474c9ba82ab868e1b2e887ff91808470f diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8d15715..318d467 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1106,7 +1106,15 @@ static struct sk_buff *inet_gso_segment( int ihl; int id; - if (!pskb_may_pull(skb, sizeof(*iph))) + if (unlikely(skb_shinfo(skb)-gso_type +~(SKB_GSO_TCPV4 | + SKB_GSO_UDP | + SKB_GSO_DODGY | + SKB_GSO_TCP_ECN | + 0))) + goto out; + + if (unlikely(!pskb_may_pull(skb, sizeof(*iph goto out; iph = skb-nh.iph; @@ -1114,7 +1122,7 @@ static struct sk_buff *inet_gso_segment( if (ihl sizeof(*iph)) goto out; - if (!pskb_may_pull(skb, ihl)) + if (unlikely(!pskb_may_pull(skb, ihl))) goto out; skb-h.raw = __skb_pull(skb, ihl); @@ -1125,7 +1133,7 @@ static struct sk_buff *inet_gso_segment( rcu_read_lock(); ops = rcu_dereference(inet_protos[proto]); - if (ops ops-gso_segment) + if (likely(ops ops-gso_segment)) segs = ops-gso_segment(skb, features); rcu_read_unlock(); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 59e30ba..9b5c54e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2171,8 +2171,19 @@ struct sk_buff *tcp_tso_segment(struct s if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { /* Packet is from an untrusted source, reset gso_segs. 
*/ - int mss = skb_shinfo(skb)-gso_size; + int type = skb_shinfo(skb)-gso_type; + int mss; + + if (unlikely(type +~(SKB_GSO_TCPV4 | + SKB_GSO_DODGY | + SKB_GSO_TCP_ECN | + SKB_GSO_TCPV6 | + 0) || +!(type (SKB_GSO_TCPV4 | SKB_GSO_TCPV6 + goto out; + mss = skb_shinfo(skb)-gso_size; skb_shinfo(skb)-gso_segs = (skb-len + mss - 1) / mss; segs = NULL; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 25f8bf8..03b65aa 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -65,6 +65,14 @@ static struct sk_buff *ipv6_gso_segment( struct inet6_protocol *ops; int proto; + if (unlikely(skb_shinfo(skb)-gso_type +~(SKB_GSO_UDP | + SKB_GSO_DODGY | + SKB_GSO_TCP_ECN | + SKB_GSO_TCPV6 | + 0))) + goto out; + if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h goto out; - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: jumbo frames and memory fragmentation
Chris Friesen [EMAIL PROTECTED] wrote: Anyone have any suggestions on how to improve this? Upgrading kernels isn't an option. I could port back the copybreak stuff fairly easily. Either upgrade your kernel or backport the page-splitting code in the current tree. That's really the only sane solution for jumbo packets. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [1/4] [IPV6]: Remove redundant length check on input
On Fri, Jun 30, 2006 at 07:44:49PM -0400, Ananda Raju wrote: I tested the patch, and TSO over ipv6 is working fine. But TSO disable not working for IPv6. I tried the from tree /pub/scm/linux/kernel/git/davem/net-2.6 I think we need some new ethtool helper functions that sets/clears both TSO/TSO6. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2.6.17] support for TSO over IPv6
Hi Leonid: On Fri, Jun 30, 2006 at 04:46:56PM -0400, Leonid Grossman wrote: If ECE == 1, then set it to one for all datagrams. If CWR == 1, then set it to one for the first datagram, and set it to zero for the rest? Exactly. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] NET: Fix ipv6 GSO payload length
On Fri, Jun 30, 2006 at 03:56:47PM -0700, Michael Chan wrote: Fix ipv6 GSO payload length calculation. The ipv6 payload length excludes the ipv6 base header length and so must be subtracted. Signed-off-by: Michael Chan [EMAIL PROTECTED] Looks good to me. Thanks for catching this! -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [1/4] [IPV6]: Remove redundant length check on input
On Mon, Jul 03, 2006 at 07:44:06PM -0700, David Miller wrote: I think we need some new ethtool helper functions that set/clear both TSO/TSO6. Do you really want to semantically separate TSO and TSO6? I would think that real users who want to disable TSO, wish to do so unilaterally. That's what I meant. I meant ethtool helper functions that clear and set both TSO/TSO6 flags at the same time. Alternatively, we can add a new features bit which gives the inherent features of a device. That can then be used to derive the actual features in use. That way we won't need to invent a new ethtool helper function for this. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [1/4] [IPV6]: Remove redundant length check on input
On Tue, Jul 04, 2006 at 12:45:27PM +1000, herbert wrote: That's what I meant. I meant ethtool helper functions that clear and set both TSO/TSO6 flags at the same time. I think I was a bit ambiguous here. To expand on my argument, what I'm saying is that we can't just change the existing ethtool helper functions to set TSO6 since that'd break NICs which do not support TSO6. Instead of adding an ad-hoc ethtool function in the neterion driver, we should either add a new ethtool function which sets both TSO/TSO6, or do the following. Alternatively, we can add a new features bit which gives the inherent features of a device. That can then be used to derive the actual features in use. That way we won't need to invent a new ethtool helper function for this. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: RDMA will be reverted
Tom Tucker [EMAIL PROTECTED] wrote: All that said, the proposed patch helps not only iWARP, but other transports (iSCSI, IB) as well. It is not large, invasive, Care to explain how it helps those other technologies? Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: RDMA will be reverted
On Thu, Jul 06, 2006 at 12:36:24PM -0500, Tom Tucker wrote: The RDMA CMA uses IP addresses and port numbers to create a uniform addressing scheme across all transport types. For IB, it is necessary to resolve IP addresses to IB GIDs. The ARP protocol is used to do this and a netfilter rule is installed to snoop the incoming ARP replies. This would not be necessary if ARP events were provided as in the patch. Well the concerns we have do not apply to just iWARP, but RDMA/IP in general so this isn't really another technology. In fact, it seems that we now have IP-specific knowledge living in drivers/infiniband/core/cma.c which is suboptimal. Unified wire iSCSI adapters have the same issue as iWARP wrt to managing IP addresses and ports. If by Unified wire iSCSI you mean something that presents a SCSI interface together with an Ethernet interface where the two share the same MAC and IP address, then we have the same concerns with it as we do with iWARP or TOE. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
What is RDMA (was: RDMA will be reverted)
On Fri, Jul 07, 2006 at 06:53:20AM +, David Miller wrote: What I am saying, however, is that we need to understand the technology and the hooks you guys want before we put any of it in. Yes indeed. Here is what I've understood so far so let's see if we can start building a consensus. 1) RDMA over straight Infiniband is not contentious. In this case no IP networking is involved. 2) RDMA over TCP/IP (or SCTP) can theoretically run on any network that supports IP, including Infiniband and Ethernet. 3) When RDMA over TCP is completely done in hardware, i.e., it has its own IP address, MAC address, and simply presents an RDMA interface (whatever that may be) to Linux, we're OK with it. This is similar to how some iSCSI adapters work. 4) When RDMA over TCP is done completely in the Linux networking stack, we don't have a problem because the existing TCP stack is still in charge. However, this is pretty pointless. 5) RDMA over TCP on the receive side is offloaded into the NIC. This allows the NIC to directly place data into the application's buffer. We're starting to have a little bit of a problem because it means that part of the incoming IP traffic is now being directly processed by the NIC, with no input from the Linux TCP/IP stack. However, as long as the connection establishment/acks are still controlled/seen by Linux we can probably live with it. 6) RDMA over TCP on the transmit side is offloaded into the NIC. This is starting to look very worrying. The reason is that we lose all control over crucial aspects of TCP like congestion control. It is now completely up to the NIC to do that. For straight RDMA over Infiniband this isn't an issue because the traffic is not likely to travel across the Internet. However, for RDMA over TCP, one of their goals is to support sending traffic over the Internet so this is a concern. Incidentally, this is why they need to know about things like MAC/route/MTU changing. 
7) RDMA over TCP is completely offloaded into the NIC, however, they still use Linux's IP address, MAC address, and rely on us to tell it about events such as MTU updates or MAC changes. In addition to the problems we have in 5) and 6), we now have a portion of TCP port space which has suddenly become invisible to Linux. What's more, we lose control (e.g., netfilter) over what connections may or may not be established. So to my mind, RDMA over TCP is most problematic when it shares the same IP/MAC address as the Linux host, and when the transmit side and/or the connection establishment (case 6 and 7) is offloaded into the NIC. This also happens to be the only scenario where they need the notification patch that started all this discussion. BTW, this URL gives an interesting perspective on RDMA over TCP (particularly Q14/Q15): http://www.rdmaconsortium.org/home/FAQs_Apr25.htm Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Xen-devel] kernel BUG at net/core/dev.c:1133!
Petersson, Mats [EMAIL PROTECTED] wrote: Looks like the GSO is involved? It's certainly what crashed your machine :) It's probably not the guilty party though. Someone is passing through a TSO packet with checksum set to something other than CHECKSUM_HW. I bet it's netfilter and we just never noticed before because real NICS would simply corrupt the checksum silently. Could you confirm that you have netfilter rules (in particular NAT rules) and that this goes away if you flush all your netfilter tables? Patrick, do we really have to zap the checksum on outbound NAT? Could we update it instead? I got this while running Dom0 only (no guests), with a BOINC/[EMAIL PROTECTED] application running on all 4 cores. changeset: 10649:8e55c5c11475 Build: x86_32p (pae). [ cut here ] kernel BUG at net/core/dev.c:1133! invalid opcode: [#1] SMP CPU:0 EIP:0061:[c04dceb0]Not tainted VLI EFLAGS: 00210297 (2.6.16.13-xen #12) EIP is at skb_gso_segment+0xf0/0x110 eax: ebx: 0003 ecx: 0002 edx: c06e2e00 esi: 0008 edi: cd9e32e0 ebp: c63a7900 esp: c0de5ad0 ds: 007b es: 007b ss: 0069 Process rosetta_5.25_i6 (pid: 8826, threadinfo=c0de4000 task=cb019560) Stack: 0c8f69060 ffa3 0003 cd9e32e0 0002 c63a7900 c04dcfb0 cd9e32e0 0003 cd9e32e0 cf8e3000 cf8e3140 c04dd07e cd9e32e0 cf8e3000 cd9e32e0 cf8e3000 c04ec07e cd9e32e0 cf8e3000 c0895140 Call Trace: [c04dcfb0] dev_gso_segment+0x30/0xb0 [c04dd07e] dev_hard_start_xmit+0x4e/0x110 [c04ec07e] __qdisc_run+0xbe/0x280 [c04dd4b9] dev_queue_xmit+0x379/0x380 [c05bbe44] br_dev_queue_push_xmit+0xa4/0x140 [c05c2402] br_nf_post_routing+0x102/0x1d0 [c05c22b0] br_nf_dev_queue_xmit+0x0/0x50 [c05bbda0] br_dev_queue_push_xmit+0x0/0x140 [c04f0eab] nf_iterate+0x6b/0xa0 [c05bbda0] br_dev_queue_push_xmit+0x0/0x140 [c05bbda0] br_dev_queue_push_xmit+0x0/0x140 [c04f0f4e] nf_hook_slow+0x6e/0x120 [c05bbda0] br_dev_queue_push_xmit+0x0/0x140 [c05bbf40] br_forward_finish+0x60/0x70 [c05bbda0] br_dev_queue_push_xmit+0x0/0x140 [c05c1b71] br_nf_forward_finish+0x71/0x130 [c05bbee0] 
br_forward_finish+0x0/0x70 [c05c1d20] br_nf_forward_ip+0xf0/0x1a0 [c05c1b00] br_nf_forward_finish+0x0/0x130 [c05bbee0] br_forward_finish+0x0/0x70 [c04f0eab] nf_iterate+0x6b/0xa0 [c05bbee0] br_forward_finish+0x0/0x70 [c05bbee0] br_forward_finish+0x0/0x70 [c04f0f4e] nf_hook_slow+0x6e/0x120 [c05bbee0] br_forward_finish+0x0/0x70 [c05bc044] __br_forward+0x74/0x80 [c05bbee0] br_forward_finish+0x0/0x70 [c05bceb1] br_handle_frame_finish+0xd1/0x160 [c05bcde0] br_handle_frame_finish+0x0/0x160 [c05c0e0b] br_nf_pre_routing_finish+0xfb/0x480 [c05bcde0] br_handle_frame_finish+0x0/0x160 [c05c0d10] br_nf_pre_routing_finish+0x0/0x480 [c054fe13] ip_nat_in+0x43/0xc0 [c05c0d10] br_nf_pre_routing_finish+0x0/0x480 [c04f0eab] nf_iterate+0x6b/0xa0 [c05c0d10] br_nf_pre_routing_finish+0x0/0x480 [c05c0d10] br_nf_pre_routing_finish+0x0/0x480 [c04f0f4e] nf_hook_slow+0x6e/0x120 [c05c0d10] br_nf_pre_routing_finish+0x0/0x480 [c05c1914] br_nf_pre_routing+0x404/0x580 [c05c0d10] br_nf_pre_routing_finish+0x0/0x480 [c04f0eab] nf_iterate+0x6b/0xa0 [c05bcde0] br_handle_frame_finish+0x0/0x160 [c05bcde0] br_handle_frame_finish+0x0/0x160 [c04f0f4e] nf_hook_slow+0x6e/0x120 [c05bcde0] br_handle_frame_finish+0x0/0x160 [c05bd124] br_handle_frame+0x1e4/0x250 [c05bcde0] br_handle_frame_finish+0x0/0x160 [c04ddae5] netif_receive_skb+0x165/0x2a0 [c04ddcdf] process_backlog+0xbf/0x180 [c04ddebf] net_rx_action+0x11f/0x1d0 [c01262e6] __do_softirq+0x86/0x120 [c01263f5] do_softirq+0x75/0x90 [c0106cef] do_IRQ+0x1f/0x30 [c04271d0] evtchn_do_upcall+0x90/0x100 [c0105315] hypervisor_callback+0x3d/0x48 Code: c2 2b 57 24 29 d0 8d 14 2a 89 87 94 00 00 00 89 57 60 8b 44 24 08 83 c4 0c 5b 5e 5f 5d c3 0f 0 b 69 03 fe 8c 66 c0 e9 69 ff ff ff 0f 0b 6d 04 e8 ab 6c c0 e9 3a ff ff ff 0f 0b 6c 04 e8 ab 6c c0 0Kernel panic - not syncing: Fatal exception in interrupt Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: 
http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Xen-devel] kernel BUG at net/core/dev.c:1133!
On Fri, Jul 07, 2006 at 05:03:36PM +0200, Petersson, Mats wrote: So, nothing going on there... I certainly haven't got NAT on my machine, as my machine is within the AMD network, and doesn't need NAT. AMD probably uses NAT as part of its external communications, but I doubt it's used at all internally. Actually, just having it loaded is enough to break TSO. So for all this time anyone who had ip_nat loaded was silently corrupting all their TSO checksums! I'll send a patch soon once I've tested it. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: starting mc triggers lockdep
Arjan van de Ven [EMAIL PROTECTED] wrote: i_mutex is taken within rtln_mutex like this: [8030f4a0] create_dir+0x2c/0x1e2 [8030fa5b] sysfs_create_dir+0x59/0x78 [8034d2e2] kobject_add+0x114/0x1d8 [803bb1e7] class_device_add+0xb5/0x49d [804300b1] netdev_register_sysfs+0x98/0xa2 [80426f58] register_netdevice+0x28c/0x376 [8042709c] register_netdev+0x5a/0x69 creating the AB dependency This is a sysfs inode. now for the third part, which involves the nfs client: stat on an nfs file, which ends up taken the i_mutex of a directory in the path (obvious), and then does [8022800b] tcp_sendmsg+0x1e/0xb1a [80248f4b] inet_sendmsg+0x45/0x53 [80259d25] sock_sendmsg+0x110/0x130 [8041f462] kernel_sendmsg+0x3c/0x52 [885399e9] xs_tcp_send_request+0x117/0x320 [sunrpc] [885388d5] xprt_transmit+0x105/0x21e [sunrpc] [8853771e] call_transmit+0x1f4/0x239 [sunrpc] [8853c06e] __rpc_execute+0x9b/0x1e6 [sunrpc] [8853c1de] rpc_execute+0x1a/0x1d [sunrpc] [885364ad] rpc_call_sync+0x87/0xb9 [sunrpc] [885a2587] nfs3_rpc_wrapper+0x2e/0x74 [nfs] [885a2a14] nfs3_proc_lookup+0xe0/0x163 [nfs] where tcp_sendmsg calls lock_sock. So this is the BC dependency. This is an nfs inode. Did I miss something? Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: auro deadlock (was Re: e100 lockdep irq lock inversion.)
Arjan van de Ven [EMAIL PROTECTED] wrote: Act 1 Enter the mpi_start_xmit() function, which is airo's xmit function. This function takes the aux_lock first, with irq's off, then calls skb_queue_tail(). skb_queue_tail takes the sk_receive_queue.lock (with irqsave as well). Nope, make that ai-txq. Act 2 Enter the ipcalc program. This program calls an ioctl, which ends up calling udp_ioctl. udp_ioctl does spin_lock_bh(sk-sk_receive_queue.lock); Different queue. So no deadlock. Better luck next time :) -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] correct dev_alloc_skb kerneldoc
David Miller [EMAIL PROTECTED] wrote: What is the point of dev_alloc_skb anyway? all it does is add header space. In stone-age times it actually had specific semantics, but yes today it is just a synonym. Does anyone still need those 16 bytes of header space? Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[1/2] [NET] gso: Add skb_is_gso
Hi Dave: These two patches fix the netfilter/checksum/TSO problem where netfilter destroys the partial checksum which breaks TSO. [NET] gso: Add skb_is_gso This patch adds the wrapper function skb_is_gso which can be used instead of directly testing skb_shinfo(skb)-gso_size. This makes things a little nicer and allows us to change the primary key for indicating whether an skb is GSO (if we ever want to do that). Signed-off-by: Herbert Xu [EMAIL PROTECTED] Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -1639,7 +1639,7 @@ bnx2_tx_int(struct bnx2 *bp) skb = tx_buf-skb; #ifdef BCM_TSO /* partial BD completions possible with TSO packets */ - if (skb_shinfo(skb)-gso_size) { + if (skb_is_gso(skb)) { u16 last_idx, last_ring_idx; last_idx = sw_cons + diff --git a/drivers/net/chelsio/sge.c b/drivers/net/chelsio/sge.c --- a/drivers/net/chelsio/sge.c +++ b/drivers/net/chelsio/sge.c @@ -1417,7 +1417,7 @@ int t1_start_xmit(struct sk_buff *skb, s struct cpl_tx_pkt *cpl; #ifdef NETIF_F_TSO - if (skb_shinfo(skb)-gso_size) { + if (skb_is_gso(skb)) { int eth_type; struct cpl_tx_pkt_lso *hdr; diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -2394,7 +2394,7 @@ e1000_tso(struct e1000_adapter *adapter, uint8_t ipcss, ipcso, tucss, tucso, hdr_len; int err; - if (skb_shinfo(skb)-gso_size) { + if (skb_is_gso(skb)) { if (skb_header_cloned(skb)) { err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); if (err) @@ -2519,7 +2519,7 @@ e1000_tx_map(struct e1000_adapter *adapt * tso gets written back prematurely before the data is fully * DMA'd to the controller */ if (!skb-data_len tx_ring-last_tx_tso - !skb_shinfo(skb)-gso_size) { + 
!skb_is_gso(skb)) { tx_ring-last_tx_tso = 0; size -= 4; } @@ -2806,8 +2806,7 @@ e1000_xmit_frame(struct sk_buff *skb, st #ifdef NETIF_F_TSO /* Controller Erratum workaround */ - if (!skb-data_len tx_ring-last_tx_tso - !skb_shinfo(skb)-gso_size) + if (!skb-data_len tx_ring-last_tx_tso !skb_is_gso(skb)) count++; #endif diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c --- a/drivers/net/forcedeth.c +++ b/drivers/net/forcedeth.c @@ -1495,7 +1495,7 @@ static int nv_start_xmit(struct sk_buff np-tx_skbuff[nr] = skb; #ifdef NETIF_F_TSO - if (skb_shinfo(skb)-gso_size) + if (skb_is_gso(skb)) tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)-gso_size NV_TX2_TSO_SHIFT); else #endif diff --git a/drivers/net/ixgb/ixgb_main.c b/drivers/net/ixgb/ixgb_main.c --- a/drivers/net/ixgb/ixgb_main.c +++ b/drivers/net/ixgb/ixgb_main.c @@ -1173,7 +1173,7 @@ ixgb_tso(struct ixgb_adapter *adapter, s uint16_t ipcse, tucse, mss; int err; - if(likely(skb_shinfo(skb)-gso_size)) { + if (likely(skb_is_gso(skb))) { if (skb_header_cloned(skb)) { err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); if (err) diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -139,7 +139,7 @@ static int loopback_xmit(struct sk_buff #endif #ifdef LOOPBACK_TSO - if (skb_shinfo(skb)-gso_size) { + if (skb_is_gso(skb)) { BUG_ON(skb-protocol != htons(ETH_P_IP)); BUG_ON(skb-nh.iph-protocol != IPPROTO_TCP); diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c --- a/drivers/net/myri10ge/myri10ge.c +++ b/drivers/net/myri10ge/myri10ge.c @@ -2116,7 +2116,7 @@ abort_linearize: } idx = (idx + 1) tx-mask; } while (idx != last_idx); - if (skb_shinfo(skb)-gso_size) { + if (skb_is_gso(skb)) { printk(KERN_ERR myri10ge: %s: TSO but wanted to linearize?!?!?\n, mgp-dev-name); diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -1159,7 +1159,7 @@ static unsigned tx_le_req(const struct s 
count = sizeof(dma_addr_t) / sizeof(u32); count += skb_shinfo(skb)-nr_frags * count; - if (skb_shinfo(skb)-gso_size) + if (skb_is_gso(skb)) ++count; if (skb
[2/2] [NET] gso: Fix up GSO packets with broken checksums
Hi: [NET] gso: Fix up GSO packets with broken checksums Certain subsystems in the stack (e.g., netfilter) can break the partial checksum on GSO packets. Until they're fixed, this patch allows this to work by recomputing the partial checksums through the GSO mechanism. Once they've all been converted to update the partial checksum instead of clearing it, this workaround can be removed. Signed-off-by: Herbert Xu [EMAIL PROTECTED] Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -549,6 +549,7 @@ struct packet_type { struct net_device *); struct sk_buff *(*gso_segment)(struct sk_buff *skb, int features); + int (*gso_send_check)(struct sk_buff *skb); void*af_packet_priv; struct list_headlist; }; @@ -1001,13 +1002,14 @@ static inline int net_gso_ok(int feature static inline int skb_gso_ok(struct sk_buff *skb, int features) { - return net_gso_ok(features, skb_is_gso(skb) ? 
- skb_shinfo(skb)-gso_type : 0); + return net_gso_ok(features, skb_shinfo(skb)-gso_type); } static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb) { - return !skb_gso_ok(skb, dev-features); + return skb_is_gso(skb) + (!skb_gso_ok(skb, dev-features) || + unlikely(skb-ip_summed != CHECKSUM_HW)); } #endif /* __KERNEL__ */ diff --git a/include/net/protocol.h b/include/net/protocol.h --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -36,6 +36,7 @@ struct net_protocol { int (*handler)(struct sk_buff *skb); void(*err_handler)(struct sk_buff *skb, u32 info); + int (*gso_send_check)(struct sk_buff *skb); struct sk_buff *(*gso_segment)(struct sk_buff *skb, int features); int no_policy; @@ -51,6 +52,7 @@ struct inet6_protocol int type, int code, int offset, __u32 info); + int (*gso_send_check)(struct sk_buff *skb); struct sk_buff *(*gso_segment)(struct sk_buff *skb, int features); diff --git a/include/net/tcp.h b/include/net/tcp.h --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1086,6 +1086,7 @@ extern struct request_sock_ops tcp_reque extern int tcp_v4_destroy_sock(struct sock *sk); +extern int tcp_v4_gso_send_check(struct sk_buff *skb); extern struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features); #ifdef CONFIG_PROC_FS diff --git a/net/core/dev.c b/net/core/dev.c --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1162,9 +1162,17 @@ int skb_checksum_help(struct sk_buff *sk unsigned int csum; int ret = 0, offset = skb-h.raw - skb-data; - if (inward) { - skb-ip_summed = CHECKSUM_NONE; - goto out; + if (inward) + goto out_set_summed; + + if (unlikely(skb_shinfo(skb)-gso_size)) { + static int warned; + + WARN_ON(!warned); + warned = 1; + + /* Let GSO fix up the checksum. 
*/ + goto out_set_summed; } if (skb_cloned(skb)) { @@ -1181,6 +1189,8 @@ int skb_checksum_help(struct sk_buff *sk BUG_ON(skb-csum + 2 offset); *(u16*)(skb-h.raw + skb-csum) = csum_fold(csum); + +out_set_summed: skb-ip_summed = CHECKSUM_NONE; out: return ret; @@ -1201,17 +1211,35 @@ struct sk_buff *skb_gso_segment(struct s struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_type *ptype; int type = skb-protocol; + int err; BUG_ON(skb_shinfo(skb)-frag_list); - BUG_ON(skb-ip_summed != CHECKSUM_HW); skb-mac.raw = skb-data; skb-mac_len = skb-nh.raw - skb-data; __skb_pull(skb, skb-mac_len); + if (unlikely(skb-ip_summed != CHECKSUM_HW)) { + static int warned; + + WARN_ON(!warned); + warned = 1; + + if (skb_header_cloned(skb) + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + return ERR_PTR(err); + } + rcu_read_lock(); list_for_each_entry_rcu(ptype, ptype_base[ntohs(type) 15], list) { if (ptype-type == type !ptype-dev ptype-gso_segment) { + if (unlikely(skb-ip_summed != CHECKSUM_HW)) { + err = ptype
Re: airo maybe should select crypto_aes
Robert Schulze [EMAIL PROTECTED] wrote: I first wrote to the linux-pcmcia ML, but they said it wasn't the right address for my issue. The driver airo (for Cisco Wlan-Cards) complains about failed to load transform for AES, when it is loaded and CRYPTO_AES is not selected in Kconfig. I've got a patch for that, maybe it's worth it. First off, your patch is space-damaged. Please make sure that the original tabs are preserved and resend. Also, wireless patches should be sent to this list with a cc to John W. Linville [EMAIL PROTECTED]. Thanks, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: starting mc triggers lockdep
On Sat, Jul 08, 2006 at 11:53:20AM +0200, Arjan van de Ven wrote: now for the third part, which involves the nfs client: stat on an nfs file, which ends up taken the i_mutex of a directory in the path (obvious), and then does [8022800b] tcp_sendmsg+0x1e/0xb1a [80248f4b] inet_sendmsg+0x45/0x53 [80259d25] sock_sendmsg+0x110/0x130 [8041f462] kernel_sendmsg+0x3c/0x52 [885399e9] xs_tcp_send_request+0x117/0x320 [sunrpc] [885388d5] xprt_transmit+0x105/0x21e [sunrpc] [8853771e] call_transmit+0x1f4/0x239 [sunrpc] [8853c06e] __rpc_execute+0x9b/0x1e6 [sunrpc] [8853c1de] rpc_execute+0x1a/0x1d [sunrpc] [885364ad] rpc_call_sync+0x87/0xb9 [sunrpc] [885a2587] nfs3_rpc_wrapper+0x2e/0x74 [nfs] [885a2a14] nfs3_proc_lookup+0xe0/0x163 [nfs] where tcp_sendmsg calls lock_sock. So this is the BC dependency. This is an nfs inode. Did I miss something? is it not possible to nfs export /sys, and then mount it over loopback? Possibly, but not with the backtrace above. You'd need an nfs server backtrace to get the real sysfs inode. In any case, the sock lock from the other backtrace that you had (udp setsockopt) cannot be held by the kernel nfs client or server since the kernel nfs sockets are not visible to user space. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.17-mm3
Michal Piotrowski [EMAIL PROTECTED] wrote: It was moved, sorry. I fail to spot any relevant backtraces for skge or indeed any part of the networking stack. Ingo/Arjan, perhaps you guys can figure out what's wrong here. In future perhaps you should consider posting the dmesg to the list directly. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[IPV4] inetpeer: Get rid of volatile from peer_total
Hi Dave: Just joining in on the fun on volatile :) [IPV4] inetpeer: Get rid of volatile from peer_total The variable peer_total is protected by a lock. The volatile marker makes no sense. This shaves off 20 bytes on i386. Signed-off-by: Herbert Xu [EMAIL PROTECTED] Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 2160874..03ff62e 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -86,7 +86,7 @@ static struct inet_peer *peer_root = pee static DEFINE_RWLOCK(peer_pool_lock); #define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ -static volatile int peer_total; +static int peer_total; /* Exported for sysctl_net_ipv4. */ int inet_peer_threshold = 65536 + 128; /* start to throw entries more * aggressively at this stage */ - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: ipsec tunnel asymmetrical mtu
Hi Marco: On Mon, Apr 24, 2006 at 09:23:00AM +, Marco Berizzi wrote: What should I do? Mangling MSS with iptables --set-mss ? Altering MSS to 1440 did the trick. See: http://marc.theaimsgroup.com/?l=linux-netdev&m=114373067423528&w=2 Yes that's enough, although proper PMTU would be better. and here is snmp when the sapgui client has told me that the connections has been reset: [EMAIL PROTECTED]:/var/log# cat SNMP-CONN-RESET Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqds ReasmOKs ReasmFails FragOKs FragFails FragCreates Ip: 1 64 79257 0 31 48139 0 0 38799 56650 2 0 2 182 90 2 90 0 124 OK, the number of reassemblies equals the number of FragOKs. So it does not look like there is a problem within mimosa, i.e., Linux. I've looked at your packet dumps again and in fact there is no qualitative difference between WITHTCPDUMP and WITHOUTTCPDUMP. What is different is that the latter seems to have experienced a higher packet loss rate at an early stage and therefore the sender has already backed off to a very long retry period. The fact that tcpdump makes a difference could simply be that it changes the timing of the fragment transmissions on mimosa which has an impact on the loss rate between mimosa and pleiadi. We can say these things for certain: 1) The path between mimosa and pleiadi has a packet loss problem. A small burst of 10 or so fragments is enough to cause at least half of them to be lost. This problem may be specific to IPsec traffic (ISPs often discriminate against traffic with protocols other than TCP/UDP). 2) Fragmentation exacerbates the packet loss problem because it increases the number of packets and a packet is lost if only one of its fragments is lost. 3) The fact that the TCP connections are not using PMTU causes fragmentation in the presence of IPsec. 
From what I've seen, there does not appear to be a bug in Linux that could explain the behaviour change that is seen when you run tcpdump. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: ipsec tunnel asymmetrical mtu
On Mon, May 08, 2006 at 08:28:32AM +, Marco Berizzi wrote: [EMAIL PROTECTED]:~# ping 10.49.59.23 PING 10.49.59.23 (10.49.59.23) 56(84) bytes of data. 64 bytes from 10.49.59.23: icmp_seq=1 ttl=247 time=91.9 ms 64 bytes from 10.49.59.23: icmp_seq=2 ttl=247 time=49.3 ms 64 bytes from 10.49.59.23: icmp_seq=3 ttl=247 time=106 ms 64 bytes from 10.49.59.23: icmp_seq=4 ttl=247 time=74.3 ms --- 10.49.59.23 ping statistics --- 4 packets transmitted, 4 received, 0% packet loss, time 2998ms rtt min/avg/max/mdev = 49.316/80.460/106.257/21.241 ms [EMAIL PROTECTED]:~# cd /tmp/ [EMAIL PROTECTED]:/tmp# tcpdump -v -p -n ip host 10.49.59.23 /tmp/NULL-10.49.59.23 [1] 18981 [EMAIL PROTECTED]:/tmp# tcpdump: listening on eth0, link-type EN10MB (Ethernet), capture size 96 bytes [EMAIL PROTECTED]:/tmp# ping 10.49.59.23 PING 10.49.59.23 (10.49.59.23) 56(84) bytes of data. --- 10.49.59.23 ping statistics --- 8 packets transmitted, 0 received, 100% packet loss, time 6999ms [EMAIL PROTECTED]:/tmp# fg tcpdump -v -p -n ip host 10.49.59.23 /tmp/NULL-10.49.59.23 101 packets captured 101 packets received by filter 0 packets dropped by kernel Yes this is really weird. The only thing I can think of is that it somehow managed to put some bogus entry into the conntrack table. What happens if you do grep 10.49.59.23 /proc/net/ip_conntrack before and after the tcpdump? Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: What is RDMA (was: RDMA will be reverted)
On Fri, Jul 07, 2006 at 01:25:44PM -0500, Steve Wise wrote: Some IP networking is involved for this. IP addresses and port numbers are used by the RDMA Connection Manager. The motivation for this was two-fold, I think: 1) to simplify the connection setup model. The IB CM model was very complex. 2) to allow ULPs to be transport independent. Thus a single code base for NFSoRDMA, for example, can run over Infiniband and RDMA/TCP transports without code changes or knowing about transport-specific addressing. The routing table is also consulted to determine which rdma device should be used for connection setup. Each rdma device also installs a netdev device for native stack traffic. The RDMA CM maintains an association between the netdev device and the rdma device. And the Infiniband subsystem uses ARP over IPoIB to map IP addresses to GID/QPN info. This is done by calling arp_send() directly, and snooping all ARP packets to discover when the arp entry is completed. This sounds interesting. Since this is going to be IB-neutral, what about moving high-level logic like this out of drivers/infiniband and into net? That way the rest of the networking community can add input into how things are done. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] cancel_rearming_delayed_work infinite loop fix
Michael Buesch [EMAIL PROTECTED] wrote: cancel_rearming_delayed_work{queue} is broken, because it is possible to enter an infinite loop if: We call the function on a work that is currently not executing or pending. Why are you calling it on a work that was never scheduled? Sounds like a bug to me. void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, struct work_struct *work) { - while (!cancel_delayed_work(work)) + do { + cancel_delayed_work(work); flush_workqueue(wq); + } while (test_bit(0, work-pending)); This is broken. If the work just starts running before your test_bit you'd exit without cancelling it properly. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: ipsec tunnel asymmetrical mtu
On Tue, Jul 11, 2006 at 11:22:18AM +0200, Marco Berizzi wrote: I'm able to connect to a sap server connected to the milano network from a sapgui client connected to the venezia network. No problem. If packet loss is a problem it should be also a problem with this tunnel. Am I wrong? It depends. A mild packet loss problem can become a big one when it is exacerbated by fragmentation, e.g., a 20% rate can become 40%. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Patch 6/6] per task delay accounting taskstats interface: fix clone skbs for each listener
Andrew Morton [EMAIL PROTECTED] wrote: On Tue, 11 Jul 2006 00:36:39 -0400 Shailabh Nagar [EMAIL PROTECTED] wrote: down_write(listeners-sem); list_for_each_entry_safe(s, tmp, listeners-list, list) { - ret = genlmsg_unicast(skb, s-pid); + skb_next = NULL; + if (!list_islast(s-list, listeners-list)) { + skb_next = skb_clone(skb_cur, GFP_KERNEL); If we do a GFP_KERNEL allocation with this semaphore held, and the oom-killer tries to kill something to satisfy the allocation, and the killed task gets stuck on that semaphore, I wonder if the box locks up. We do GFP_KERNEL inside semaphores/mutexes in lots of places. So if this can deadlock with the oom-killer we probably should fix that, preferably by having GFP_KERNEL fail in that case. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Patch 6/6] per task delay accounting taskstats interface: fix clone skbs for each listener
On Tue, Jul 11, 2006 at 03:57:31AM -0700, Andrew Morton wrote: down_write(listeners-sem); list_for_each_entry_safe(s, tmp, listeners-list, list) { - ret = genlmsg_unicast(skb, s-pid); + skb_next = NULL; + if (!list_islast(s-list, listeners-list)) { + skb_next = skb_clone(skb_cur, GFP_KERNEL); If we do a GFP_KERNEL allocation with this semaphore held, and the oom-killer tries to kill something to satisfy the allocation, and the killed task gets stuck on that semaphore, I wonder if the box locks up. We do GFP_KERNEL inside semaphores/mutexes in lots of places. So if this can deadlock with the oom-killer we probably should fix that, preferably by having GFP_KERNEL fail in that case. This lock is special, in that it's taken on the exit() path (I think). So it can block tasks which are trying to exit. Sorry, missed the context. If there is a deadlock then it's not just this allocation that you need to worry about. There is also an allocation within genlmsg_unicast that would be GFP_KERNEL. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: ipsec tunnel asymmetrical mtu
On Tue, Jul 11, 2006 at 11:31:33AM +0200, Marco Berizzi wrote: Me again. After a while here is: [EMAIL PROTECTED]:/tmp# ping 10.49.59.23 PING 10.49.59.23 (10.49.59.23) 56(84) bytes of data. --- 10.49.59.23 ping statistics --- 4 packets transmitted, 0 received, 100% packet loss, time 3010ms Please check using ip -s x p to make sure that the packet is hitting the right policy. Unfortunately we don't update the byte/packet counters so you'll have to look at the `use' time stamp. If it's passing through IPsec, then you should trace through your iptables rules using the LOG target to see if it's hitting them. We need to know if it's being dropped before, in, or after netfilter. Please also do ip r g 10.49.59.23 to make sure that it says something sane. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[IPCOMP]: Fix truesize after decompression
On Thu, Jul 06, 2006 at 12:53:45PM +, Beschorner Daniel wrote: Does it harm? SKB BUG: Invalid truesize (380) len=1383, sizeof(sk_buff)=156 SKB BUG: Invalid truesize (316) len=1383, sizeof(sk_buff)=156 SKB BUG: Invalid truesize (348) len=1383, sizeof(sk_buff)=156 SKB BUG: Invalid truesize (316) len=1383, sizeof(sk_buff)=156 SKB BUG: Invalid truesize (380) len=1383, sizeof(sk_buff)=156 I found it in the log of a 2.6.17 box using IPSEC tunnels. It's not fatal, but it does stuff up socket accounting. Unfortunately getting totally accurate truesizes is not easy due to the large numbers of pskb_expand_head calls scattered around the stack. [IPCOMP]: Fix truesize after decompression The truesize check has uncovered the fact that we forgot to update truesize after pskb_expand_head. Unfortunately pskb_expand_head can't update it for us because it's used in all sorts of different contexts, some of which would not allow truesize to be updated by itself. So the solution for now is to simply update it in IPComp. This patch also changes skb_put to __skb_put since we've just expanded tailroom by exactly that amount so we know it's there (but gcc does not). 
Signed-off-by: Herbert Xu [EMAIL PROTECTED] Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 8e03748..8a8b5cf 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -70,7 +70,8 @@ static int ipcomp_decompress(struct xfrm if (err) goto out; - skb_put(skb, dlen - plen); + skb-truesize += dlen - plen; + __skb_put(skb, dlen - plen); memcpy(skb-data, scratch, dlen); out: put_cpu(); diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index b285b03..7e4d1c1 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -109,7 +109,8 @@ static int ipcomp6_input(struct xfrm_sta goto out_put_cpu; } - skb_put(skb, dlen - plen); + skb-truesize += dlen - plen; + __skb_put(skb, dlen - plen); memcpy(skb-data, scratch, dlen); err = ipch-nexthdr; - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: ipsec tunnel asymmetrical mtu
On Tue, Jul 11, 2006 at 12:32:45PM +0200, Marco Berizzi wrote: Running this on mimosa 'mitigates' the problem: ip addr add 172.29.128.1/28 dev eth2 Connections are pretty slow but they aren't reseted anymore. Hmm, I thought 172.29.128.1 was already a local address? What does ip addr show? Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Problems with xfrm (IPSec) and multicast
On Wed, Jun 14, 2006 at 01:09:59PM +, Roar Bjørgum Rotvik wrote: So I cannot make encrypted multicast traffic to flow both ways at the same time, and has no clue as to why the first packets after changing direction is dropped somewhere. Sounds like conntrack. Check /proc/net/ip_conntrack when this happens. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [IPCOMP]: Fix truesize after decompression
On Tue, Jul 11, 2006 at 01:55:53PM -0700, David Miller wrote: I think it is possible cover a certain class of these situations from within pskb_expand_head. For example, if skb-sk is NULL we can prove that updating skb-truesize is safe since no socket's buffer accounting can possibly depend upon the truesize value of this skb. Yes that's certainly possible. However, we'll need to audit the few spots (e.g., ATM) that use truesize without setting skb-sk. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [IPCOMP]: Fix truesize after decompression
On Tue, Jul 11, 2006 at 04:22:17PM -0700, David Miller wrote: What ATM is doing here is charging the SKB to the virtual circuit sockets. At least in the few cases I've looked at just now, the skb is some private ATM level signalling message, and not part of a normal transmit/receive packet from the normal networking stack. Indeed, at least they do have an sk to charge things to :) -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [BUG] Two BUG warnings in net/core/dev.c
john stultz [EMAIL PROTECTED] wrote: Both of these were seen on my laptop w/ the current (as of this writing) -git tree using the e1000 driver after a suspend/resume cycle. It's just a reminder that we need to fix NAT to update checksums incrementally. You'll only see it once per boot. BUG: warning at net/core/dev.c:1171/skb_checksum_help() [c0103d69] show_trace_log_lvl+0x149/0x170 [c01052bb] show_trace+0x1b/0x20 [c01052e4] dump_stack+0x24/0x30 [c03c7523] skb_checksum_help+0x163/0x170 [c0439c15] ip_nat_fn+0x1a5/0x210 Of course, if anyone sees it with a backtrace that does not contain ip_nat_fn, please let us know. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Unnecessary check in __sk_stream_mem_reclaim?
Ian McDonald [EMAIL PROTECTED] wrote: It looks to me like this check here in net/core/stream.c for __sk_stream_mem_reclaim: if (sk->sk_forward_alloc >= SK_STREAM_MEM_QUANTUM) { is unnecessary. It's needed after skb's have been freed which can push sk_forward_alloc above a quantum. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Unnecessary check in __sk_stream_mem_reclaim?
On Wed, Jul 12, 2006 at 03:17:43PM +1200, Ian McDonald wrote: I'm not saying the check is unneeded - just saying doing it twice is unneeded. Right, got you this time. I don't think we need to worry about people who use __sk_stream_mem_reclaim when there is a perfectly good sk_stream_mem_reclaim around. Besides, this function has only been exported since 2004 so it's highly unlikely for there to be stuff out there using it. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] bonding: enhance the IP address check of arp_ip_target
Stephen Hemminger [EMAIL PROTECTED] wrote: Why not just use sscanf? Better yet, use a better interface like netlink rather than module parameters. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Bug 6688] Memory allocation problem
Andrew Morton [EMAIL PROTECTED] wrote: On Mon, 19 Jun 2006 23:46:08 -0700 [EMAIL PROTECTED] wrote: http://bugzilla.kernel.org/show_bug.cgi?id=6688 This is looking like a net memory leak in 2.6.16. 1/3rd is in ip_fib_alias and 2/3rds is in size-64. I've asked the reporter to apply the leak detector patch so we can find out who is using the size-64 part. I had a look at fib_trie.c and found a bug. This is probably not the cause of the leak since it looks more likely to cause a crash than anything. However, please retest with this applied just to be sure. [IPV4]: Fix error handling for fib_insert_node call The error handling around fib_insert_node was broken because we always zeroed the error before checking it. Signed-off-by: Herbert Xu [EMAIL PROTECTED] Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Bug 6688] Memory allocation problem
On Wed, Jul 12, 2006 at 09:44:46PM +1000, Herbert Xu wrote: [IPV4]: Fix error handling for fib_insert_node call Doh, forgot the patch. [IPV4]: Fix error handling for fib_insert_node call The error handling around fib_insert_node was broken because we always zeroed the error before checking it. Signed-off-by: Herbert Xu [EMAIL PROTECTED] Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 1cb6530..23fb9d9 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1252,8 +1252,8 @@ fn_trie_insert(struct fib_table *tb, str */ if (!fa_head) { - fa_head = fib_insert_node(t, err, key, plen); err = 0; + fa_head = fib_insert_node(t, err, key, plen); if (err) goto out_free_new_fa; } - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/1] net: fix __sk_stream_mem_reclaim
Ian McDonald [EMAIL PROTECTED] wrote: __sk_stream_mem_reclaim is only called by sk_stream_mem_reclaim. As such the check on sk-sk_forward_alloc is not needed and can be removed. Signed-off-by: Ian McDonald [EMAIL PROTECTED] Acked-by: Herbert Xu [EMAIL PROTECTED] What's more, even if the check does turn out to be false for some renegade caller, the function will still work since all the operations turn out to be no-ops. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[NET]: Update frag_list in pskb_trim
Hi Dave: This needs to go into stable as well. In fact, there is another unrelated bug with exactly the same symptoms which was inadvertently fixed by the GSO patches. So I'll send a simpler fix for that to stable. [NET]: Update frag_list in pskb_trim When pskb_trim has to defer to ___pskb_trim to trim the frag_list part of the packet, the frag_list is not updated to reflect the trimming. This will usually work fine until you hit something that uses the packet length or tail from the frag_list. Examples include esp_output and ip_fragment. Another problem caused by this is that you can end up with a linear packet with a frag_list attached. It is possible to get away with this if we audit everything to make sure that they always consult skb->len before going down onto frag_list. In fact we can do the same thing for the paged part as well to avoid copying the data area of the skb. For now though, let's do the conservative fix and update frag_list. Many thanks to Marco Berizzi for helping me to track down this bug. This 4-year old bug took 3 months to track down. 
Marco was very patient indeed :) Signed-off-by: Herbert Xu [EMAIL PROTECTED] Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 44f6a18..476aa39 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -257,11 +257,11 @@ nodata: } -static void skb_drop_fraglist(struct sk_buff *skb) +static void skb_drop_list(struct sk_buff **listp) { - struct sk_buff *list = skb_shinfo(skb)-frag_list; + struct sk_buff *list = *listp; - skb_shinfo(skb)-frag_list = NULL; + *listp = NULL; do { struct sk_buff *this = list; @@ -270,6 +270,11 @@ static void skb_drop_fraglist(struct sk_ } while (list); } +static inline void skb_drop_fraglist(struct sk_buff *skb) +{ + skb_drop_list(skb_shinfo(skb)-frag_list); +} + static void skb_clone_fraglist(struct sk_buff *skb) { struct sk_buff *list; @@ -830,41 +835,75 @@ free_skb: int ___pskb_trim(struct sk_buff *skb, unsigned int len) { + struct sk_buff **fragp; + struct sk_buff *frag; int offset = skb_headlen(skb); int nfrags = skb_shinfo(skb)-nr_frags; int i; + int err; + + if (skb_cloned(skb) + unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC + return err; for (i = 0; i nfrags; i++) { int end = offset + skb_shinfo(skb)-frags[i].size; - if (end len) { - if (skb_cloned(skb)) { - if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - return -ENOMEM; - } - if (len = offset) { - put_page(skb_shinfo(skb)-frags[i].page); - skb_shinfo(skb)-nr_frags--; - } else { - skb_shinfo(skb)-frags[i].size = len - offset; - } + + if (end len) { + offset = end; + continue; } - offset = end; + + if (len offset) + skb_shinfo(skb)-frags[i++].size = len - offset; + + skb_shinfo(skb)-nr_frags = i; + + for (; i nfrags; i++) + put_page(skb_shinfo(skb)-frags[i].page); + + if (skb_shinfo(skb)-frag_list) + skb_drop_fraglist(skb); + break; } - if (offset len) { + for 
(fragp = skb_shinfo(skb)-frag_list; (frag = *fragp); +fragp = frag-next) { + int end = offset + frag-len; + + if (skb_shared(frag)) { + struct sk_buff *nfrag; + + nfrag = skb_clone(frag, GFP_ATOMIC); + if (unlikely(!nfrag)) + return -ENOMEM; + + nfrag-next = frag-next; + frag = nfrag; + *fragp = frag; + } + + if (end len) { + offset = end; + continue; + } + + if (end len + unlikely((err = pskb_trim(frag, len - offset + return err; + + if (frag-next) + skb_drop_list(frag-next); + break; + } + + if (len skb_headlen(skb)) { skb-data_len -= skb-len - len; skb-len = len; } else { - if (len = skb_headlen(skb)) { - skb-len = len; - skb-data_len = 0
[NET]: Add missing UFO initialisations
Hi: This is only needed for 2.6.17-stable. [NET]: Add missing UFO initialisations This bug was unknowingly fixed by the GSO patches (or rather, its effect was unknown at the time). Thanks to Marco Berizzi's persistence which is documented in the thread "ipsec tunnel asymmetrical mtu", we now know that it can have highly non-obvious symptoms. What happens is that uninitialised ufo_size fields can cause packets to be incorrectly identified as UFO, which means that it does not get fragmented even if it's over the MTU. The fix is simple enough. Signed-off-by: Herbert Xu [EMAIL PROTECTED] Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- diff --git a/net/core/dev.c b/net/core/dev.c index 4fba549..7d472ed 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1246,6 +1246,7 @@ int __skb_linearize(struct sk_buff *skb, atomic_set(ninfo-dataref, 1); ninfo-tso_size = skb_shinfo(skb)-tso_size; ninfo-tso_segs = skb_shinfo(skb)-tso_segs; + ninfo-ufo_size = skb_shinfo(skb)-ufo_size; ninfo-nr_frags = 0; ninfo-frag_list = NULL; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fb3770f..0280535 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -240,6 +240,7 @@ struct sk_buff *alloc_skb_from_cache(kme skb_shinfo(skb)-nr_frags = 0; skb_shinfo(skb)-tso_size = 0; skb_shinfo(skb)-tso_segs = 0; + skb_shinfo(skb)-ufo_size = 0; skb_shinfo(skb)-frag_list = NULL; out: return skb; @@ -529,6 +530,7 @@ #endif atomic_set(new-users, 1); skb_shinfo(new)-tso_size = skb_shinfo(old)-tso_size; skb_shinfo(new)-tso_segs = skb_shinfo(old)-tso_segs; + skb_shinfo(new)-ufo_size = skb_shinfo(old)-ufo_size; } /** - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: another networking lockdep bug
Dave Jones [EMAIL PROTECTED] wrote: Not sure if this one got reported/fixed yet, as I was running a kernel from sometime last week.. I think we've seen a couple of similar ones, this one is more elaborate though :) - #1 (rtnl_mutex){--..}: [802ab792] lock_acquire+0x4a/0x69 [802691c2] __mutex_lock_slowpath+0xeb/0x29f [8026939f] mutex_lock+0x29/0x2e [8042d973] rtnl_lock+0xf/0x12 [8045c18a] ip_mc_leave_group+0x1e/0xae [80446087] do_ip_setsockopt+0x6ad/0x9b2 [8044643a] ip_setsockopt+0x2a/0x84 [80454328] udp_setsockopt+0xd/0x1c [8041f094] sock_common_setsockopt+0xe/0x11 [8041e20f] sys_setsockopt+0x8e/0xb4 [80262fd9] tracesys+0xd0/0xdb - #0 (sk_lock-AF_INET){--..}: [802ab792] lock_acquire+0x4a/0x69 [8023726c] lock_sock+0xd4/0xe7 [80228061] tcp_sendmsg+0x1e/0xb1a [80248ff8] inet_sendmsg+0x45/0x53 [80259dd3] sock_sendmsg+0x110/0x130 [8041ed0c] kernel_sendmsg+0x3c/0x52 [8853c9e9] xs_tcp_send_request+0x117/0x320 [sunrpc] [8853b8d5] xprt_transmit+0x105/0x21e [sunrpc] [8853a71e] call_transmit+0x1f4/0x239 [sunrpc] [8853f06e] __rpc_execute+0x9b/0x1e6 [sunrpc] [8853f1de] rpc_execute+0x1a/0x1d [sunrpc] [885394ad] rpc_call_sync+0x87/0xb9 [sunrpc] [885a5587] nfs3_rpc_wrapper+0x2e/0x74 [nfs] [885a5870] nfs3_proc_setattr+0x9b/0xd3 [nfs] [8859bffb] nfs_setattr+0xe9/0x11e [nfs] [8022f7b4] notify_change+0x154/0x2f7 [802e00c7] do_truncate+0x52/0x72 [80212d17] may_open+0x1d5/0x231 [8021c270] open_namei+0x290/0x6b4 [80229974] do_filp_open+0x27/0x46 [8021acb7] do_sys_open+0x4e/0xcd [80234b2a] sys_open+0x1a/0x1d [80262fd9] tracesys+0xd0/0xdb We know this is a false positive because the NFS sockets are not exported to user-space and therefore #1 can't happen. 
Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: IPSec + large packets being corrupted
Chris Audley [EMAIL PROTECTED] wrote: Large packet (eg. 1600 byte ping) received by VPN server A. Packet encrypted and fragmented then sent from Server A to Server B. Packet received by network subsystem on B and frag_list created. ah_input() strips the AH header -- frag sizes are not changed! esp_input() decrypts data. ip_fragment() uses existing frag_list sizes from before the AH header being stripped, and sends too much data (16 bytes extra). This breaks the checksum and packets get dropped by destination host. Aha, this sounds exactly like the bug I fixed today for Marco Berizzi. The following patch should fix the problem for you. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- [NET]: Update frag_list in pskb_trim When pskb_trim has to defer to ___pskb_trim to trim the frag_list part of the packet, the frag_list is not updated to reflect the trimming. This will usually work fine until you hit something that uses the packet length or tail from the frag_list. Examples include esp_output and ip_fragment. Another problem caused by this is that you can end up with a linear packet with a frag_list attached. It is possible to get away with this if we audit everything to make sure that they always consult skb->len before going down onto frag_list. In fact we can do the same thing for the paged part as well to avoid copying the data area of the skb. For now though, let's do the conservative fix and update frag_list. Many thanks to Marco Berizzi for helping me to track down this bug. This 4-year old bug took 3 months to track down. 
Marco was very patient indeed :) Signed-off-by: Herbert Xu [EMAIL PROTECTED] diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 44f6a18..476aa39 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -257,11 +257,11 @@ nodata: } -static void skb_drop_fraglist(struct sk_buff *skb) +static void skb_drop_list(struct sk_buff **listp) { - struct sk_buff *list = skb_shinfo(skb)-frag_list; + struct sk_buff *list = *listp; - skb_shinfo(skb)-frag_list = NULL; + *listp = NULL; do { struct sk_buff *this = list; @@ -270,6 +270,11 @@ static void skb_drop_fraglist(struct sk_ } while (list); } +static inline void skb_drop_fraglist(struct sk_buff *skb) +{ + skb_drop_list(skb_shinfo(skb)-frag_list); +} + static void skb_clone_fraglist(struct sk_buff *skb) { struct sk_buff *list; @@ -830,41 +835,75 @@ free_skb: int ___pskb_trim(struct sk_buff *skb, unsigned int len) { + struct sk_buff **fragp; + struct sk_buff *frag; int offset = skb_headlen(skb); int nfrags = skb_shinfo(skb)-nr_frags; int i; + int err; + + if (skb_cloned(skb) + unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC + return err; for (i = 0; i nfrags; i++) { int end = offset + skb_shinfo(skb)-frags[i].size; - if (end len) { - if (skb_cloned(skb)) { - if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - return -ENOMEM; - } - if (len = offset) { - put_page(skb_shinfo(skb)-frags[i].page); - skb_shinfo(skb)-nr_frags--; - } else { - skb_shinfo(skb)-frags[i].size = len - offset; - } + + if (end len) { + offset = end; + continue; } - offset = end; + + if (len offset) + skb_shinfo(skb)-frags[i++].size = len - offset; + + skb_shinfo(skb)-nr_frags = i; + + for (; i nfrags; i++) + put_page(skb_shinfo(skb)-frags[i].page); + + if (skb_shinfo(skb)-frag_list) + skb_drop_fraglist(skb); + break; } - if (offset len) { + for (fragp = skb_shinfo(skb)-frag_list; (frag = *fragp); +fragp = frag-next) { + int end = offset + frag-len; + + if (skb_shared(frag)) { + struct sk_buff *nfrag; + + nfrag = skb_clone(frag, GFP_ATOMIC); + if 
(unlikely(!nfrag)) + return -ENOMEM; + + nfrag-next = frag-next; + frag = nfrag; + *fragp = frag; + } + + if (end len) { + offset = end; + continue; + } + + if (end len + unlikely((err = pskb_trim(frag, len - offset
Re: [Bugme-new] [Bug 6430] New: ipsec tunnel : reply is not forwarded
Raphael Astier [EMAIL PROTECTED] wrote: On GW1 : #setkey -f flush; spdflush; add 192.168.1.1 192.168.1.2 esp 1000 -m tunnel -E des-cbc 12345678; spdadd 10.0.0.0/24 11.0.0.0/24 any -P out ipsec esp/tunnel/192.168.1.1-192.168.1.2/require; On GW2 : (only need to have SPI to decrypt packets coming from GW1) #!/usr/local/sbin/setkey -f flush; spdflush; add -n 192.168.1.1 192.168.1.2 esp 1000 -m tunnel -E des-cbc 12345678; This can't possibly work since inbound policies are required for tunnel-mode SAs (otherwise people can send packets with arbitrary source addresses once they have a tunnel-mode SA with you). So you need at least 1 more policy on GW1 and 2 policies on GW2 for this to have a chance of working. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: jumbo frames and memory fragmentation
Chris Friesen [EMAIL PROTECTED] wrote: Herbert Xu wrote: Either upgrade your kernel or backport the page-splitting code in the current tree. That's really the only sane solution for jumbo packets. Looking at the page-splitting code, it says 82571 and greater support packet-split We're running the 82546GB device. Looks like it won't help me. Well, time to fork out for a new card then :) -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: possible dos / wsize affected frozen connection length
CaT [EMAIL PROTECTED] wrote: I'm just wondering if connections hanging around this long are normal. The above has now been running for 6 days. netstat is still reporting an established session. netcat has not timed out. It's all just sitting there doing nothing. TCP connections without keepalives can sit there for all eternity, if your machine lasts that long :) -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.18-rc1-mm2
On Fri, Jul 14, 2006 at 12:05:51AM -0700, Andrew Morton wrote: Call Trace: [8026963e] show_trace+0xae/0x265 [8026980a] dump_stack+0x15/0x1b [8043ba7b] skb_checksum_help+0x61/0x126 [8802f35f] :iptable_nat:ip_nat_fn+0x5f/0x1d2 This is telling you that there is a bug in ip_nat_fn in that it completes the partial checksum even for TSO packets which will cause them to go out with bogus checksums. The warning also indicates that the system has detected this and has worked around it by recomputing the partial checksum after NAT. The warning is here so someone can fix NAT to not trash the partial checksum. It would also tell us if anyone else breaks checksums in this way. I've already made the warning appear only once per-boot so I'd really like to keep it in until 1) NAT is fixed. 2) We're reasonably sure there's nothing else doing this. Prior to this change your TSO packets would've gone out with corrupted checksums silently. Essentially TSO would only slow your machine down since every transmission it makes has to be retransmitted as non-TSO. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [NET]: Update frag_list in pskb_trim
On Thu, Jul 13, 2006 at 10:05:26PM -0700, David Miller wrote: As I noted already, this is in my tree and will go off to Linus soon. Please toss this over to -stable under separate cover, if you haven't done so already. Please add my signoff: Signed-off-by: David S. Miller [EMAIL PROTECTED] Great. Here we go: [NET]: Update frag_list in pskb_trim When pskb_trim has to defer to ___pskb_trim to trim the frag_list part of the packet, the frag_list is not updated to reflect the trimming. This will usually work fine until you hit something that uses the packet length or tail from the frag_list. Examples include esp_output and ip_fragment. Another problem caused by this is that you can end up with a linear packet with a frag_list attached. It is possible to get away with this if we audit everything to make sure that they always consult skb->len before going down onto frag_list. In fact we can do the same thing for the paged part as well to avoid copying the data area of the skb. For now though, let's do the conservative fix and update frag_list. Many thanks to Marco Berizzi for helping me to track down this bug. This 4-year old bug took 3 months to track down. Marco was very patient indeed :) Signed-off-by: Herbert Xu [EMAIL PROTECTED] Signed-off-by: David S. 
Miller [EMAIL PROTECTED] Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 44f6a18..476aa39 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -257,11 +257,11 @@ nodata: } -static void skb_drop_fraglist(struct sk_buff *skb) +static void skb_drop_list(struct sk_buff **listp) { - struct sk_buff *list = skb_shinfo(skb)-frag_list; + struct sk_buff *list = *listp; - skb_shinfo(skb)-frag_list = NULL; + *listp = NULL; do { struct sk_buff *this = list; @@ -270,6 +270,11 @@ static void skb_drop_fraglist(struct sk_ } while (list); } +static inline void skb_drop_fraglist(struct sk_buff *skb) +{ + skb_drop_list(skb_shinfo(skb)-frag_list); +} + static void skb_clone_fraglist(struct sk_buff *skb) { struct sk_buff *list; @@ -830,41 +835,75 @@ free_skb: int ___pskb_trim(struct sk_buff *skb, unsigned int len) { + struct sk_buff **fragp; + struct sk_buff *frag; int offset = skb_headlen(skb); int nfrags = skb_shinfo(skb)-nr_frags; int i; + int err; + + if (skb_cloned(skb) + unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC + return err; for (i = 0; i nfrags; i++) { int end = offset + skb_shinfo(skb)-frags[i].size; - if (end len) { - if (skb_cloned(skb)) { - if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - return -ENOMEM; - } - if (len = offset) { - put_page(skb_shinfo(skb)-frags[i].page); - skb_shinfo(skb)-nr_frags--; - } else { - skb_shinfo(skb)-frags[i].size = len - offset; - } + + if (end len) { + offset = end; + continue; } - offset = end; + + if (len offset) + skb_shinfo(skb)-frags[i++].size = len - offset; + + skb_shinfo(skb)-nr_frags = i; + + for (; i nfrags; i++) + put_page(skb_shinfo(skb)-frags[i].page); + + if (skb_shinfo(skb)-frag_list) + skb_drop_fraglist(skb); + break; } - if (offset len) { + for (fragp = skb_shinfo(skb)-frag_list; (frag = *fragp); 
+fragp = frag-next) { + int end = offset + frag-len; + + if (skb_shared(frag)) { + struct sk_buff *nfrag; + + nfrag = skb_clone(frag, GFP_ATOMIC); + if (unlikely(!nfrag)) + return -ENOMEM; + + nfrag-next = frag-next; + frag = nfrag; + *fragp = frag; + } + + if (end len) { + offset = end; + continue; + } + + if (end len + unlikely((err = pskb_trim(frag, len - offset + return err; + + if (frag-next) + skb_drop_list(frag-next); + break; + } + + if (len skb_headlen(skb)) { skb-data_len -= skb-len - len; skb-len = len
Re: [PATCH 08/10] MLSXFRM: Add security context to acquire messages using PF_KEY
On Fri, Jul 14, 2006 at 09:54:59AM -0400, James Morris wrote: Herbert, any review from you on this would be greatly appreciated. OK, I'll try to have a look tomorrow (I'm GMT-4 at the moment). Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] clear skb cb on IP input
David Miller [EMAIL PROTECTED] wrote: Thank goodness this thing is only 3-words in size, this is going to run on every single IPv4 packet received by the system. :-/ At least this lets us get rid of a few other memsets :) [IPV4]: Get rid of redundant IPCB-opts initialisation Now that we always zero the IPCB-opts in ip_rcv, it is no longer necessary to do so before calling netif_rx for tunneled packets. Signed-off-by: Herbert Xu [EMAIL PROTECTED] Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 6ff9b10..0f9b3a3 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -617,7 +617,6 @@ static int ipgre_rcv(struct sk_buff *skb skb-mac.raw = skb-nh.raw; skb-nh.raw = __pskb_pull(skb, offset); skb_postpull_rcsum(skb, skb-h.raw, offset); - memset((IPCB(skb)-opt), 0, sizeof(struct ip_options)); skb-pkt_type = PACKET_HOST; #ifdef CONFIG_NET_IPGRE_BROADCAST if (MULTICAST(iph-daddr)) { diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index cbcae65..406056e 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -256,7 +256,6 @@ int ip_options_compile(struct ip_options if (!opt) { opt = (IPCB(skb)-opt); - memset(opt, 0, sizeof(struct ip_options)); iph = skb-nh.raw; opt-optlen = ((struct iphdr *)iph)-ihl*4 - sizeof(struct iphdr); optptr = iph + sizeof(struct iphdr); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 3291d51..76ab50b 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -487,7 +487,6 @@ static int ipip_rcv(struct sk_buff *skb) skb-mac.raw = skb-nh.raw; skb-nh.raw = skb-data; - memset((IPCB(skb)-opt), 0, sizeof(struct ip_options)); skb-protocol = htons(ETH_P_IP); skb-pkt_type = PACKET_HOST; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index ba33f86..9ccacf5 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1461,7 +1461,6 @@ 
int pim_rcv_v1(struct sk_buff * skb) skb_pull(skb, (u8*)encap - skb-data); skb-nh.iph = (struct iphdr *)skb-data; skb-dev = reg_dev; - memset((IPCB(skb)-opt), 0, sizeof(struct ip_options)); skb-protocol = htons(ETH_P_IP); skb-ip_summed = 0; skb-pkt_type = PACKET_HOST; @@ -1517,7 +1516,6 @@ static int pim_rcv(struct sk_buff * skb) skb_pull(skb, (u8*)encap - skb-data); skb-nh.iph = (struct iphdr *)skb-data; skb-dev = reg_dev; - memset((IPCB(skb)-opt), 0, sizeof(struct ip_options)); skb-protocol = htons(ETH_P_IP); skb-ip_summed = 0; skb-pkt_type = PACKET_HOST; diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index f8d880b..13cafbe 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -92,7 +92,6 @@ static int xfrm4_tunnel_input(struct xfr skb-mac.raw = memmove(skb-data - skb-mac_len, skb-mac.raw, skb-mac_len); skb-nh.raw = skb-data; - memset((IPCB(skb)-opt), 0, sizeof(struct ip_options)); err = 0; out: diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index c56aeec..836eecd 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -380,7 +380,6 @@ static int ipip6_rcv(struct sk_buff *skb secpath_reset(skb); skb-mac.raw = skb-nh.raw; skb-nh.raw = skb-data; - memset((IPCB(skb)-opt), 0, sizeof(struct ip_options)); IPCB(skb)-flags = 0; skb-protocol = htons(ETH_P_IPV6); skb-pkt_type = PACKET_HOST; - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCHv2 2.6.18-rc1-mm2 1/3] net: UDP-Lite generic support
Gerrit Renker [EMAIL PROTECTED] wrote: diff -Nurp a/net/core/sock.c b/net/core/sock.c --- a/net/core/sock.c 2006-07-06 09:08:24.0 +0100 +++ b/net/core/sock.c 2006-07-14 10:17:50.0 +0100 @@ -479,7 +479,12 @@ set_rcvbuf: break; case SO_NO_CHECK: - sk-sk_no_check = valbool; + /* UDP-Lite (RFC 3828) mandates checksumming, +* hence user must not enable this option. */ + if (sk-sk_protocol == IPPROTO_UDPLITE) + ret = -EOPNOTSUPP; + else + sk-sk_no_check = valbool; Please don't add protocol-specific stuff to generic functions. In this case why don't you just ignore sk_no_check for UDPLITE as we do for TCP? Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] clear skb cb on IP input
On Sat, Jul 15, 2006 at 06:12:22PM -0700, David Miller wrote: But I'm beginning to think that the onus of this may in fact fall upon the devices, in fact. Loopback is one of the few devices where the control block might not be cleared out, due to uses in the output path. Devices predominantly provide a zero'd out control block in the skb on packet receive. The thing is qdiscs using cb means that this method of clearing cb before netif_rx doesn't work anymore. In particular, even if loopback clears cb before calling netif_rx, some qdisc could come along between netif_rx and ip_rcv and put stuff in the cb. The same thing can happen to any NIC in fact, as long as we allow qdiscs to use the cb area without clearing it, ip_rcv needs to clear it itself. With a little bit of effort we should be able to get away with clearing just optlen. Whether this effort is worthwhile I don't know :) Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [stable] [NET]: Update frag_list in pskb_trim
On Mon, Jul 17, 2006 at 08:22:44AM -0700, Greg KH wrote: Ick, this doesn't apply to 2.6.17, care to rediff it? I don't trust myself to get it correct :) Oops, I thought I rediffed against 2.6.17, but it must've been something else. Here is a second attempt: [NET]: Update frag_list in pskb_trim When pskb_trim has to defer to ___pskb_trim to trim the frag_list part of the packet, the frag_list is not updated to reflect the trimming. This will usually work fine until you hit something that uses the packet length or tail from the frag_list. Examples include esp_output and ip_fragment. Another problem caused by this is that you can end up with a linear packet with a frag_list attached. It is possible to get away with this if we audit everything to make sure that they always consult skb->len before going down onto frag_list. In fact we can do the same thing for the paged part as well to avoid copying the data area of the skb. For now though, let's do the conservative fix and update frag_list. Many thanks to Marco Berizzi for helping me to track down this bug. This 4-year old bug took 3 months to track down. Marco was very patient indeed :) Signed-off-by: Herbert Xu [EMAIL PROTECTED] Signed-off-by: David S. 
Miller [EMAIL PROTECTED] Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f8f2347..2c31bb0 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -967,15 +967,16 @@ #ifndef NET_SKB_PAD #define NET_SKB_PAD16 #endif -extern int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc); +extern int ___pskb_trim(struct sk_buff *skb, unsigned int len); static inline void __skb_trim(struct sk_buff *skb, unsigned int len) { - if (!skb-data_len) { - skb-len = len; - skb-tail = skb-data + len; - } else - ___pskb_trim(skb, len, 0); + if (unlikely(skb-data_len)) { + WARN_ON(1); + return; + } + skb-len = len; + skb-tail = skb-data + len; } /** @@ -985,6 +986,7 @@ static inline void __skb_trim(struct sk_ * * Cut the length of a buffer down by removing data from the tail. If * the buffer is already under the length specified it is not modified. + * The skb must be linear. 
*/ static inline void skb_trim(struct sk_buff *skb, unsigned int len) { @@ -995,12 +997,10 @@ static inline void skb_trim(struct sk_bu static inline int __pskb_trim(struct sk_buff *skb, unsigned int len) { - if (!skb-data_len) { - skb-len = len; - skb-tail = skb-data+len; - return 0; - } - return ___pskb_trim(skb, len, 1); + if (skb-data_len) + return ___pskb_trim(skb, len); + __skb_trim(skb, len); + return 0; } static inline int pskb_trim(struct sk_buff *skb, unsigned int len) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fb3770f..40f108e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -250,11 +250,11 @@ nodata: } -static void skb_drop_fraglist(struct sk_buff *skb) +static void skb_drop_list(struct sk_buff **listp) { - struct sk_buff *list = skb_shinfo(skb)-frag_list; + struct sk_buff *list = *listp; - skb_shinfo(skb)-frag_list = NULL; + *listp = NULL; do { struct sk_buff *this = list; @@ -263,6 +263,11 @@ static void skb_drop_fraglist(struct sk_ } while (list); } +static inline void skb_drop_fraglist(struct sk_buff *skb) +{ + skb_drop_list(skb_shinfo(skb)-frag_list); +} + static void skb_clone_fraglist(struct sk_buff *skb) { struct sk_buff *list; @@ -800,49 +805,80 @@ struct sk_buff *skb_pad(struct sk_buff * return nskb; } -/* Trims skb to length len. It can change skb pointers, if realloc is 1. - * If realloc==0 and trimming is impossible without change of data, - * it is BUG(). +/* Trims skb to length len. It can change skb pointers. 
*/ -int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc) +int ___pskb_trim(struct sk_buff *skb, unsigned int len) { + struct sk_buff **fragp; + struct sk_buff *frag; int offset = skb_headlen(skb); int nfrags = skb_shinfo(skb)-nr_frags; int i; + int err; + + if (skb_cloned(skb) + unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC + return err; for (i = 0; i nfrags; i++) { int end = offset + skb_shinfo(skb)-frags[i].size; - if (end len) { - if (skb_cloned(skb)) { - BUG_ON(!realloc); - if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - return
Re: [PATCH] Bug in pskb_trim_rcsum()
On Tue, Jul 18, 2006 at 09:09:34AM +0800, Wei Yongjun wrote: And in my test, UDP under IPv4 maybe do that. My UDP packet is: packet1: ___ | Source Port | Dest Port | |_|_| | Length = 16 | Checksum(*1) | |_|_| | payload24 | |__| The whole point of CHECKSUM_UNNECESSARY is that the hardware parses the protocol header for us. So in this case it must calculate the checksum for only the first 8 bytes of the payload. If it does this incorrectly, then it doesn't support RX checksums at all. Which NIC is doing this BTW? Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Bug in pskb_trim_rcsum()
On Tue, Jul 18, 2006 at 04:54:39PM +0400, Alexey Kuznetsov wrote: I preferred optimistic approach: if the checksum comes out correct, we do not really care, how device calculated it. Probably, it calculated checksum over wrong data, but got a good checksum. So what? It is not a crypto digest yet. And if device found wrong checksum, we will recalculate it anyway. Agreed. I would like to add that CHECKSUM_UNNECESSARY can be used, when checksum is really wrong (on loopback), that's why it is not cleared, when trimming. CHECKSUM_HW can always fall back to CHECKSUM_NONE, but CHECKSUM_UNNECESSARY cannot. Probably, this was bad idea, but it still means that if some generic function starts to clear it, all the code using it should be reverified. Actually, I plan to differentiate between RX CHECKSUM_HW and TX CHECKSUM_HW. Now that we have things like Xen it is possible for RX packets to have partial checksums too. When this is done loopback can send TX CHECKSUM_HW packets instead of CHECKSUM_UNNECESSARY (I'm currently calling this CHECKSUM_PARTIAL). Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 32/33] Add the Xen virtual network device driver.
jamal [EMAIL PROTECTED] wrote: I dont think the ifup/ifconfig provide operational status (i.e link up/down) - or do they? If they can be made to invoke scripts in such a case then we are set. In fact, that's a very good reason why this shouldn't be in netfront. Indeed, it shouldn't be in the guest at all. The reason is that the guest has no idea whether the physical carrier is present. It's much better for the host to send the ARP packet on behalf of the guest since the host knows the carrier status and the guest's MAC address. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 32/33] Add the Xen virtual network device driver.
John Haller [EMAIL PROTECTED] wrote: But sending ARPs is not the right thing if the guest is expecting to use IPv6 networking, in which case unsolicited neighbor advertisements are the right thing to do. The driver just doesn't seem to be the right place to do this, as it doesn't/ shouldn't need to know the difference between IPv4/IPv6. In this case it doesn't really matter because AFAIK they're trying to get switches to notice that the MAC has moved. So all you need is some packet that the switches can grok. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] clear skb cb on IP input
On Tue, Jul 18, 2006 at 08:19:34PM +0200, Guillaume Chazarain wrote: Why not clearing the whole IPCB(skb) instead of just IPCB(skb)-opts? that would also clear IPCB(skb)-flags. I agree, we should clear the whole IPCB. And, does not ipv6 need the same treatment with IP6CB? Probably. Patches are welcome :) Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 32/33] Add the Xen virtual network device driver.
Stephen Hemminger [EMAIL PROTECTED] wrote: diff -r eadc12b20f35 drivers/xen/netfront/netfront.c --- /dev/null Thu Jan 01 00:00:00 1970 + +++ b/drivers/xen/netfront/netfront.c Fri Jun 09 15:03:12 2006 -0400 @@ -0,0 +1,1584 @@ +static inline void init_skb_shinfo(struct sk_buff *skb) +{ + atomic_set((skb_shinfo(skb)-dataref), 1); + skb_shinfo(skb)-nr_frags = 0; + skb_shinfo(skb)-frag_list = NULL; +} Shouldn't this move to skbuff.h? If and when my dom0=domU GSO patches are applied, this will simply disappear. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: michael_mic in crypto api?
Michael Wu [EMAIL PROTECTED] wrote: Simplicity and consistency. Whereas the relatively simple mic part of the TKIP algorithm is in crypto API, the (more important, more complicated) key mixing part is not in crypto api. It is unlikely that either the mic or key mixing part would be used separately or even outside of TKIP/802.11i code, and we don't want to encourage people anyways since they're just bandaids for problems associated with using rc4. Sure, I don't mind either way. I think Jouni wrote this originally, maybe he can share his thoughts with us? Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: michael_mic in crypto api?
Jouni Malinen [EMAIL PROTECTED] wrote: However, at least for some time, there are two different TKIP implementations (net/ieee80211 and net/d80211) so this would mean duplicating Michael MIC implementation and I would rather not do that. Good point, let's keep it for now. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: A question about linux/net/ipv4/ipcomp.c
Igor V. Liferenko [EMAIL PROTECTED] wrote: Would you please say why it's 60, and not 52? The header length / 4 must fit within a single hexadecimal digit. Therefore the maximum is 15 * 4 = 60. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [XFRM]: Fix protocol field value for outgoing IPv6 GSO packets
On Tue, Jul 25, 2006 at 02:09:26AM +0200, Patrick McHardy wrote: This appears to be a mistake, but I didn't follow the GSO stuff very closely, so there could be some non-obvious reason. Yes it definitely was a mistake! Thanks for picking this up Patrick. [XFRM]: Fix protocol field value for outgoing IPv6 GSO packets Signed-off-by: Patrick McHardy [EMAIL PROTECTED] Acked-by: Herbert Xu [EMAIL PROTECTED] Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] ip multicast route bug fix
Stephen Hemminger [EMAIL PROTECTED] wrote: @@ -1593,12 +1594,19 @@ int ipmr_get_route(struct sk_buff *skb, read_unlock(mrt_lock); return -ENODEV; } - skb-nh.raw = skb_push(skb, sizeof(struct iphdr)); - skb-nh.iph-ihl = sizeof(struct iphdr)2; - skb-nh.iph-saddr = rt-rt_src; - skb-nh.iph-daddr = rt-rt_dst; - skb-nh.iph-version = 0; - err = ipmr_cache_unresolved(vif, skb); + + iskb = alloc_skb(sizeof(struct iphdr), GFP_KERNEL); + if (!iskb) { + read_unlock(mrt_lock); + return -ENOMEM; + } + memset(iskb-data, 0, sizeof(struct iphdr)); + iskb-nh.raw = iskb-data; + iskb-nh.iph-ihl = sizeof(struct iphdr)2; + iskb-nh.iph-saddr = rt-rt_src; + iskb-nh.iph-daddr = rt-rt_dst; + + err = ipmr_cache_unresolved(vif, iskb); I'm afraid this is still broken in a different way. If ipmr_cache_unresolved queues the skb onto the unresolved list then it's going to try to use the skb as a netlink skb instead :) Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [IPROUTE]: Add support for multipath route realms
Patrick McHardy [EMAIL PROTECTED] wrote: [IPROUTE]: Add support for multipath route realms Routing realms exist per nexthop, but iproute currently only allows to send a single route realm, which is refused by the kernel for multipath routes. Add support for specifying per nexthop realms. Old kernels only return the first realm back to userspace when dumping, so the others can't be displayed, besides that it will also behave correctly on old kernels. old kernel: 1.2.3.4 realm 1 nexthop dev dummy0 weight 1 nexthop dev dummy1 weight 1 nexthop dev dummy2 weight 1 nexthop dev dummy3 weight 1 new kernel: 1.2.3.4 nexthop realm 1 dev dummy0 weight 1 nexthop realm 2 dev dummy1 weight 1 nexthop realm 3 dev dummy2 weight 1 nexthop realm 4 dev dummy3 weight 1 This really looks like papering over fundamental brokenness of IP_ROUTE_MULTIPATH_CACHED since you wouldn't otherwise get these entries in the routing cache. This reminds me that I better revisit the reasons that people gave for actually using IP_ROUTE_MULTIPATH_CACHED the last time we tried to get rid of it. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [IPROUTE]: Add support for multipath route realms
On Tue, Jul 25, 2006 at 06:19:33PM +1000, Herbert Xu wrote: new kernel: 1.2.3.4 nexthop realm 1 dev dummy0 weight 1 nexthop realm 2 dev dummy1 weight 1 nexthop realm 3 dev dummy2 weight 1 nexthop realm 4 dev dummy3 weight 1 This really looks like papering over fundamental brokenness of IP_ROUTE_MULTIPATH_CACHED since you wouldn't otherwise get these entries in the routing cache. Nevermind, I misread your changelog. Your patch is obviously not related to IP_ROUTE_MULTIPATH_CACHED :) -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] ip multicast route bug fix
Alexey Kuznetsov [EMAIL PROTECTED] wrote: I think you mean this. Note, it is real skb_clone(), not alloc_skb(). Enqueued skb contains the whole half-prepared netlink message plus room for the rest. It could be also skb_copy(), if we want to be puristic about mangling cloned data, but original copy is really not going to be used. I like this. However, since the cloned skb is either discarded in case of error, or queued in which case the caller discards its reference right away, wouldn't it be simpler to just do this? Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index ba33f86..0a2af08 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1593,6 +1593,7 @@ int ipmr_get_route(struct sk_buff *skb, read_unlock(mrt_lock); return -ENODEV; } + skb_get(skb); skb-nh.raw = skb_push(skb, sizeof(struct iphdr)); skb-nh.iph-ihl = sizeof(struct iphdr)2; skb-nh.iph-saddr = rt-rt_src; - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: BUGs in skb_checksum_help() and skb_gso_segment() in 2.6.18-rc2
Hi Patrick: On Wed, Jul 26, 2006 at 05:38:07AM +0200, Patrick McHardy wrote: I have a patch which changes netfilter to do incremental checksumming. The hook number is passed to all functions doing this so they know how to update the checksum. Could you explain how CHECKSUM_COMPLETE/CHECKSUM_PARTIAL are going to be used? I assume they're meant to avoid passing hook numbers around everywhere? Yes the hook number is another way to solve the same problem. However, it can only be used within netfilter. CHECKSUM_COMPLETE/CHECKSUM_PARTIAL on the other hand are valid throughout the stack. With Xen feeding Linux packets into the stack the netfilter hook is also no longer sufficient to distinguish between these two cases as partial checksum packets can now appear on receive. The problem is that you need to do different incremental updates depending on whether the checksum is complete (i.e., CHECKSUM_HW on receive), or partial (i.e., CHECKSUM_HW on transmit). With complete checksums the current update code in netfilter can be used as is. With partial checksums you need to exclude bits which weren't used when computing the partial checksums (e.g., TCP port numbers need to be excluded, but the IP address needs to be included for NAT). I have a patch that adds CHECKSUM_COMPLETE/CHECKSUM_PARTIAL if you want something to work from. Let me know if you want this and I'll bounce it to you. Thanks, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH Round 4 2/3] Core network changes to support network event notification.
On Tue, Jul 25, 2006 at 10:05:40AM -0500, Steve Wise wrote: But they really are seeing a delete followed by an add. That's what the kernel is doing. Actually that's the other thing I don't really like. The user-space monitor may perceive that a route was actually deleted and replaced by a new one even though this isn't what's happening at all. In fact the problem here is that you're sending route notifications when it's really the dst_entry that's changing. User-space as it stands only gets notifications about fib changes which is quite different from changes to the transient dst_entry objects which only exist in the route cache. Is anyone actually going to use the user-space interface of this? If not perhaps we should wait until someone really needs it before adding the netlink part of the patch. We can change the kernel interface at will so if we make a mistake with netevent it can be easily corrected. For user-space though the rules are totally different. I'd really hate to be stuck with an interface which turns out to not be the one that people actually want to have. The rdma driver needs to update all established rdma connections that are using the next-hop information of the existing route and make them use the next-hop information of the new route. In addition, the rdma driver might have a reference to the old dst entry. So it can release that ref and add a ref to the new dst entry. Do you really need the old route for the user-space part of your patch? I have to admit I'm a little fuzzy on the routing stuff. The main netevents I've utilized in the rdma driver I'm writing is the neighbour update event and the redirect event. Route add/del was added for completeness of routing netevents. So you mean you aren't going to use the route notifications? In that case we should probably just drop them and add them when someone actually needs it. 
At that point they can tell us what semantics they want from it :) Can you expand further or point me to code where the IP stack flushes its tables when routes are changed? Grep for rt_cache_flush in net/ipv4/fib_hash.c. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: BUGs in skb_checksum_help() and skb_gso_segment() in 2.6.18-rc2
On Wed, Jul 26, 2006 at 06:01:40AM +0200, Patrick McHardy wrote: Please send it, I'll update my patch based on that. Thanks. Here it is, it sits on top of commit ca6bb5d7ab22ac79f608fe6cbc6b12de6a5a19f0 Author: David Woodhouse [EMAIL PROTECTED] Date: Thu Jun 22 16:07:52 2006 -0700 [NET]: Require CAP_NET_ADMIN to create tuntap devices. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- 61a015eb86469404587e910e9b852fc35ce436b8 diff --git a/drivers/atm/he.c b/drivers/atm/he.c index fde9334..601e7ee 100644 --- a/drivers/atm/he.c +++ b/drivers/atm/he.c @@ -1913,7 +1913,7 @@ #endif skb-tail = skb-data + skb-len; #ifdef USE_CHECKSUM_HW if (vcc-vpi == 0 vcc-vci = ATM_NOT_RSV_VCI) { - skb-ip_summed = CHECKSUM_HW; + skb-ip_summed = CHECKSUM_COMPLETE; skb-csum = TCP_CKSUM(skb-data, he_vcc-pdu_len); } diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c index e277789..15dcd4e 100644 --- a/drivers/net/3c59x.c +++ b/drivers/net/3c59x.c @@ -2243,7 +2243,7 @@ boomerang_start_xmit(struct sk_buff *skb vp-tx_ring[entry].next = 0; #if DO_ZEROCOPY - if (skb-ip_summed != CHECKSUM_HW) + if (skb-ip_summed != CHECKSUM_PARTIAL) vp-tx_ring[entry].status = cpu_to_le32(skb-len | TxIntrUploaded); else vp-tx_ring[entry].status = cpu_to_le32(skb-len | TxIntrUploaded | AddTCPChksum | AddUDPChksum); diff --git a/drivers/net/8139cp.c b/drivers/net/8139cp.c index ad0c8c3..4f566d8 100644 --- a/drivers/net/8139cp.c +++ b/drivers/net/8139cp.c @@ -809,7 +809,7 @@ #endif if (mss) flags |= LargeSend | ((mss MSSMask) MSSShift); - else if (skb-ip_summed == CHECKSUM_HW) { + else if (skb-ip_summed == CHECKSUM_PARTIAL) { const struct iphdr *ip = skb-nh.iph; if (ip-protocol == IPPROTO_TCP) flags |= IPCS | TCPCS; @@ -863,7 +863,7 @@ #endif if (mss) ctrl |= LargeSend | ((mss MSSMask) MSSShift); - else if (skb-ip_summed == CHECKSUM_HW) { + else if 
(skb-ip_summed == CHECKSUM_PARTIAL) { if (ip-protocol == IPPROTO_TCP) ctrl |= IPCS | TCPCS; else if (ip-protocol == IPPROTO_UDP) @@ -894,7 +894,7 @@ #endif txd-addr = cpu_to_le64(first_mapping); wmb(); - if (skb-ip_summed == CHECKSUM_HW) { + if (skb-ip_summed == CHECKSUM_PARTIAL) { if (ip-protocol == IPPROTO_TCP) txd-opts1 = cpu_to_le32(first_eor | first_len | FirstFrag | DescOwn | diff --git a/drivers/net/acenic.c b/drivers/net/acenic.c index 23ff22b..3ab0e76 100644 --- a/drivers/net/acenic.c +++ b/drivers/net/acenic.c @@ -2041,7 +2041,7 @@ static void ace_rx_int(struct net_device */ if (bd_flags BD_FLG_TCP_UDP_SUM) { skb-csum = htons(csum); - skb-ip_summed = CHECKSUM_HW; + skb-ip_summed = CHECKSUM_COMPLETE; } else { skb-ip_summed = CHECKSUM_NONE; } @@ -2512,7 +2512,7 @@ restart: mapping = ace_map_tx_skb(ap, skb, skb, idx); flagsize = (skb-len 16) | (BD_FLG_END); - if (skb-ip_summed == CHECKSUM_HW) + if (skb-ip_summed == CHECKSUM_PARTIAL) flagsize |= BD_FLG_TCP_UDP_SUM; #if ACENIC_DO_VLAN if (vlan_tx_tag_present(skb)) { @@ -2535,7 +2535,7 @@ #endif mapping = ace_map_tx_skb(ap, skb, NULL, idx); flagsize = (skb_headlen(skb) 16); - if (skb-ip_summed == CHECKSUM_HW) + if (skb-ip_summed == CHECKSUM_PARTIAL) flagsize |= BD_FLG_TCP_UDP_SUM; #if ACENIC_DO_VLAN if (vlan_tx_tag_present(skb)) { @@ -2561,7 +2561,7 @@ #endif PCI_DMA_TODEVICE); flagsize = (frag-size 16); - if (skb-ip_summed == CHECKSUM_HW) + if (skb-ip_summed == CHECKSUM_PARTIAL) flagsize |= BD_FLG_TCP_UDP_SUM; idx = (idx + 1) % ACE_TX_RING_ENTRIES(ap
Re: ipsec tunnel policy vs routing table
Marco Berizzi [EMAIL PROTECTED] wrote: 172.16.0.0/23 dev eth2 proto kernel scope link src 172.16.1.1 10.180.0.0/16 via 172.16.1.253 dev eth2 10.0.0.0/8 via pub_ip dev eth0 127.0.0.0/8 dev lo scope link I have noticed that packets for 10.180.0.0/16 network are eaten by the ipsec tunnel because the policy allow them. Is there a way to deliver packets for 10.180.0.0 network to the 172.16.1.253 router (because the route to 10.180.0.0 is more specific than 10.0.0.0/8)? You need an IPsec pass action. With Openswan you can do it with something like conn pass left=%defaultroute # This should be the leftsubnet of your 10.0.0.0/8 connection. leftsubnet=0.0.0.0/0 # This field doesn't really matter. right=172.16.1.253 rightsubnet=10.180.0.0/16 type=passthrough authby=never auto=route Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: ipsec tunnel policy vs routing table
On Thu, Jul 27, 2006 at 04:06:44PM +0200, Marco Berizzi wrote: conn pass left=172.16.1.1 leftsubnet=172.16.0.0/23 right=172.16.1.253 rightsubnet=10.180.0./16 type=passthrough authby=never auto=route After running 'ipsec auto --add pass ipsec auto --route pass' openswan has eaten my static route inserted by hand: route add -net 10.180.0.0/16 gw 172.16.1.253 Here is 'ip r s' output after 'ipsec auto --route pass': 172.16.0.0/23 dev eth2 proto kernel scope link src 172.16.1.1 10.180.0.0/16 dev eth2 scope link Oh yeah, forgot about that :) You can set the gateway using rightnexthop=172.16.1.253. All is fine now. It isn't even needed anymore to insert the static route now, as it is placed by openswan. My question is how linux understand that it should send packets for 10.180.0.0/24 to the 172.16.1.253 router. It doesn't really. However your router might be proxy arping. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Regarding offloading IPv6 addrconf and ndisc
Kazunori Miyazawa [EMAIL PROTECTED] wrote: I'm interested in the approach. And I have a couple of comments. I think DAD and ND are time critical operations. Can the daemons process with confirming to the specs. even if it were swapped out? Can we prevent the oom killer from killing the daemons? These are valid concerns. However, if we can have things like ntpd live in user-space without causing nuisance, then addrconf should be fine as well. Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Regarding offloading IPv6 addrconf and ndisc
On Thu, Jul 27, 2006 at 06:34:15PM -0700, David Miller wrote: I have severe doubts actually in this area. And I have practical experience to back up these doubts in this specific case. OK. Just moving the ipv6 address add/delete out of software interrupt context broke the TAHI and other ipv6 testsuites. The reason was simple. Consider a simple test case that emits an NDISC packet that should cause an interface address to be added, and then it sends a packet which makes sure that host responds to that address. We have those two packets in our queue, as packet A and B. I'd like to know more about this test. On the face of it this test seems to be broken. What if packet A was lost? Surely this shouldn't be used as an indication that the target IPv6 stack is out-of-spec. If we're really going to guarantee that NDISC processing is always going to be synchronous, this imposes fairly nasty restrictions on what we can do in future. For instance, this would rule out having the NIC distribute flows across CPUs as this would break the synchronicity of NDISC processing vs. TCP processing. As a secondary reason not to even consider this, it's in the kernel already and therefore it is totally impractical to try and remove it. When considering new protocols or features, the user vs. kernel argument is something to validly consider. But when it's already there, it will have to live there basically for eternity. It is not like some arbitrary internal kernel module symbol or interface we can deprecate over a 6 month period or something like that. Fair enough. I suppose another case in point is IPv4 autoconf which is *still* in the kernel after all these years. However, to draw an analogy we're kind of stuck in a bog here. So while we can't extricate ourselves easily, we should attempt to come up with ways of eventually lifting us out. 
We should also try to avoid any actions that'll cause us to sink deeper :) Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[IPV6]: Audit all ip6_dst_lookup/ip6_dst_store calls
On Fri, Jul 28, 2006 at 07:45:31PM +, Matt Domsch wrote: Triggered on Fedora rawhide kernel-2.6.17-1.2462.fc6 x86_64 which is based on 2.6.18rc2-git6. IPv6 was in use at the time. = [ INFO: inconsistent lock state ] - inconsistent {softirq-on-W} - {in-softirq-R} usage. swapper/0 [HC0[0]:SC1[1]:HE1:SE0] takes: (sk-sk_dst_lock){---?}, at: [80418ef3] sk_dst_check+0x26/0x12b {softirq-on-W} state was registered at: [802a874d] lock_acquire+0x4a/0x69 [802672a1] _write_lock+0x24/0x31 [8044a26b] ip4_datagram_connect+0x2e1/0x350 [80451214] inet_dgram_connect+0x57/0x65 [8041652a] sys_connect+0x7d/0xa4 [8025ff0d] system_call+0x7d/0x83 Thanks for the report. This is actually a false positive because these two paths can't intersect since one is a UDP while the other is TCP. However, here is a patch which should shut up the validator as well as removing unnecessary locking from most callers of ip6_dst_lookup. [IPV6]: Audit all ip6_dst_lookup/ip6_dst_store calls The current users of ip6_dst_lookup can be divided into two classes: 1) The caller holds no locks and is in user-context (UDP). 2) The caller does not want to lookup the dst cache at all. The second class covers everyone except UDP because most people do the cache lookup directly before calling ip6_dst_lookup. This patch adds ip6_sk_dst_lookup for the first class. Similarly ip6_dst_store users can be divided into those that need to take the socket dst lock and those that don't. This patch adds __ip6_dst_store for those (everyone except UDP/datagram) that don't need an extra lock. 
Signed-off-by: Herbert Xu [EMAIL PROTECTED] Cheers, -- Visit Openswan at http://www.openswan.org/ Email: Herbert Xu ~{PmVHI~} [EMAIL PROTECTED] Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt -- diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index ab29daf..96b0e66 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -139,16 +139,22 @@ extern rwlock_t rt6_lock; /* * Store a destination cache entry in a socket */ -static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, -struct in6_addr *daddr) +static inline void __ip6_dst_store(struct sock *sk, struct dst_entry *dst, + struct in6_addr *daddr) { struct ipv6_pinfo *np = inet6_sk(sk); struct rt6_info *rt = (struct rt6_info *) dst; - write_lock(sk-sk_dst_lock); sk_setup_caps(sk, dst); np-daddr_cache = daddr; np-dst_cookie = rt-rt6i_node ? rt-rt6i_node-fn_sernum : 0; +} + +static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, +struct in6_addr *daddr) +{ + write_lock(sk-sk_dst_lock); + __ip6_dst_store(sk, dst, daddr); write_unlock(sk-sk_dst_lock); } diff --git a/include/net/ipv6.h b/include/net/ipv6.h index a8fdf79..ece7e8a 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -468,6 +468,9 @@ extern void ip6_flush_pending_frames(s extern int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl); +extern int ip6_sk_dst_lookup(struct sock *sk, + struct dst_entry **dst, + struct flowi *fl); /* * skb processing functions diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 9f3d4d7..610c722 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -230,7 +230,7 @@ static int dccp_v6_connect(struct sock * ipv6_addr_copy(np-saddr, saddr); inet-rcv_saddr = LOOPBACK4_IPV6; - ip6_dst_store(sk, dst, NULL); + __ip6_dst_store(sk, dst, NULL); icsk-icsk_ext_hdr_len = 0; if (np-opt != NULL) @@ -863,7 +863,7 @@ static struct sock *dccp_v6_request_recv * comment in that function for the gory 
details. -acme */ - ip6_dst_store(newsk, dst, NULL); + __ip6_dst_store(newsk, dst, NULL); newsk-sk_route_caps = dst-dev-features ~(NETIF_F_IP_CSUM | NETIF_F_TSO); newdp6 = (struct dccp6_sock *)newsk; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 5a0ba58..ac85e9c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -658,7 +658,7 @@ int inet6_sk_rebuild_header(struct sock return err; } - ip6_dst_store(sk, dst, NULL); + __ip6_dst_store(sk, dst, NULL); } return 0; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 5c950cc..bf49107 100644