Please queue up the following networking bug fixes for 2.6.37-stable Thanks!
>From 0a45581901aa642be8a4a1af6acada3dd2696fb8 Mon Sep 17 00:00:00 2001 From: David S. Miller <[email protected]> Date: Wed, 5 Jan 2011 15:38:53 -0800 Subject: [PATCH 1/5] af_unix: Avoid socket->sk NULL OOPS in stream connect security hooks.
[ Upstream commit 3610cda53f247e176bcbb7a7cca64bc53b12acdb ] unix_release() can asynchornously set socket->sk to NULL, and it does so without holding the unix_state_lock() on "other" during stream connects. However, the reverse mapping, sk->sk_socket, is only transitioned to NULL under the unix_state_lock(). Therefore make the security hooks follow the reverse mapping instead of the forward mapping. Reported-by: Jeremy Fitzhardinge <[email protected]> Reported-by: Linus Torvalds <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- include/linux/security.h | 15 +++++++-------- net/unix/af_unix.c | 2 +- security/capability.c | 2 +- security/security.c | 3 +-- security/selinux/hooks.c | 10 +++++----- security/smack/smack_lsm.c | 14 +++++++------- 6 files changed, 22 insertions(+), 24 deletions(-) diff --git a/include/linux/security.h b/include/linux/security.h index fd4d55f..d47a4c2 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -796,8 +796,9 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @unix_stream_connect: * Check permissions before establishing a Unix domain stream connection * between @sock and @other. - * @sock contains the socket structure. - * @other contains the peer socket structure. + * @sock contains the sock structure. + * @other contains the peer sock structure. + * @newsk contains the new sock structure. * Return 0 if permission is granted. * @unix_may_send: * Check permissions before connecting or sending datagrams from @sock to @@ -1568,8 +1569,7 @@ struct security_operations { int (*inode_getsecctx)(struct inode *inode, void **ctx, u32 *ctxlen); #ifdef CONFIG_SECURITY_NETWORK - int (*unix_stream_connect) (struct socket *sock, - struct socket *other, struct sock *newsk); + int (*unix_stream_connect) (struct sock *sock, struct sock *other, struct sock *newsk); int (*unix_may_send) (struct socket *sock, struct socket *other); int (*socket_create) (int family, int type, int protocol, int kern); @@ -2525,8 +2525,7 @@ static inline int security_inode_getsecctx(struct inode *inode, void **ctx, u32 #ifdef CONFIG_SECURITY_NETWORK -int security_unix_stream_connect(struct socket *sock, struct socket *other, - struct sock *newsk); +int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk); int security_unix_may_send(struct socket *sock, struct socket *other); int security_socket_create(int family, int type, int protocol, int kern); int security_socket_post_create(struct socket *sock, int family, @@ -2567,8 +2566,8 @@ void security_tun_dev_post_create(struct sock *sk); int security_tun_dev_attach(struct sock *sk); #else /* CONFIG_SECURITY_NETWORK */ -static inline int security_unix_stream_connect(struct socket *sock, - struct socket *other, +static inline int security_unix_stream_connect(struct sock *sock, + struct sock *other, struct sock *newsk) { return 0; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 2268e67..759bbcb 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1156,7 +1156,7 @@ restart: goto restart; } - err = security_unix_stream_connect(sock, other->sk_socket, newsk); + err = security_unix_stream_connect(sk, other, newsk); if (err) { unix_state_unlock(sk); goto out_unlock; diff --git a/security/capability.c b/security/capability.c index c773635..2a5df2b 100644 --- a/security/capability.c +++ b/security/capability.c @@ -548,7 +548,7 @@ static int cap_sem_semop(struct sem_array *sma, struct sembuf *sops, } #ifdef CONFIG_SECURITY_NETWORK -static int cap_unix_stream_connect(struct socket *sock, struct socket *other, +static int cap_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk) { return 0; diff --git a/security/security.c b/security/security.c index 1b798d3..e5fb07a 100644 --- a/security/security.c +++ b/security/security.c @@ -977,8 +977,7 @@ EXPORT_SYMBOL(security_inode_getsecctx); #ifdef CONFIG_SECURITY_NETWORK -int security_unix_stream_connect(struct socket *sock, struct socket *other, - struct sock *newsk) +int security_unix_stream_connect(struct sock *sock, struct sock *other, struct sock *newsk) { return security_ops->unix_stream_connect(sock, other, newsk); } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 65fa8bf..fe8188c 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3921,18 +3921,18 @@ static int selinux_socket_shutdown(struct socket *sock, int how) return sock_has_perm(current, sock->sk, SOCKET__SHUTDOWN); } -static int selinux_socket_unix_stream_connect(struct socket *sock, - struct socket *other, +static int selinux_socket_unix_stream_connect(struct sock *sock, + struct sock *other, struct sock *newsk) { - struct sk_security_struct *sksec_sock = sock->sk->sk_security; - struct sk_security_struct *sksec_other = other->sk->sk_security; + struct sk_security_struct *sksec_sock = sock->sk_security; + struct sk_security_struct *sksec_other = other->sk_security; struct sk_security_struct *sksec_new = newsk->sk_security; struct common_audit_data ad; int err; COMMON_AUDIT_DATA_INIT(&ad, NET); - ad.u.net.sk = other->sk; + ad.u.net.sk = other; err = avc_has_perm(sksec_sock->sid, sksec_other->sid, sksec_other->sclass, diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 489a85a..ccb71a0 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -2408,22 +2408,22 @@ static int smack_setprocattr(struct task_struct *p, char *name, /** * smack_unix_stream_connect - Smack access on UDS - * @sock: one socket - * @other: the other socket + * @sock: one sock + * @other: the other sock * @newsk: unused * * Return 0 if a subject with the smack of sock could access * an object with the smack of other, otherwise an error code */ -static int smack_unix_stream_connect(struct socket *sock, - struct socket *other, struct sock *newsk) +static int smack_unix_stream_connect(struct sock *sock, + struct sock *other, struct sock *newsk) { - struct inode *sp = SOCK_INODE(sock); - struct inode *op = SOCK_INODE(other); + struct inode *sp = SOCK_INODE(sock->sk_socket); + struct inode *op = SOCK_INODE(other->sk_socket); struct smk_audit_info ad; smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET); - smk_ad_setfield_u_net_sk(&ad, other->sk); + smk_ad_setfield_u_net_sk(&ad, other); return smk_access(smk_of_inode(sp), smk_of_inode(op), MAY_READWRITE, &ad); } -- 1.7.3.4 >From dc36df3b8b791ad6d668df5fe5533cb9776ac88a Mon Sep 17 00:00:00 2001 From: Alexey Kuznetsov <[email protected]> Date: Wed, 12 Jan 2011 08:34:08 +0000 Subject: [PATCH 2/5] inet6: prevent network storms caused by linux IPv6 routers [ Upstream commit 72b43d0898e97f588293b4a24b33c58c46633d81 ] Linux IPv6 forwards unicast packets, which are link layer multicasts... The hole was present since day one. I was 100% this check is there, but it is not. The problem shows itself, f.e. when Microsoft Network Load Balancer runs on a network. This software resolves IPv6 unicast addresses to multicast MAC addresses. Signed-off-by: Alexey Kuznetsov <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- net/ipv6/ip6_output.c | 3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 94b5bf1..5f8d242 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -401,6 +401,9 @@ int ip6_forward(struct sk_buff *skb) goto drop; } + if (skb->pkt_type != PACKET_HOST) + goto drop; + skb_forward_csum(skb); /* -- 1.7.3.4 >From 4429b20f7c404b6e9b1926931eb3065e346c12fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet <[email protected]> Date: Thu, 6 Jan 2011 10:54:29 -0800 Subject: [PATCH 3/5] net: add POLLPRI to sock_def_readable() [ Upstream commit 2c6607c611cb7bf0a6750bcea34a258144e302c5 ] Leonardo Chiquitto found poll() could block forever on tcp sockets and Urgent data was received, if the event flag only contains POLLPRI. He did a bisection and found commit 4938d7e0233 (poll: avoid extra wakeups in select/poll) was the source of the problem. Problem is TCP sockets use standard sock_def_readable() function for their sk_data_ready() handler, and sock_def_readable() doesnt signal POLLPRI. Only TCP is affected by the problem. Adding POLLPRI to the list of flags might trigger unnecessary schedules, but URGENT handling is such a seldom used feature this seems a good compromise. Thanks a lot to Leonardo for providing the bisection result and a test program as well. Reference : http://www.spinics.net/lists/netdev/msg151793.html Reported-and-bisected-by: Leonardo Chiquitto <[email protected]> Signed-off-by: Eric Dumazet <[email protected]> Tested-by: Eric Dumazet <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- net/core/sock.c | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index e5af8d5..7fd3541 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1907,7 +1907,7 @@ static void sock_def_readable(struct sock *sk, int len) rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (wq_has_sleeper(wq)) - wake_up_interruptible_sync_poll(&wq->wait, POLLIN | + wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); -- 1.7.3.4 >From b3fe7bc4469f681a6b2454324120bb58a01ae5b9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet <[email protected]> Date: Wed, 5 Jan 2011 07:52:55 +0000 Subject: [PATCH 4/5] ipv4: IP defragmentation must be ECN aware [ Upstream commit 6623e3b24a5ebb07e81648c478d286a1329ab891 ] RFC3168 (The Addition of Explicit Congestion Notification to IP) states : 5.3. Fragmentation ECN-capable packets MAY have the DF (Don't Fragment) bit set. Reassembly of a fragmented packet MUST NOT lose indications of congestion. In other words, if any fragment of an IP packet to be reassembled has the CE codepoint set, then one of two actions MUST be taken: * Set the CE codepoint on the reassembled packet. However, this MUST NOT occur if any of the other fragments contributing to this reassembly carries the Not-ECT codepoint. * The packet is dropped, instead of being reassembled, for any other reason. This patch implements this requirement for IPv4, choosing the first action : If one fragment had NO-ECT codepoint reassembled frame has NO-ECT ElIf one fragment had CE codepoint reassembled frame has CE Signed-off-by: Eric Dumazet <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- net/ipv4/ip_fragment.c | 34 ++++++++++++++++++++++++++++++++++ 1 files changed, 34 insertions(+), 0 deletions(-) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 1684408..fb9b94a 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -45,6 +45,7 @@ #include <linux/udp.h> #include <linux/inet.h> #include <linux/netfilter_ipv4.h> +#include <net/inet_ecn.h> /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c @@ -70,11 +71,28 @@ struct ipq { __be32 daddr; __be16 id; u8 protocol; + u8 ecn; /* RFC3168 support */ int iif; unsigned int rid; struct inet_peer *peer; }; +#define IPFRAG_ECN_CLEAR 0x01 /* one frag had INET_ECN_NOT_ECT */ +#define IPFRAG_ECN_SET_CE 0x04 /* one frag had INET_ECN_CE */ + +static inline u8 ip4_frag_ecn(u8 tos) +{ + tos = (tos & INET_ECN_MASK) + 1; + /* + * After the last operation we have (in binary): + * INET_ECN_NOT_ECT => 001 + * INET_ECN_ECT_1 => 010 + * INET_ECN_ECT_0 => 011 + * INET_ECN_CE => 100 + */ + return (tos & 2) ? 0 : tos; +} + static struct inet_frags ip4_frags; int ip_frag_nqueues(struct net *net) @@ -137,6 +155,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, void *a) qp->protocol = arg->iph->protocol; qp->id = arg->iph->id; + qp->ecn = ip4_frag_ecn(arg->iph->tos); qp->saddr = arg->iph->saddr; qp->daddr = arg->iph->daddr; qp->user = arg->user; @@ -316,6 +335,7 @@ static int ip_frag_reinit(struct ipq *qp) qp->q.fragments = NULL; qp->q.fragments_tail = NULL; qp->iif = 0; + qp->ecn = 0; return 0; } @@ -328,6 +348,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) int flags, offset; int ihl, end; int err = -ENOENT; + u8 ecn; if (qp->q.last_in & INET_FRAG_COMPLETE) goto err; @@ -339,6 +360,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) goto err; } + ecn = ip4_frag_ecn(ip_hdr(skb)->tos); offset = ntohs(ip_hdr(skb)->frag_off); flags = offset & ~IP_OFFSET; offset &= IP_OFFSET; @@ -472,6 +494,7 @@ found: } qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; + qp->ecn |= ecn; atomic_add(skb->truesize, &qp->q.net->mem); if (offset == 0) qp->q.last_in |= INET_FRAG_FIRST_IN; @@ -583,6 +606,17 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, iph = ip_hdr(head); iph->frag_off = 0; iph->tot_len = htons(len); + /* RFC3168 5.3 Fragmentation support + * If one fragment had INET_ECN_NOT_ECT, + * reassembled frame also has INET_ECN_NOT_ECT + * Elif one fragment had INET_ECN_CE + * reassembled frame also has INET_ECN_CE + */ + if (qp->ecn & IPFRAG_ECN_CLEAR) + iph->tos &= ~INET_ECN_MASK; + else if (qp->ecn & IPFRAG_ECN_SET_CE) + iph->tos |= INET_ECN_CE; + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; qp->q.fragments_tail = NULL; -- 1.7.3.4 >From c31d192b403aa8944c24133eea7db6de4541d24f Mon Sep 17 00:00:00 2001 From: Eric Dumazet <[email protected]> Date: Wed, 5 Jan 2011 10:35:02 +0000 Subject: [PATCH 5/5] net_sched: pfifo_head_drop problem [ Upstream commit 44b8288308ac9da27eab7d7bdbf1375a568805c3 ] commit 57dbb2d83d100ea (sched: add head drop fifo queue) introduced pfifo_head_drop, and broke the invariant that sch->bstats.bytes and sch->bstats.packets are COUNTER (increasing counters only) This can break estimators because est_timer() handles unsigned deltas only. A decreasing counter can then give a huge unsigned delta. My mid term suggestion would be to change things so that sch->bstats.bytes and sch->bstats.packets are incremented in dequeue() only, not at enqueue() time. We also could add drop_bytes/drop_packets and provide estimations of drop rates. It would be more sensible anyway for very low speeds, and big bursts. Right now, if we drop packets, they still are accounted in byte/packets abolute counters and rate estimators. Before this mid term change, this patch makes pfifo_head_drop behavior similar to other qdiscs in case of drops : Dont decrement sch->bstats.bytes and sch->bstats.packets Signed-off-by: Eric Dumazet <[email protected]> Acked-by: Hagen Paul Pfeifer <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- net/sched/sch_fifo.c | 2 -- 1 files changed, 0 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 4dfecb0..aa4d633 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -54,8 +54,6 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc* sch) /* queue full, remove one skb to fulfill the limit */ skb_head = qdisc_dequeue_head(sch); - sch->bstats.bytes -= qdisc_pkt_len(skb_head); - sch->bstats.packets--; sch->qstats.drops++; kfree_skb(skb_head); -- 1.7.3.4
_______________________________________________ stable mailing list [email protected] http://linux.kernel.org/mailman/listinfo/stable
