This patch partially isolates IPv4 by adding a network namespace
pointer to the structure sock, the bind bucket and the skbuff. When a
socket is created, the pointer to the network namespace is stored in
its struct sock, and the socket belongs to that namespace in this way.
This makes it possible to identify the sockets related to a namespace
for lookup and procfs.

The lookup is extended with a network namespace pointer in order to
identify listen points bound to the same port. That allows several
applications to be bound to INADDR_ANY:port in different network
namespaces without conflicting. The bind is checked against both the
port and the network namespace.

When an outgoing packet has a loopback destination address, the
skbuff is filled with the network namespace, so loopback packets
never go outside the namespace. This approach facilitates the migration
of loopback because identification is done by network namespace and
not by address. The loopback has been benchmarked with tbench and the
overhead is roughly 1.5%.

Replace-Subject: [Network namespace] ipv4 isolation
Signed-off-by: Daniel Lezcano <[EMAIL PROTECTED]> 
--
 include/linux/skbuff.h           |    2 ++
 include/net/inet_hashtables.h    |   34 ++++++++++++++++++++++++----------
 include/net/inet_timewait_sock.h |    1 +
 include/net/sock.h               |    4 ++++
 net/dccp/ipv4.c                  |    7 ++++---
 net/ipv4/af_inet.c               |    2 ++
 net/ipv4/inet_connection_sock.c  |    3 ++-
 net/ipv4/inet_diag.c             |    3 ++-
 net/ipv4/inet_hashtables.c       |    6 +++++-
 net/ipv4/inet_timewait_sock.c    |    1 +
 net/ipv4/ip_output.c             |    4 ++++
 net/ipv4/tcp_ipv4.c              |   25 ++++++++++++++++---------
 net/ipv4/udp.c                   |    7 +++++--
 13 files changed, 72 insertions(+), 27 deletions(-)

Index: 2.6-mm/include/linux/skbuff.h
===================================================================
--- 2.6-mm.orig/include/linux/skbuff.h
+++ 2.6-mm/include/linux/skbuff.h
@@ -27,6 +27,7 @@
 #include <linux/poll.h>
 #include <linux/net.h>
 #include <linux/textsearch.h>
+#include <linux/net_ns.h>
 #include <net/checksum.h>
 #include <linux/dmaengine.h>
 
@@ -301,6 +302,7 @@
                                *data,
                                *tail,
                                *end;
+       struct net_namespace    *net_ns;
 };
 
 #ifdef __KERNEL__
Index: 2.6-mm/include/net/inet_hashtables.h
===================================================================
--- 2.6-mm.orig/include/net/inet_hashtables.h
+++ 2.6-mm/include/net/inet_hashtables.h
@@ -23,6 +23,8 @@
 #include <linux/spinlock.h>
 #include <linux/types.h>
 #include <linux/wait.h>
+#include <linux/in.h>
+#include <linux/net_ns.h>
 
 #include <net/inet_connection_sock.h>
 #include <net/inet_sock.h>
@@ -78,6 +80,7 @@
        signed short            fastreuse;
        struct hlist_node       node;
        struct hlist_head       owners;
+       struct net_namespace    *net_ns;
 };
 
 #define inet_bind_bucket_for_each(tb, node, head) \
@@ -274,13 +277,15 @@
 extern struct sock *__inet_lookup_listener(const struct hlist_head *head,
                                           const u32 daddr,
                                           const unsigned short hnum,
-                                          const int dif);
+                                          const int dif,
+                                          const struct net_namespace *net_ns);
 
 /* Optimize the common listener case. */
 static inline struct sock *
                inet_lookup_listener(struct inet_hashinfo *hashinfo,
                                     const u32 daddr,
-                                    const unsigned short hnum, const int dif)
+                                    const unsigned short hnum, const int dif,
+                                    const struct net_namespace *net_ns)
 {
        struct sock *sk = NULL;
        const struct hlist_head *head;
@@ -294,8 +299,9 @@
                    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
                    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
                    !sk->sk_bound_dev_if)
-                       goto sherry_cache;
-               sk = __inet_lookup_listener(head, daddr, hnum, dif);
+                       if (sk->sk_net_ns == net_ns && LOOPBACK(daddr))
+                               goto sherry_cache;
+               sk = __inet_lookup_listener(head, daddr, hnum, dif, net_ns);
        }
        if (sk) {
 sherry_cache:
@@ -358,7 +364,8 @@
        __inet_lookup_established(struct inet_hashinfo *hashinfo,
                                  const u32 saddr, const u16 sport,
                                  const u32 daddr, const u16 hnum,
-                                 const int dif)
+                                 const int dif,
+                                 const struct net_namespace *net_ns)
 {
        INET_ADDR_COOKIE(acookie, saddr, daddr)
        const __u32 ports = INET_COMBINED_PORTS(sport, hnum);
@@ -373,12 +380,16 @@
        prefetch(head->chain.first);
        read_lock(&head->lock);
        sk_for_each(sk, node, &head->chain) {
+               if (sk->sk_net_ns != net_ns && LOOPBACK(daddr))
+                       continue;
                if (INET_MATCH(sk, hash, acookie, saddr, daddr, ports, dif))
                        goto hit; /* You sunk my battleship! */
        }
 
        /* Must check for a TIME_WAIT'er before going to listener hash. */
        sk_for_each(sk, node, &(head + hashinfo->ehash_size)->chain) {
+               if (sk->sk_net_ns != net_ns && LOOPBACK(daddr))
+                       continue;
                if (INET_TW_MATCH(sk, hash, acookie, saddr, daddr, ports, dif))
                        goto hit;
        }
@@ -394,22 +405,25 @@
 static inline struct sock *__inet_lookup(struct inet_hashinfo *hashinfo,
                                         const u32 saddr, const u16 sport,
                                         const u32 daddr, const u16 hnum,
-                                        const int dif)
+                                        const int dif,
+                                        const struct net_namespace *net_ns)
 {
        struct sock *sk = __inet_lookup_established(hashinfo, saddr, sport, 
daddr,
-                                                   hnum, dif);
-       return sk ? : inet_lookup_listener(hashinfo, daddr, hnum, dif);
+                                                   hnum, dif, net_ns);
+       return sk ? : inet_lookup_listener(hashinfo, daddr, hnum, dif, net_ns);
 }
 
 static inline struct sock *inet_lookup(struct inet_hashinfo *hashinfo,
                                       const u32 saddr, const u16 sport,
                                       const u32 daddr, const u16 dport,
-                                      const int dif)
+                                      const int dif,
+                                      const struct net_namespace *net_ns)
 {
        struct sock *sk;
 
        local_bh_disable();
-       sk = __inet_lookup(hashinfo, saddr, sport, daddr, ntohs(dport), dif);
+       sk = __inet_lookup(hashinfo, saddr, sport, daddr, ntohs(dport),
+                          dif, net_ns);
        local_bh_enable();
 
        return sk;
Index: 2.6-mm/include/net/inet_timewait_sock.h
===================================================================
--- 2.6-mm.orig/include/net/inet_timewait_sock.h
+++ 2.6-mm/include/net/inet_timewait_sock.h
@@ -115,6 +115,7 @@
 #define tw_refcnt              __tw_common.skc_refcnt
 #define tw_hash                        __tw_common.skc_hash
 #define tw_prot                        __tw_common.skc_prot
+#define tw_net_ns               __tw_common.skc_net_ns
        volatile unsigned char  tw_substate;
        /* 3 bits hole, try to pack */
        unsigned char           tw_rcv_wscale;
Index: 2.6-mm/include/net/sock.h
===================================================================
--- 2.6-mm.orig/include/net/sock.h
+++ 2.6-mm/include/net/sock.h
@@ -47,6 +47,7 @@
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>      /* struct sk_buff */
 #include <linux/security.h>
+#include <linux/net_ns.h>
 
 #include <linux/filter.h>
 
@@ -94,6 +95,7 @@
  *     @skc_refcnt: reference count
  *     @skc_hash: hash value used with various protocol lookup tables
  *     @skc_prot: protocol handlers inside a network family
+ *      @skc_net_ns: network namespace owning the socket
  *
  *     This is the minimal network layer representation of sockets, the header
  *     for struct sock and struct inet_timewait_sock.
@@ -108,6 +110,7 @@
        atomic_t                skc_refcnt;
        unsigned int            skc_hash;
        struct proto            *skc_prot;
+       struct net_namespace    *skc_net_ns;
 };
 
 /**
@@ -183,6 +186,7 @@
 #define sk_refcnt              __sk_common.skc_refcnt
 #define sk_hash                        __sk_common.skc_hash
 #define sk_prot                        __sk_common.skc_prot
+#define sk_net_ns               __sk_common.skc_net_ns
        unsigned char           sk_shutdown : 2,
                                sk_no_check : 2,
                                sk_userlocks : 4;
Index: 2.6-mm/net/dccp/ipv4.c
===================================================================
--- 2.6-mm.orig/net/dccp/ipv4.c
+++ 2.6-mm/net/dccp/ipv4.c
@@ -308,7 +308,8 @@
        }
 
        sk = inet_lookup(&dccp_hashinfo, iph->daddr, dh->dccph_dport,
-                        iph->saddr, dh->dccph_sport, inet_iif(skb));
+                        iph->saddr, dh->dccph_sport, inet_iif(skb),
+                        skb->net_ns);
        if (sk == NULL) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
@@ -610,7 +611,7 @@
        nsk = __inet_lookup_established(&dccp_hashinfo,
                                        iph->saddr, dh->dccph_sport,
                                        iph->daddr, ntohs(dh->dccph_dport),
-                                       inet_iif(skb));
+                                       inet_iif(skb), skb->net_ns);
        if (nsk != NULL) {
                if (nsk->sk_state != DCCP_TIME_WAIT) {
                        bh_lock_sock(nsk);
@@ -924,7 +925,7 @@
        sk = __inet_lookup(&dccp_hashinfo,
                           skb->nh.iph->saddr, dh->dccph_sport,
                           skb->nh.iph->daddr, ntohs(dh->dccph_dport),
-                          inet_iif(skb));
+                          inet_iif(skb), skb->net_ns);
 
        /* 
         * Step 2:
Index: 2.6-mm/net/ipv4/af_inet.c
===================================================================
--- 2.6-mm.orig/net/ipv4/af_inet.c
+++ 2.6-mm/net/ipv4/af_inet.c
@@ -325,6 +325,7 @@
        sk->sk_family      = PF_INET;
        sk->sk_protocol    = protocol;
        sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+       sk->sk_net_ns      = net_ns();
 
        inet->uc_ttl    = -1;
        inet->mc_loop   = 1;
@@ -616,6 +617,7 @@
 
        sock_graft(sk2, newsock);
 
+       sk2->sk_net_ns = net_ns();
        newsock->state = SS_CONNECTED;
        err = 0;
        release_sock(sk2);
Index: 2.6-mm/net/ipv4/inet_connection_sock.c
===================================================================
--- 2.6-mm.orig/net/ipv4/inet_connection_sock.c
+++ 2.6-mm/net/ipv4/inet_connection_sock.c
@@ -116,7 +116,7 @@
                head = &hashinfo->bhash[inet_bhashfn(snum, 
hashinfo->bhash_size)];
                spin_lock(&head->lock);
                inet_bind_bucket_for_each(tb, node, &head->chain)
-                       if (tb->port == snum)
+                       if (tb->port == snum && tb->net_ns == net_ns())
                                goto tb_found;
        }
        tb = NULL;
@@ -146,6 +146,7 @@
        } else if (tb->fastreuse &&
                   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
                tb->fastreuse = 0;
+       tb->net_ns = net_ns();
 success:
        if (!inet_csk(sk)->icsk_bind_hash)
                inet_bind_hash(sk, tb, snum);
Index: 2.6-mm/net/ipv4/inet_diag.c
===================================================================
--- 2.6-mm.orig/net/ipv4/inet_diag.c
+++ 2.6-mm/net/ipv4/inet_diag.c
@@ -241,7 +241,8 @@
        if (req->idiag_family == AF_INET) {
                sk = inet_lookup(hashinfo, req->id.idiag_dst[0],
                                 req->id.idiag_dport, req->id.idiag_src[0],
-                                req->id.idiag_sport, req->id.idiag_if);
+                                req->id.idiag_sport, req->id.idiag_if,
+                                in_skb->net_ns);
        }
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
        else if (req->idiag_family == AF_INET6) {
Index: 2.6-mm/net/ipv4/inet_hashtables.c
===================================================================
--- 2.6-mm.orig/net/ipv4/inet_hashtables.c
+++ 2.6-mm/net/ipv4/inet_hashtables.c
@@ -126,7 +126,8 @@
  * wildcarded during the search since they can never be otherwise.
  */
 struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 
daddr,
-                                   const unsigned short hnum, const int dif)
+                                   const unsigned short hnum, const int dif,
+                                   const struct net_namespace *net_ns)
 {
        struct sock *result = NULL, *sk;
        const struct hlist_node *node;
@@ -139,6 +140,9 @@
                        const __u32 rcv_saddr = inet->rcv_saddr;
                        int score = sk->sk_family == PF_INET ? 1 : 0;
 
+                       if (sk->sk_net_ns != net_ns && LOOPBACK(daddr))
+                               continue;
+
                        if (rcv_saddr) {
                                if (rcv_saddr != daddr)
                                        continue;
Index: 2.6-mm/net/ipv4/inet_timewait_sock.c
===================================================================
--- 2.6-mm.orig/net/ipv4/inet_timewait_sock.c
+++ 2.6-mm/net/ipv4/inet_timewait_sock.c
@@ -110,6 +110,7 @@
                tw->tw_hash         = sk->sk_hash;
                tw->tw_ipv6only     = 0;
                tw->tw_prot         = sk->sk_prot_creator;
+               tw->tw_net_ns       = sk->sk_net_ns;
                atomic_set(&tw->tw_refcnt, 1);
                inet_twsk_dead_node_init(tw);
                __module_get(tw->tw_prot->owner);
Index: 2.6-mm/net/ipv4/ip_output.c
===================================================================
--- 2.6-mm.orig/net/ipv4/ip_output.c
+++ 2.6-mm/net/ipv4/ip_output.c
@@ -284,6 +284,10 @@
 
        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);
+       if ((skb->nh.iph->protocol == IPPROTO_TCP ||
+            skb->nh.iph->protocol == IPPROTO_UDP) &&
+           LOOPBACK(skb->nh.iph->daddr))
+                       skb->net_ns = skb->sk->sk_net_ns;
 
        return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
                            ip_finish_output,
Index: 2.6-mm/net/ipv4/tcp_ipv4.c
===================================================================
--- 2.6-mm.orig/net/ipv4/tcp_ipv4.c
+++ 2.6-mm/net/ipv4/tcp_ipv4.c
@@ -349,7 +349,7 @@
        }
 
        sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr,
-                        th->source, inet_iif(skb));
+                        th->source, inet_iif(skb), skb->net_ns);
        if (!sk) {
                ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
                return;
@@ -933,7 +933,8 @@
 
        nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr,
                                        th->source, skb->nh.iph->daddr,
-                                       ntohs(th->dest), inet_iif(skb));
+                                       ntohs(th->dest), inet_iif(skb),
+                                       skb->net_ns);
 
        if (nsk) {
                if (nsk->sk_state != TCP_TIME_WAIT) {
@@ -1071,7 +1072,7 @@
 
        sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source,
                           skb->nh.iph->daddr, ntohs(th->dest),
-                          inet_iif(skb));
+                          inet_iif(skb), skb->net_ns);
 
        if (!sk)
                goto no_tcp_socket;
@@ -1149,7 +1150,8 @@
                struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
                                                        skb->nh.iph->daddr,
                                                        ntohs(th->dest),
-                                                       inet_iif(skb));
+                                                       inet_iif(skb),
+                                                       skb->net_ns);
                if (sk2) {
                        inet_twsk_deschedule((struct inet_timewait_sock *)sk,
                                             &tcp_death_row);
@@ -1395,7 +1397,8 @@
        }
 get_sk:
        sk_for_each_from(sk, node) {
-               if (sk->sk_family == st->family) {
+               if (sk->sk_family == st->family &&
+                   sk->sk_net_ns == net_ns()) {
                        cur = sk;
                        goto out;
                }
@@ -1446,7 +1449,8 @@
 
                read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
                sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
-                       if (sk->sk_family != st->family) {
+                       if (sk->sk_family != st->family ||
+                           sk->sk_net_ns != net_ns()) {
                                continue;
                        }
                        rc = sk;
@@ -1455,7 +1459,8 @@
                st->state = TCP_SEQ_STATE_TIME_WAIT;
                inet_twsk_for_each(tw, node,
                                   &tcp_hashinfo.ehash[st->bucket + 
tcp_hashinfo.ehash_size].chain) {
-                       if (tw->tw_family != st->family) {
+                       if (tw->tw_family != st->family ||
+                           tw->tw_net_ns != net_ns()) {
                                continue;
                        }
                        rc = tw;
@@ -1481,7 +1486,8 @@
                tw = cur;
                tw = tw_next(tw);
 get_tw:
-               while (tw && tw->tw_family != st->family) {
+               while (tw && (tw->tw_family != st->family ||
+                             tw->tw_net_ns != net_ns())) {
                        tw = tw_next(tw);
                }
                if (tw) {
@@ -1505,7 +1511,8 @@
                sk = sk_next(sk);
 
        sk_for_each_from(sk, node) {
-               if (sk->sk_family == st->family)
+               if (sk->sk_family == st->family &&
+                   sk->sk_net_ns == net_ns())
                        goto found;
        }
 
Index: 2.6-mm/net/ipv4/udp.c
===================================================================
--- 2.6-mm.orig/net/ipv4/udp.c
+++ 2.6-mm/net/ipv4/udp.c
@@ -184,6 +184,7 @@
                            (!inet2->rcv_saddr ||
                             !inet->rcv_saddr ||
                             inet2->rcv_saddr == inet->rcv_saddr) &&
+                           sk2->sk_net_ns == sk->sk_net_ns &&
                            (!sk2->sk_reuse || !sk->sk_reuse))
                                goto fail;
                }
@@ -1404,7 +1405,8 @@
        for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; 
++state->bucket) {
                struct hlist_node *node;
                sk_for_each(sk, node, &udp_hash[state->bucket]) {
-                       if (sk->sk_family == state->family)
+                       if (sk->sk_family == state->family &&
+                           sk->sk_net_ns == net_ns())
                                goto found;
                }
        }
@@ -1421,7 +1423,8 @@
                sk = sk_next(sk);
 try_again:
                ;
-       } while (sk && sk->sk_family != state->family);
+       } while (sk && (sk->sk_family != state->family ||
+                       sk->sk_net_ns != net_ns()));
 
        if (!sk && ++state->bucket < UDP_HTABLE_SIZE) {
                sk = sk_head(&udp_hash[state->bucket]);

--
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to