Re: [PATCH net] net/ipv6: Move anycast init/cleanup functions out of CONFIG_PROC_FS

2018-11-05 Thread Jeff Barnhill
Thanks, David.  Sorry for missing that in the original patch.

Jeff
On Mon, Nov 5, 2018 at 4:55 PM David Miller  wrote:
>
> From: Jeff Barnhill <0xeff...@gmail.com>
> Date: Mon,  5 Nov 2018 20:36:45 +
>
> > Move the anycast.c init and cleanup functions, which were inadvertently
> > added inside the CONFIG_PROC_FS #ifdef block, outside of that block.
> >
> > Fixes: 2384d02520ff ("net/ipv6: Add anycast addresses to a global hashtable")
> > Signed-off-by: Jeff Barnhill <0xeff...@gmail.com>
>
> Applied, thanks Jeff.


[PATCH net] net/ipv6: Move anycast init/cleanup functions out of CONFIG_PROC_FS

2018-11-05 Thread Jeff Barnhill
Move the anycast.c init and cleanup functions, which were inadvertently
added inside the CONFIG_PROC_FS #ifdef block, outside of that block.

Fixes: 2384d02520ff ("net/ipv6: Add anycast addresses to a global hashtable")
Signed-off-by: Jeff Barnhill <0xeff...@gmail.com>
---
 net/ipv6/anycast.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 7698637cf827..94999058e110 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -590,6 +590,7 @@ void ac6_proc_exit(struct net *net)
 {
remove_proc_entry("anycast6", net->proc_net);
 }
+#endif
 
 /* Init / cleanup code
  */
@@ -611,4 +612,3 @@ void ipv6_anycast_cleanup(void)
WARN_ON(!hlist_empty(&inet6_acaddr_lst[i]));
spin_unlock(&acaddr_hash_lock);
 }
-#endif
-- 
2.14.1
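
As an illustrative sketch of the layout this fix restores in anycast.c (not
the literal file contents; function bodies are omitted here), the /proc
handlers stay under the guard while the init/cleanup code is always built:

#ifdef CONFIG_PROC_FS
/* ... ac6_proc_init() / ac6_proc_exit() ... */
#endif  /* the #endif now sits here, ahead of the init/cleanup code */

/* compiled even when CONFIG_PROC_FS=n */
int __init ipv6_anycast_init(void)
{
	/* hash table setup; see the hunks above */
	return 0;
}

void ipv6_anycast_cleanup(void)
{
	/* hash table teardown; see the hunks above */
}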



[PATCH net v7] net/ipv6: Add anycast addresses to a global hashtable

2018-11-02 Thread Jeff Barnhill
The icmp6_send() function is expensive on systems with a large number of
interfaces. Every time it’s called, it has to verify that the source
address does not correspond to an existing anycast address by looping
through every device and every anycast address on the device.  This can
result in significant delays for a CPU when there are a large number of
neighbors and ND timers are frequently timing out and calling
neigh_invalidate().

Add anycast addresses to a global hashtable to allow quick searching for
matching anycast addresses.  This is based on inet6_addr_lst in addrconf.c.

Signed-off-by: Jeff Barnhill <0xeff...@gmail.com>
---
 include/net/addrconf.h |  2 ++
 include/net/if_inet6.h |  2 ++
 net/ipv6/af_inet6.c|  5 
 net/ipv6/anycast.c | 80 +++---
 4 files changed, 85 insertions(+), 4 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 14b789a123e7..1656c5978498 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -317,6 +317,8 @@ bool ipv6_chk_acast_addr(struct net *net, struct net_device 
*dev,
 const struct in6_addr *addr);
 bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev,
 const struct in6_addr *addr);
+int ipv6_anycast_init(void);
+void ipv6_anycast_cleanup(void);
 
 /* Device notifier */
 int register_inet6addr_notifier(struct notifier_block *nb);
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index d7578cf49c3a..c9c78c15bce0 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -146,10 +146,12 @@ struct ifacaddr6 {
struct in6_addr aca_addr;
struct fib6_info*aca_rt;
struct ifacaddr6*aca_next;
+   struct hlist_node   aca_addr_lst;
int aca_users;
refcount_t  aca_refcnt;
unsigned long   aca_cstamp;
unsigned long   aca_tstamp;
+   struct rcu_head rcu;
 };
 
 #defineIFA_HOSTIPV6_ADDR_LOOPBACK
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 3f4d61017a69..f0cd291034f0 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -1001,6 +1001,9 @@ static int __init inet6_init(void)
err = ip6_flowlabel_init();
if (err)
goto ip6_flowlabel_fail;
+   err = ipv6_anycast_init();
+   if (err)
+   goto ipv6_anycast_fail;
err = addrconf_init();
if (err)
goto addrconf_fail;
@@ -1091,6 +1094,8 @@ static int __init inet6_init(void)
 ipv6_exthdrs_fail:
addrconf_cleanup();
 addrconf_fail:
+   ipv6_anycast_cleanup();
+ipv6_anycast_fail:
ip6_flowlabel_cleanup();
 ip6_flowlabel_fail:
ndisc_late_cleanup();
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 4e0ff7031edd..7698637cf827 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -44,8 +44,22 @@
 
 #include 
 
+#define IN6_ADDR_HSIZE_SHIFT   8
+#define IN6_ADDR_HSIZE BIT(IN6_ADDR_HSIZE_SHIFT)
+/* anycast address hash table
+ */
+static struct hlist_head inet6_acaddr_lst[IN6_ADDR_HSIZE];
+static DEFINE_SPINLOCK(acaddr_hash_lock);
+
 static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr 
*addr);
 
+static u32 inet6_acaddr_hash(struct net *net, const struct in6_addr *addr)
+{
+   u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net);
+
+   return hash_32(val, IN6_ADDR_HSIZE_SHIFT);
+}
+
 /*
  * socket join an anycast group
  */
@@ -204,16 +218,39 @@ void ipv6_sock_ac_close(struct sock *sk)
rtnl_unlock();
 }
 
+static void ipv6_add_acaddr_hash(struct net *net, struct ifacaddr6 *aca)
+{
+   unsigned int hash = inet6_acaddr_hash(net, &aca->aca_addr);
+
+   spin_lock(&acaddr_hash_lock);
+   hlist_add_head_rcu(&aca->aca_addr_lst, &inet6_acaddr_lst[hash]);
+   spin_unlock(&acaddr_hash_lock);
+}
+
+static void ipv6_del_acaddr_hash(struct ifacaddr6 *aca)
+{
+   spin_lock(&acaddr_hash_lock);
+   hlist_del_init_rcu(&aca->aca_addr_lst);
+   spin_unlock(&acaddr_hash_lock);
+}
+
 static void aca_get(struct ifacaddr6 *aca)
 {
refcount_inc(&aca->aca_refcnt);
 }
 
+static void aca_free_rcu(struct rcu_head *h)
+{
+   struct ifacaddr6 *aca = container_of(h, struct ifacaddr6, rcu);
+
+   fib6_info_release(aca->aca_rt);
+   kfree(aca);
+}
+
 static void aca_put(struct ifacaddr6 *ac)
 {
if (refcount_dec_and_test(&ac->aca_refcnt)) {
-   fib6_info_release(ac->aca_rt);
-   kfree(ac);
+   call_rcu(&ac->rcu, aca_free_rcu);
}
 }
 
@@ -229,6 +266,7 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i,
aca->aca_addr = *addr;
fib6_info_hold(f6i);
aca->aca_rt = f6i;
+   INIT_HLIST_NODE(&aca->aca_addr_lst);
aca->aca_users = 1;
/* aca_tstamp should be updated upon changes */
aca->aca_cstamp = aca->aca_tstamp = jiffies;
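
For reference, a rough sketch of the reader side that such a table enables
(the helper name here is illustrative, and the exact netns check in the
applied commit may differ; field names follow the v4.19-era fib6_info):

/* Sketch only: look up addr in the global anycast hash under RCU. */
static bool ipv6_acaddr_lookup(struct net *net, const struct in6_addr *addr)
{
	unsigned int hash = inet6_acaddr_hash(net, addr);
	struct ifacaddr6 *aca;
	bool found = false;

	rcu_read_lock();
	hlist_for_each_entry_rcu(aca, &inet6_acaddr_lst[hash], aca_addr_lst) {
		if (ipv6_addr_equal(&aca->aca_addr, addr) &&
		    net_eq(dev_net(aca->aca_rt->fib6_nh.nh_dev), net)) {
			found = true;
			break;
		}
	}
	rcu_read_unlock();
	return found;
}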

[PATCH net v6] net/ipv6: Add anycast addresses to a global hashtable

2018-10-31 Thread Jeff Barnhill
The icmp6_send() function is expensive on systems with a large number of
interfaces. Every time it’s called, it has to verify that the source
address does not correspond to an existing anycast address by looping
through every device and every anycast address on the device.  This can
result in significant delays for a CPU when there are a large number of
neighbors and ND timers are frequently timing out and calling
neigh_invalidate().

Add anycast addresses to a global hashtable to allow quick searching for
matching anycast addresses.  This is based on inet6_addr_lst in addrconf.c.

Signed-off-by: Jeff Barnhill <0xeff...@gmail.com>
---
 include/net/addrconf.h |  2 ++
 include/net/if_inet6.h |  2 ++
 net/ipv6/af_inet6.c|  5 
 net/ipv6/anycast.c | 80 +++---
 4 files changed, 85 insertions(+), 4 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 14b789a123e7..799af1a037d1 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -317,6 +317,8 @@ bool ipv6_chk_acast_addr(struct net *net, struct net_device 
*dev,
 const struct in6_addr *addr);
 bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev,
 const struct in6_addr *addr);
+int anycast_init(void);
+void anycast_cleanup(void);
 
 /* Device notifier */
 int register_inet6addr_notifier(struct notifier_block *nb);
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index d7578cf49c3a..c9c78c15bce0 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -146,10 +146,12 @@ struct ifacaddr6 {
struct in6_addr aca_addr;
struct fib6_info*aca_rt;
struct ifacaddr6*aca_next;
+   struct hlist_node   aca_addr_lst;
int aca_users;
refcount_t  aca_refcnt;
unsigned long   aca_cstamp;
unsigned long   aca_tstamp;
+   struct rcu_head rcu;
 };
 
 #defineIFA_HOSTIPV6_ADDR_LOOPBACK
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 3f4d61017a69..ddc8a6dbfba2 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -1001,6 +1001,9 @@ static int __init inet6_init(void)
err = ip6_flowlabel_init();
if (err)
goto ip6_flowlabel_fail;
+   err = anycast_init();
+   if (err)
+   goto anycast_fail;
err = addrconf_init();
if (err)
goto addrconf_fail;
@@ -1091,6 +1094,8 @@ static int __init inet6_init(void)
 ipv6_exthdrs_fail:
addrconf_cleanup();
 addrconf_fail:
+   anycast_cleanup();
+anycast_fail:
ip6_flowlabel_cleanup();
 ip6_flowlabel_fail:
ndisc_late_cleanup();
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 4e0ff7031edd..f6c4c8ac184c 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -44,8 +44,22 @@
 
 #include 
 
+#define IN6_ADDR_HSIZE_SHIFT   8
+#define IN6_ADDR_HSIZE BIT(IN6_ADDR_HSIZE_SHIFT)
+/* anycast address hash table
+ */
+static struct hlist_head inet6_acaddr_lst[IN6_ADDR_HSIZE];
+static DEFINE_SPINLOCK(acaddr_hash_lock);
+
 static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr 
*addr);
 
+static u32 inet6_acaddr_hash(struct net *net, const struct in6_addr *addr)
+{
+   u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net);
+
+   return hash_32(val, IN6_ADDR_HSIZE_SHIFT);
+}
+
 /*
  * socket join an anycast group
  */
@@ -204,16 +218,39 @@ void ipv6_sock_ac_close(struct sock *sk)
rtnl_unlock();
 }
 
+static void ipv6_add_acaddr_hash(struct net *net, struct ifacaddr6 *aca)
+{
+   unsigned int hash = inet6_acaddr_hash(net, &aca->aca_addr);
+
+   spin_lock(&acaddr_hash_lock);
+   hlist_add_head_rcu(&aca->aca_addr_lst, &inet6_acaddr_lst[hash]);
+   spin_unlock(&acaddr_hash_lock);
+}
+
+static void ipv6_del_acaddr_hash(struct ifacaddr6 *aca)
+{
+   spin_lock(&acaddr_hash_lock);
+   hlist_del_init_rcu(&aca->aca_addr_lst);
+   spin_unlock(&acaddr_hash_lock);
+}
+
 static void aca_get(struct ifacaddr6 *aca)
 {
refcount_inc(&aca->aca_refcnt);
 }
 
+static void aca_free_rcu(struct rcu_head *h)
+{
+   struct ifacaddr6 *aca = container_of(h, struct ifacaddr6, rcu);
+
+   fib6_info_release(aca->aca_rt);
+   kfree(aca);
+}
+
 static void aca_put(struct ifacaddr6 *ac)
 {
if (refcount_dec_and_test(&ac->aca_refcnt)) {
-   fib6_info_release(ac->aca_rt);
-   kfree(ac);
+   call_rcu(&ac->rcu, aca_free_rcu);
}
 }
 
@@ -229,6 +266,7 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i,
aca->aca_addr = *addr;
fib6_info_hold(f6i);
aca->aca_rt = f6i;
+   INIT_HLIST_NODE(&aca->aca_addr_lst);
aca->aca_users = 1;
/* aca_tstamp should be updated upon changes */
aca->aca_cstamp = aca->aca_tstamp = jiffies;
@@ -285,6 +323,8 @@ int __ipv6

Re: [PATCH net v5] net/ipv6: Add anycast addresses to a global hashtable

2018-10-31 Thread Jeff Barnhill
I'll follow this email with a new patch using ifacaddr6 instead of
creating a new struct. I ended up using fib6_nh.nh_dev to get the net,
instead of adding a back pointer to idev.  It seems that idev was
recently removed in favor of this approach, so if this is incorrect,
please let me know. Hopefully, I got the locking correct.
Thanks,
Jeff
On Tue, Oct 30, 2018 at 7:19 PM David Miller  wrote:
>
> From: David Ahern 
> Date: Tue, 30 Oct 2018 16:06:46 -0600
>
> > or make the table per namespace.
>
> This will increase namespace create/destroy cost, so I'd rather not
> for something like this.


Re: [PATCH net v5] net/ipv6: Add anycast addresses to a global hashtable

2018-10-30 Thread Jeff Barnhill
I originally started implementing it the way you suggested; however,
it seemed to complicate management of that structure because it isn't
currently using RCU.  Also, assuming that can be worked out, where
would I get the net from?  Would I need to store a copy in ifacaddr6,
or is there some way to access it during ipv6_chk_acast_addr()?  It
seems that if I don't add a copy of net, but instead access it through
aca_rt(?), then freeing the ifacaddr6 memory becomes problematic
(detaching it from idev while an RCU reader may still be accessing it).
On Mon, Oct 29, 2018 at 11:32 PM David Miller  wrote:
>
> From: Jeff Barnhill <0xeff...@gmail.com>
> Date: Sun, 28 Oct 2018 01:51:59 +
>
> > +struct ipv6_ac_addrlist {
> > + struct in6_addr acal_addr;
> > + possible_net_t  acal_pnet;
> > + refcount_t  acal_users;
> > + struct hlist_node   acal_lst; /* inet6_acaddr_lst */
> > + struct rcu_head rcu;
> > +};
>
> Please just add the hlist to ifcaddr6 instead of duplicating so much
> information and reference counters here.
>
> This seems to waste a lot of memory unnecessary and add lots of
> unnecessary object allocate/setup/destroy logic.


[PATCH net v5] net/ipv6: Add anycast addresses to a global hashtable

2018-10-27 Thread Jeff Barnhill
The icmp6_send() function is expensive on systems with a large number of
interfaces. Every time it’s called, it has to verify that the source
address does not correspond to an existing anycast address by looping
through every device and every anycast address on the device.  This can
result in significant delays for a CPU when there are a large number of
neighbors and ND timers are frequently timing out and calling
neigh_invalidate().

Add anycast addresses to a global hashtable to allow quick searching for
matching anycast addresses.  This is based on inet6_addr_lst in addrconf.c.

Signed-off-by: Jeff Barnhill <0xeff...@gmail.com>
---
 include/net/addrconf.h |   2 +
 include/net/if_inet6.h |   8 
 net/ipv6/af_inet6.c|   5 ++
 net/ipv6/anycast.c | 121 -
 4 files changed, 134 insertions(+), 2 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 14b789a123e7..799af1a037d1 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -317,6 +317,8 @@ bool ipv6_chk_acast_addr(struct net *net, struct net_device 
*dev,
 const struct in6_addr *addr);
 bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev,
 const struct in6_addr *addr);
+int anycast_init(void);
+void anycast_cleanup(void);
 
 /* Device notifier */
 int register_inet6addr_notifier(struct notifier_block *nb);
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index d7578cf49c3a..a445014b981d 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -142,6 +142,14 @@ struct ipv6_ac_socklist {
struct ipv6_ac_socklist *acl_next;
 };
 
+struct ipv6_ac_addrlist {
+   struct in6_addr acal_addr;
+   possible_net_t  acal_pnet;
+   refcount_t  acal_users;
+   struct hlist_node   acal_lst; /* inet6_acaddr_lst */
+   struct rcu_head rcu;
+};
+
 struct ifacaddr6 {
struct in6_addr aca_addr;
struct fib6_info*aca_rt;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 3f4d61017a69..ddc8a6dbfba2 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -1001,6 +1001,9 @@ static int __init inet6_init(void)
err = ip6_flowlabel_init();
if (err)
goto ip6_flowlabel_fail;
+   err = anycast_init();
+   if (err)
+   goto anycast_fail;
err = addrconf_init();
if (err)
goto addrconf_fail;
@@ -1091,6 +1094,8 @@ static int __init inet6_init(void)
 ipv6_exthdrs_fail:
addrconf_cleanup();
 addrconf_fail:
+   anycast_cleanup();
+anycast_fail:
ip6_flowlabel_cleanup();
 ip6_flowlabel_fail:
ndisc_late_cleanup();
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 4e0ff7031edd..ca51c9d57ce5 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -44,8 +44,22 @@
 
 #include 
 
+#define IN6_ADDR_HSIZE_SHIFT   8
+#define IN6_ADDR_HSIZE BIT(IN6_ADDR_HSIZE_SHIFT)
+/* anycast address hash table
+ */
+static struct hlist_head inet6_acaddr_lst[IN6_ADDR_HSIZE];
+static DEFINE_SPINLOCK(acaddr_hash_lock);
+
 static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr 
*addr);
 
+static u32 inet6_acaddr_hash(struct net *net, const struct in6_addr *addr)
+{
+   u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net);
+
+   return hash_32(val, IN6_ADDR_HSIZE_SHIFT);
+}
+
 /*
  * socket join an anycast group
  */
@@ -204,6 +218,73 @@ void ipv6_sock_ac_close(struct sock *sk)
rtnl_unlock();
 }
 
+static struct ipv6_ac_addrlist *acal_alloc(struct net *net,
+  const struct in6_addr *addr)
+{
+   struct ipv6_ac_addrlist *acal;
+
+   acal = kzalloc(sizeof(*acal), GFP_ATOMIC);
+   if (!acal)
+   return NULL;
+
+   acal->acal_addr = *addr;
+   write_pnet(&acal->acal_pnet, net);
+   refcount_set(&acal->acal_users, 1);
+   INIT_HLIST_NODE(&acal->acal_lst);
+
+   return acal;
+}
+
+static int ipv6_add_acaddr_hash(struct net *net, const struct in6_addr *addr)
+{
+   unsigned int hash = inet6_acaddr_hash(net, addr);
+   struct ipv6_ac_addrlist *acal;
+   int err = 0;
+
+   spin_lock(&acaddr_hash_lock);
+   hlist_for_each_entry(acal, &inet6_acaddr_lst[hash], acal_lst) {
+   if (!net_eq(read_pnet(&acal->acal_pnet), net))
+   continue;
+   if (ipv6_addr_equal(&acal->acal_addr, addr)) {
+   refcount_inc(&acal->acal_users);
+   goto out;
+   }
+   }
+
+   acal = acal_alloc(net, addr);
+   if (!acal) {
+   err = -ENOMEM;
+   goto out;
+   }
+
+   hlist_add_head_rcu(&acal->acal_lst, &inet6_acaddr_lst[hash]);
+
+out:
+   spin_unlock(&acaddr_hash_lock);
+   return err;
+}
+
+static void ipv6_del_acaddr_hash(struct net *net, const struct in6_addr *addr

Re: [PATCH net v4] net/ipv6: Add anycast addresses to a global hashtable

2018-10-27 Thread Jeff Barnhill
You are right, David...I mistook the refcount_dec_and_test() in
aca_put() as being for the fib6_info, but it's for the aca_refcnt.
Thanks!  I'll submit a corrected patch.
On Sat, Oct 27, 2018 at 7:39 PM David Ahern  wrote:
>
> On 10/27/18 12:02 PM, Jeff Barnhill wrote:
> > @@ -275,6 +356,11 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const 
> > struct in6_addr *addr)
> >   err = -ENOMEM;
> >   goto out;
> >   }
> > + err = ipv6_add_acaddr_hash(dev_net(idev->dev), addr);
> > + if (err) {
> > + aca_put(aca);
> > + goto out;
> > + }
> >
> >   aca->aca_next = idev->ac_list;
> >   idev->ac_list = aca;
>
> you misunderstood my comment. aca_put is instead of a double call to
> fib6_info_release(f6i). You still need one call to
> fib6_info_release(f6i) for the addrconf_f6i_alloc.


[PATCH net v4] net/ipv6: Add anycast addresses to a global hashtable

2018-10-27 Thread Jeff Barnhill
The icmp6_send() function is expensive on systems with a large number of
interfaces. Every time it’s called, it has to verify that the source
address does not correspond to an existing anycast address by looping
through every device and every anycast address on the device.  This can
result in significant delays for a CPU when there are a large number of
neighbors and ND timers are frequently timing out and calling
neigh_invalidate().

Add anycast addresses to a global hashtable to allow quick searching for
matching anycast addresses.  This is based on inet6_addr_lst in addrconf.c.

Signed-off-by: Jeff Barnhill <0xeff...@gmail.com>
---
 include/net/addrconf.h |   2 +
 include/net/if_inet6.h |   8 
 net/ipv6/af_inet6.c|   5 +++
 net/ipv6/anycast.c | 120 -
 4 files changed, 133 insertions(+), 2 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 14b789a123e7..799af1a037d1 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -317,6 +317,8 @@ bool ipv6_chk_acast_addr(struct net *net, struct net_device 
*dev,
 const struct in6_addr *addr);
 bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev,
 const struct in6_addr *addr);
+int anycast_init(void);
+void anycast_cleanup(void);
 
 /* Device notifier */
 int register_inet6addr_notifier(struct notifier_block *nb);
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index d7578cf49c3a..a445014b981d 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -142,6 +142,14 @@ struct ipv6_ac_socklist {
struct ipv6_ac_socklist *acl_next;
 };
 
+struct ipv6_ac_addrlist {
+   struct in6_addr acal_addr;
+   possible_net_t  acal_pnet;
+   refcount_t  acal_users;
+   struct hlist_node   acal_lst; /* inet6_acaddr_lst */
+   struct rcu_head rcu;
+};
+
 struct ifacaddr6 {
struct in6_addr aca_addr;
struct fib6_info*aca_rt;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 3f4d61017a69..ddc8a6dbfba2 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -1001,6 +1001,9 @@ static int __init inet6_init(void)
err = ip6_flowlabel_init();
if (err)
goto ip6_flowlabel_fail;
+   err = anycast_init();
+   if (err)
+   goto anycast_fail;
err = addrconf_init();
if (err)
goto addrconf_fail;
@@ -1091,6 +1094,8 @@ static int __init inet6_init(void)
 ipv6_exthdrs_fail:
addrconf_cleanup();
 addrconf_fail:
+   anycast_cleanup();
+anycast_fail:
ip6_flowlabel_cleanup();
 ip6_flowlabel_fail:
ndisc_late_cleanup();
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 4e0ff7031edd..45585010908a 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -44,8 +44,22 @@
 
 #include 
 
+#define IN6_ADDR_HSIZE_SHIFT   8
+#define IN6_ADDR_HSIZE BIT(IN6_ADDR_HSIZE_SHIFT)
+/* anycast address hash table
+ */
+static struct hlist_head inet6_acaddr_lst[IN6_ADDR_HSIZE];
+static DEFINE_SPINLOCK(acaddr_hash_lock);
+
 static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr 
*addr);
 
+static u32 inet6_acaddr_hash(struct net *net, const struct in6_addr *addr)
+{
+   u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net);
+
+   return hash_32(val, IN6_ADDR_HSIZE_SHIFT);
+}
+
 /*
  * socket join an anycast group
  */
@@ -204,6 +218,73 @@ void ipv6_sock_ac_close(struct sock *sk)
rtnl_unlock();
 }
 
+static struct ipv6_ac_addrlist *acal_alloc(struct net *net,
+  const struct in6_addr *addr)
+{
+   struct ipv6_ac_addrlist *acal;
+
+   acal = kzalloc(sizeof(*acal), GFP_ATOMIC);
+   if (!acal)
+   return NULL;
+
+   acal->acal_addr = *addr;
+   write_pnet(&acal->acal_pnet, net);
+   refcount_set(&acal->acal_users, 1);
+   INIT_HLIST_NODE(&acal->acal_lst);
+
+   return acal;
+}
+
+static int ipv6_add_acaddr_hash(struct net *net, const struct in6_addr *addr)
+{
+   unsigned int hash = inet6_acaddr_hash(net, addr);
+   struct ipv6_ac_addrlist *acal;
+   int err = 0;
+
+   spin_lock(&acaddr_hash_lock);
+   hlist_for_each_entry(acal, &inet6_acaddr_lst[hash], acal_lst) {
+   if (!net_eq(read_pnet(&acal->acal_pnet), net))
+   continue;
+   if (ipv6_addr_equal(&acal->acal_addr, addr)) {
+   refcount_inc(&acal->acal_users);
+   goto out;
+   }
+   }
+
+   acal = acal_alloc(net, addr);
+   if (!acal) {
+   err = -ENOMEM;
+   goto out;
+   }
+
+   hlist_add_head_rcu(&acal->acal_lst, &inet6_acaddr_lst[hash]);
+
+out:
+   spin_unlock(&acaddr_hash_lock);
+   return err;
+}
+
+static void ipv6_del_acaddr_hash(struct net *net, const struct in6_addr *addr

[PATCH net v3] net/ipv6: Add anycast addresses to a global hashtable

2018-10-26 Thread Jeff Barnhill
The icmp6_send() function is expensive on systems with a large number of
interfaces. Every time it’s called, it has to verify that the source
address does not correspond to an existing anycast address by looping
through every device and every anycast address on the device.  This can
result in significant delays for a CPU when there are a large number of
neighbors and ND timers are frequently timing out and calling
neigh_invalidate().

Add anycast addresses to a global hashtable to allow quick searching for
matching anycast addresses.  This is based on inet6_addr_lst in addrconf.c.

Signed-off-by: Jeff Barnhill <0xeff...@gmail.com>
---
 include/net/addrconf.h |   2 +
 include/net/if_inet6.h |   8 
 net/ipv6/af_inet6.c|   5 ++
 net/ipv6/anycast.c | 122 -
 4 files changed, 135 insertions(+), 2 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 14b789a123e7..799af1a037d1 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -317,6 +317,8 @@ bool ipv6_chk_acast_addr(struct net *net, struct net_device 
*dev,
 const struct in6_addr *addr);
 bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev,
 const struct in6_addr *addr);
+int anycast_init(void);
+void anycast_cleanup(void);
 
 /* Device notifier */
 int register_inet6addr_notifier(struct notifier_block *nb);
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index d7578cf49c3a..a445014b981d 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -142,6 +142,14 @@ struct ipv6_ac_socklist {
struct ipv6_ac_socklist *acl_next;
 };
 
+struct ipv6_ac_addrlist {
+   struct in6_addr acal_addr;
+   possible_net_t  acal_pnet;
+   refcount_t  acal_users;
+   struct hlist_node   acal_lst; /* inet6_acaddr_lst */
+   struct rcu_head rcu;
+};
+
 struct ifacaddr6 {
struct in6_addr aca_addr;
struct fib6_info*aca_rt;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 3f4d61017a69..ddc8a6dbfba2 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -1001,6 +1001,9 @@ static int __init inet6_init(void)
err = ip6_flowlabel_init();
if (err)
goto ip6_flowlabel_fail;
+   err = anycast_init();
+   if (err)
+   goto anycast_fail;
err = addrconf_init();
if (err)
goto addrconf_fail;
@@ -1091,6 +1094,8 @@ static int __init inet6_init(void)
 ipv6_exthdrs_fail:
addrconf_cleanup();
 addrconf_fail:
+   anycast_cleanup();
+anycast_fail:
ip6_flowlabel_cleanup();
 ip6_flowlabel_fail:
ndisc_late_cleanup();
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 4e0ff7031edd..1040d08867ab 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -44,8 +44,22 @@
 
 #include 
 
+#define IN6_ADDR_HSIZE_SHIFT   8
+#define IN6_ADDR_HSIZE BIT(IN6_ADDR_HSIZE_SHIFT)
+/* anycast address hash table
+ */
+static struct hlist_head inet6_acaddr_lst[IN6_ADDR_HSIZE];
+static DEFINE_SPINLOCK(acaddr_hash_lock);
+
 static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr 
*addr);
 
+static u32 inet6_acaddr_hash(struct net *net, const struct in6_addr *addr)
+{
+   u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net);
+
+   return hash_32(val, IN6_ADDR_HSIZE_SHIFT);
+}
+
 /*
  * socket join an anycast group
  */
@@ -204,6 +218,73 @@ void ipv6_sock_ac_close(struct sock *sk)
rtnl_unlock();
 }
 
+static struct ipv6_ac_addrlist *acal_alloc(struct net *net,
+  const struct in6_addr *addr)
+{
+   struct ipv6_ac_addrlist *acal;
+
+   acal = kzalloc(sizeof(*acal), GFP_ATOMIC);
+   if (!acal)
+   return NULL;
+
+   acal->acal_addr = *addr;
+   write_pnet(&acal->acal_pnet, net);
+   refcount_set(&acal->acal_users, 1);
+   INIT_HLIST_NODE(&acal->acal_lst);
+
+   return acal;
+}
+
+static int ipv6_add_acaddr_hash(struct net *net, const struct in6_addr *addr)
+{
+   unsigned int hash = inet6_acaddr_hash(net, addr);
+   struct ipv6_ac_addrlist *acal;
+   int err = 0;
+
+   spin_lock(&acaddr_hash_lock);
+   hlist_for_each_entry(acal, &inet6_acaddr_lst[hash], acal_lst) {
+   if (!net_eq(read_pnet(&acal->acal_pnet), net))
+   continue;
+   if (ipv6_addr_equal(&acal->acal_addr, addr)) {
+   refcount_inc(&acal->acal_users);
+   goto out;
+   }
+   }
+
+   acal = acal_alloc(net, addr);
+   if (!acal) {
+   err = -ENOMEM;
+   goto out;
+   }
+
+   hlist_add_head_rcu(&acal->acal_lst, &inet6_acaddr_lst[hash]);
+
+out:
+   spin_unlock(&acaddr_hash_lock);
+   return err;
+}
+
+static void ipv6_del_acaddr_hash(struct net *net, const struct in6_addr *addr

Re: [PATCH net v2] net/ipv6: Add anycast addresses to a global hashtable

2018-10-23 Thread Jeff Barnhill
Thanks for the feedback.

As suggested, I did these things:
 - switched to refcount_t
 - stopped grabbing a reference on the netns (now able to use kfree_rcu)
 - re-ordered ipv6_chk_acast_addr variable definitions to reverse xmas tree

With regard to your question in __ipv6_dev_ac_inc():

> + err = ipv6_add_acaddr_hash(dev_net(idev->dev), addr);
> + if (err) {
> + fib6_info_release(f6i);
> + fib6_info_release(f6i);
Double call to fib6_info_release() ? Why ?

Unless I misunderstand, both addrconf_f6i_alloc() (indirectly through
fib6_info_alloc()) and aca_alloc() increment fib6_ref, so it seemed that,
to fully clean up and back out, we needed to decrement twice.  Please let me know
if I'm wrong here.

I'll re-submit the patch after agreement on the double call and
testing with the new changes.

Thanks!
Jeff
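
As a rough sketch of what the second item enables (the helper name is
illustrative and not taken from a posted patch): with no per-entry netns
reference, the RCU callback reduces to a plain free, so kfree_rcu() can be
used instead of call_rcu() plus a callback.

/* Sketch: drop an entry once its last user is gone; caller holds
 * acaddr_hash_lock so the hlist removal is safe against writers.
 */
static void acal_put(struct ipv6_ac_addrlist *acal)
{
	if (refcount_dec_and_test(&acal->acal_users)) {
		hlist_del_init_rcu(&acal->acal_lst);
		kfree_rcu(acal, rcu);	/* freed after a grace period */
	}
}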
On Tue, Oct 23, 2018 at 11:12 PM Eric Dumazet  wrote:
>
>
>
> On 10/23/2018 06:58 PM, Jeff Barnhill wrote:
> > icmp6_send() function is expensive on systems with a large number of
> > interfaces. Every time it’s called, it has to verify that the source
> > address does not correspond to an existing anycast address by looping
> > through every device and every anycast address on the device.  This can
> > result in significant delays for a CPU when there are a large number of
> > neighbors and ND timers are frequently timing out and calling
> > neigh_invalidate().
> >
> > Add anycast addresses to a global hashtable to allow quick searching for
> > matching anycast addresses.  This is based on inet6_addr_lst in addrconf.c.
> >
> > Signed-off-by: Jeff Barnhill <0xeff...@gmail.com>
> > ---
> >  include/net/addrconf.h |   2 +
> >  include/net/if_inet6.h |   8 +++
> >  net/ipv6/af_inet6.c|   5 ++
> >  net/ipv6/anycast.c | 132 
> > -
> >  4 files changed, 145 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/net/addrconf.h b/include/net/addrconf.h
> > index 6def0351bcc3..0cee3f99c41d 100644
> > --- a/include/net/addrconf.h
> > +++ b/include/net/addrconf.h
> > @@ -312,6 +312,8 @@ bool ipv6_chk_acast_addr(struct net *net, struct 
> > net_device *dev,
> >const struct in6_addr *addr);
> >  bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev,
> >const struct in6_addr *addr);
> > +int anycast_init(void);
> > +void anycast_cleanup(void);
> >
> >  /* Device notifier */
> >  int register_inet6addr_notifier(struct notifier_block *nb);
> > diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
> > index d7578cf49c3a..55a4a1d8cebc 100644
> > --- a/include/net/if_inet6.h
> > +++ b/include/net/if_inet6.h
> > @@ -142,6 +142,14 @@ struct ipv6_ac_socklist {
> >   struct ipv6_ac_socklist *acl_next;
> >  };
> >
> > +struct ipv6_ac_addrlist {
> > + struct in6_addr acal_addr;
> > + possible_net_t  acal_pnet;
> > + int acal_users;
>
> That would be a refcount_t acal_users; so that CONFIG_REFCOUNT_FULL brings 
> debugging for free.
>
> > + struct hlist_node   acal_lst; /* inet6_acaddr_lst */
> > + struct rcu_head rcu;
> > +};
> > +
> >  struct ifacaddr6 {
> >   struct in6_addr aca_addr;
> >   struct fib6_info*aca_rt;
> > diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
> > index 9a4261e50272..971a05fdd3bd 100644
> > --- a/net/ipv6/af_inet6.c
> > +++ b/net/ipv6/af_inet6.c
> > @@ -1001,6 +1001,9 @@ static int __init inet6_init(void)
> >   err = ip6_flowlabel_init();
> >   if (err)
> >   goto ip6_flowlabel_fail;
> > + err = anycast_init();
> > + if (err)
> > + goto anycast_fail;
> >   err = addrconf_init();
> >   if (err)
> >   goto addrconf_fail;
> > @@ -1091,6 +1094,8 @@ static int __init inet6_init(void)
> >  ipv6_exthdrs_fail:
> >   addrconf_cleanup();
> >  addrconf_fail:
> > + anycast_cleanup();
> > +anycast_fail:
> >   ip6_flowlabel_cleanup();
> >  ip6_flowlabel_fail:
> >   ndisc_late_cleanup();
> > diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
> > index 4e0ff7031edd..def1e156d857 100644
> > --- a/net/ipv6/anycast.c
> > +++ b/net/ipv6/anycast.c
> > @@ -44,8 +44,22 @@
> >
> >  #include 
> >
> > +#define IN6_ADDR_HSIZE_SHIFT 8
> > +#define IN6_ADDR_HSIZE   BIT(IN6_ADDR_HSIZE_SHIFT)
> > +/*   anycast address hash t

[PATCH net v2] net/ipv6: Add anycast addresses to a global hashtable

2018-10-23 Thread Jeff Barnhill
The icmp6_send() function is expensive on systems with a large number of
interfaces. Every time it’s called, it has to verify that the source
address does not correspond to an existing anycast address by looping
through every device and every anycast address on the device.  This can
result in significant delays for a CPU when there are a large number of
neighbors and ND timers are frequently timing out and calling
neigh_invalidate().

Add anycast addresses to a global hashtable to allow quick searching for
matching anycast addresses.  This is based on inet6_addr_lst in addrconf.c.

Signed-off-by: Jeff Barnhill <0xeff...@gmail.com>
---
 include/net/addrconf.h |   2 +
 include/net/if_inet6.h |   8 +++
 net/ipv6/af_inet6.c|   5 ++
 net/ipv6/anycast.c | 132 -
 4 files changed, 145 insertions(+), 2 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 6def0351bcc3..0cee3f99c41d 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -312,6 +312,8 @@ bool ipv6_chk_acast_addr(struct net *net, struct net_device 
*dev,
 const struct in6_addr *addr);
 bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev,
 const struct in6_addr *addr);
+int anycast_init(void);
+void anycast_cleanup(void);
 
 /* Device notifier */
 int register_inet6addr_notifier(struct notifier_block *nb);
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index d7578cf49c3a..55a4a1d8cebc 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -142,6 +142,14 @@ struct ipv6_ac_socklist {
struct ipv6_ac_socklist *acl_next;
 };
 
+struct ipv6_ac_addrlist {
+   struct in6_addr acal_addr;
+   possible_net_t  acal_pnet;
+   int acal_users;
+   struct hlist_node   acal_lst; /* inet6_acaddr_lst */
+   struct rcu_head rcu;
+};
+
 struct ifacaddr6 {
struct in6_addr aca_addr;
struct fib6_info*aca_rt;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 9a4261e50272..971a05fdd3bd 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -1001,6 +1001,9 @@ static int __init inet6_init(void)
err = ip6_flowlabel_init();
if (err)
goto ip6_flowlabel_fail;
+   err = anycast_init();
+   if (err)
+   goto anycast_fail;
err = addrconf_init();
if (err)
goto addrconf_fail;
@@ -1091,6 +1094,8 @@ static int __init inet6_init(void)
 ipv6_exthdrs_fail:
addrconf_cleanup();
 addrconf_fail:
+   anycast_cleanup();
+anycast_fail:
ip6_flowlabel_cleanup();
 ip6_flowlabel_fail:
ndisc_late_cleanup();
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 4e0ff7031edd..def1e156d857 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -44,8 +44,22 @@
 
 #include 
 
+#define IN6_ADDR_HSIZE_SHIFT   8
+#define IN6_ADDR_HSIZE BIT(IN6_ADDR_HSIZE_SHIFT)
+/* anycast address hash table
+ */
+static struct hlist_head inet6_acaddr_lst[IN6_ADDR_HSIZE];
+static DEFINE_SPINLOCK(acaddr_hash_lock);
+
 static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr 
*addr);
 
+static u32 inet6_acaddr_hash(struct net *net, const struct in6_addr *addr)
+{
+   u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net);
+
+   return hash_32(val, IN6_ADDR_HSIZE_SHIFT);
+}
+
 /*
  * socket join an anycast group
  */
@@ -204,6 +218,83 @@ void ipv6_sock_ac_close(struct sock *sk)
rtnl_unlock();
 }
 
+static struct ipv6_ac_addrlist *acal_alloc(struct net *net,
+  const struct in6_addr *addr)
+{
+   struct ipv6_ac_addrlist *acal;
+
+   acal = kzalloc(sizeof(*acal), GFP_ATOMIC);
+   if (!acal)
+   return NULL;
+
+   acal->acal_addr = *addr;
+   write_pnet(&acal->acal_pnet, get_net(net));
+   acal->acal_users = 1;
+   INIT_HLIST_NODE(&acal->acal_lst);
+
+   return acal;
+}
+
+static void acal_free_rcu(struct rcu_head *h)
+{
+   struct ipv6_ac_addrlist *acal;
+
+   acal = container_of(h, struct ipv6_ac_addrlist, rcu);
+   WARN_ON(acal->acal_users);
+   put_net(read_pnet(&acal->acal_pnet));
+   kfree(acal);
+}
+
+static int ipv6_add_acaddr_hash(struct net *net, const struct in6_addr *addr)
+{
+   unsigned int hash = inet6_acaddr_hash(net, addr);
+   struct ipv6_ac_addrlist *acal;
+   int err = 0;
+
+   spin_lock(&acaddr_hash_lock);
+   hlist_for_each_entry(acal, &inet6_acaddr_lst[hash], acal_lst) {
+   if (!net_eq(read_pnet(&acal->acal_pnet), net))
+   continue;
+   if (ipv6_addr_equal(&acal->acal_addr, addr)) {
+   acal->acal_users++;
+   goto out;
+   }
+   }
+
+   acal = acal_alloc(net, addr);
+   if (!acal) {
+   err = 

Re: [PATCH net] net/ipv6: Add anycast addresses to a global hashtable

2018-10-23 Thread Jeff Barnhill
Thanks! You are right. I misunderstood net->ifindex.  I think I need
to instead hold the net pointer in the new ipv6_ac_addrlist structure.
Do you see a problem with that?
On Mon, Oct 22, 2018 at 10:26 PM Eric Dumazet  wrote:
>
>
>
> On 10/22/2018 07:12 PM, Jeff Barnhill wrote:
> > icmp6_send() function is expensive on systems with a large number of
> > interfaces. Every time it’s called, it has to verify that the source
> > address does not correspond to an existing anycast address by looping
> > through every device and every anycast address on the device.  This can
> > result in significant delays for a CPU when there are a large number of
> > neighbors and ND timers are frequently timing out and calling
> > neigh_invalidate().
> >
> > Add anycast addresses to a global hashtable to allow quick searching for
> > matching anycast addresses.  This is based on inet6_addr_lst in addrconf.c.
> >
>
> I do not see this patch being netns aware ?
>
> Also I believe you misunderstood what was stored in net->ifindex
> You can look at dev_new_index() for what I mean.
>


[PATCH net] net/ipv6: Add anycast addresses to a global hashtable

2018-10-22 Thread Jeff Barnhill
The icmp6_send() function is expensive on systems with a large number of
interfaces. Every time it’s called, it has to verify that the source
address does not correspond to an existing anycast address by looping
through every device and every anycast address on the device.  This can
result in significant delays for a CPU when there are a large number of
neighbors and ND timers are frequently timing out and calling
neigh_invalidate().

Add anycast addresses to a global hashtable to allow quick searching for
matching anycast addresses.  This is based on inet6_addr_lst in addrconf.c.

Signed-off-by: Jeff Barnhill <0xeff...@gmail.com>
---
 include/net/addrconf.h |   2 +
 include/net/if_inet6.h |   8 
 net/ipv6/af_inet6.c|   5 ++
 net/ipv6/anycast.c | 122 -
 4 files changed, 135 insertions(+), 2 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 6def0351bcc3..0cee3f99c41d 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -312,6 +312,8 @@ bool ipv6_chk_acast_addr(struct net *net, struct net_device 
*dev,
 const struct in6_addr *addr);
 bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev,
 const struct in6_addr *addr);
+int anycast_init(void);
+void anycast_cleanup(void);
 
 /* Device notifier */
 int register_inet6addr_notifier(struct notifier_block *nb);
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index d7578cf49c3a..ac02b2cf2ba1 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -142,6 +142,14 @@ struct ipv6_ac_socklist {
struct ipv6_ac_socklist *acl_next;
 };
 
+struct ipv6_ac_addrlist {
+   struct in6_addr acal_addr;
+   int acal_ifindex; /* net */
+   int acal_users;
+   struct hlist_node   acal_lst; /* inet6_acaddr_lst */
+   struct rcu_head rcu;
+};
+
 struct ifacaddr6 {
struct in6_addr aca_addr;
struct fib6_info*aca_rt;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 9a4261e50272..971a05fdd3bd 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -1001,6 +1001,9 @@ static int __init inet6_init(void)
err = ip6_flowlabel_init();
if (err)
goto ip6_flowlabel_fail;
+   err = anycast_init();
+   if (err)
+   goto anycast_fail;
err = addrconf_init();
if (err)
goto addrconf_fail;
@@ -1091,6 +1094,8 @@ static int __init inet6_init(void)
 ipv6_exthdrs_fail:
addrconf_cleanup();
 addrconf_fail:
+   anycast_cleanup();
+anycast_fail:
ip6_flowlabel_cleanup();
 ip6_flowlabel_fail:
ndisc_late_cleanup();
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 4e0ff7031edd..58d31e0980aa 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -44,8 +44,22 @@
 
 #include 
 
+#define IN6_ADDR_HSIZE_SHIFT   8
+#define IN6_ADDR_HSIZE BIT(IN6_ADDR_HSIZE_SHIFT)
+/* anycast address hash table
+ */
+static struct hlist_head inet6_acaddr_lst[IN6_ADDR_HSIZE];
+static DEFINE_SPINLOCK(acaddr_hash_lock);
+
 static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr 
*addr);
 
+static u32 inet6_acaddr_hash(struct net *net, const struct in6_addr *addr)
+{
+   u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net);
+
+   return hash_32(val, IN6_ADDR_HSIZE_SHIFT);
+}
+
 /*
  * socket join an anycast group
  */
@@ -204,6 +218,73 @@ void ipv6_sock_ac_close(struct sock *sk)
rtnl_unlock();
 }
 
+static struct ipv6_ac_addrlist *acal_alloc(int ifindex,
+  const struct in6_addr *addr)
+{
+   struct ipv6_ac_addrlist *acal;
+
+   acal = kzalloc(sizeof(*acal), GFP_ATOMIC);
+   if (!acal)
+   return NULL;
+
+   acal->acal_addr = *addr;
+   acal->acal_ifindex = ifindex;
+   acal->acal_users = 1;
+   INIT_HLIST_NODE(&acal->acal_lst);
+
+   return acal;
+}
+
+static int ipv6_add_acaddr_hash(struct net *net, const struct in6_addr *addr)
+{
+   unsigned int hash = inet6_acaddr_hash(net, addr);
+   struct ipv6_ac_addrlist *acal;
+   int err = 0;
+
+   spin_lock(&acaddr_hash_lock);
+   hlist_for_each_entry(acal, &inet6_acaddr_lst[hash], acal_lst) {
+   if (acal->acal_ifindex != net->ifindex)
+   continue;
+   if (ipv6_addr_equal(&acal->acal_addr, addr)) {
+   acal->acal_users++;
+   goto out;
+   }
+   }
+
+   acal = acal_alloc(net->ifindex, addr);
+   if (!acal) {
+   err = -ENOMEM;
+   goto out;
+   }
+
+   hlist_add_head_rcu(&acal->acal_lst, &inet6_acaddr_lst[hash]);
+
+out:
+   spin_unlock(&acaddr_hash_lock);
+   return err;
+}
+
+static void ipv6_del_acaddr_hash(struct net *net, const struct in6_addr

icmp6_send() is too expensive

2018-10-04 Thread Jeff Barnhill
As mentioned here:
https://www.spinics.net/lists/netdev/msg505054.html

icmp6_send() can be expensive when there are a lot of devices and
anycast addresses. One solution I've prototyped is adding a global
hash table to store anycast addresses and allow more efficient
searches for them.  This works and prevents the long delays I've seen
when large numbers of neighbors are invalidated at once.

A colleague mentioned that another alternative may be to do something
like an rt6_lookup on the address and check for the RTF_ANYCAST flag.
This is much simpler, and I'm in the process of testing it now.

Is there any preference or recommendation as to which way to proceed
before I submit a patch?

Thanks,
Jeff
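
For reference, a rough sketch of the second alternative (names and the
rt6_lookup() argument list are simplified here, not taken from a posted
patch): look the source address up in the routing table and test the
RTF_ANYCAST flag on the result.

/* Sketch: is saddr an anycast address according to the FIB? */
static bool saddr_is_anycast(struct net *net, const struct in6_addr *saddr,
			     int oif)
{
	struct rt6_info *rt = rt6_lookup(net, saddr, NULL, oif, NULL, 0);
	bool anycast = false;

	if (rt) {
		anycast = !!(rt->rt6i_flags & RTF_ANYCAST);
		ip6_rt_put(rt);
	}
	return anycast;
}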


[PATCH net] net/ipv6: Display all addresses in output of /proc/net/if_inet6

2018-09-20 Thread Jeff Barnhill
The backend handling for /proc/net/if_inet6 in addrconf.c doesn't properly
handle starting/stopping the iteration.  The problem is that at some point
during the iteration, an overflow is detected and the process is
subsequently stopped.  The item being shown via seq_printf() when the
overflow occurs is not actually shown, though.  When start() is
subsequently called to resume iterating, it returns the next item, and
thus the item that was being processed when the overflow occurred never
gets printed.

Alter the meaning of the private data member "offset".  Currently, when it
is not 0 (which only happens at the very beginning), "offset" represents
the next hlist item to be printed.  After this change, "offset" always
represents the current item.

This is also consistent with the private data member "bucket", which
represents the current bucket, and also the use of "pos" as defined in
seq_file.txt:
The pos passed to start() will always be either zero, or the most
recent pos used in the previous session.

Signed-off-by: Jeff Barnhill <0xeff...@gmail.com>
---
 net/ipv6/addrconf.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index d51a8c0b3372..c63ccce6425f 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4201,7 +4201,6 @@ static struct inet6_ifaddr *if6_get_first(struct seq_file 
*seq, loff_t pos)
p++;
continue;
}
-   state->offset++;
return ifa;
}
 
@@ -4225,13 +4224,12 @@ static struct inet6_ifaddr *if6_get_next(struct 
seq_file *seq,
return ifa;
}
 
+   state->offset = 0;
while (++state->bucket < IN6_ADDR_HSIZE) {
-   state->offset = 0;
hlist_for_each_entry_rcu(ifa,
 _addr_lst[state->bucket], addr_lst) {
if (!net_eq(dev_net(ifa->idev->dev), net))
continue;
-   state->offset++;
return ifa;
}
}
-- 
2.14.1



Re: v6/sit tunnels and VRFs

2018-04-14 Thread Jeff Barnhill
I didn't see an easy way to achieve this behavior without affecting
the non-VRF routing lookups (such as deleting non-VRF rules).  We have
some automated tests that were looking for specific responses, but, of
course, those can be changed.  Among a few of my colleagues, this
became a discussion about maintaining consistent behavior between VRF
and non-VRF, such that a ping or some other tool wouldn't respond
differently.  That's the main reason I asked the question here - to
see how important this was in general use. It sounds like in your
experience, the specific error message/code hasn't been an issue.

Thanks,
Jeff


On Fri, Apr 13, 2018 at 4:31 PM, David Ahern <dsah...@gmail.com> wrote:
> On 4/13/18 2:23 PM, Jeff Barnhill wrote:
>> It seems that the ENETUNREACH response is still desirable in the VRF
>> case since the only difference (when using VRF vs. not) is that the
>> lookup should be restrained to a specific VRF.
>
> VRF is just policy routing to a table. If the table wants the lookup to
> stop, then it needs a default route. What you are referring to is the
> lookup goes through all tables and does not find an answer so it fails
> with -ENETUNREACH. I do not know of any way to make that happen with the
> existing default route options and in the past 2+ years we have not hit
> any s/w that discriminates -ENETUNREACH from -EHOSTUNREACH.
>
> I take it this is code from your internal code base. Why does it care
> between those two failures?


Re: v6/sit tunnels and VRFs

2018-04-13 Thread Jeff Barnhill
Thanks for the response, David. I'm not questioning the need to stop
the fib lookup once the end of the VRF table is reached - I agree that
is needed.  I'm concerned with the difference in the response/error
returned from the failed lookup.

For instance, with vrf "unreachable default" route, I get this:
# ip route get 1.1.1.1 vrf vrf_258
RTNETLINK answers: No route to host

Without it (and assuming no match for 1.1.1.1 in local/main/default
tables), I get this:
# ip route get 1.1.1.1 vrf vrf_258
RTNETLINK answers: Network is unreachable

Which is also what happens when not using VRFs at all.

It seems that the ENETUNREACH response is still desirable in the VRF
case since the only difference (when using VRF vs. not) is that the
lookup should be constrained to a specific VRF.

Jeff



On Thu, Apr 12, 2018 at 10:25 PM, David Ahern <dsah...@gmail.com> wrote:
> On 4/12/18 10:54 AM, Jeff Barnhill wrote:
>> Hi David,
>>
>> In the slides referenced, you recommend adding an "unreachable
>> default" route to the end of each VRF route table.  In my testing (for
>> v4) this results in a change to fib lookup failures such that instead
>> of ENETUNREACH being returned, EHOSTUNREACH is returned since the fib
>> finds the unreachable route, versus failing to find a route
>> altogether.
>>
>> Have the implications of this been considered?  I don't see a
>> clean/easy way to achieve the old behavior without affecting non-VRF
>> routing (eg. remove the unreachable route and delete the non-VRF
>> rules).  I'm guessing that programmatically, it may not make much
>> difference, ie. lookup fails, but for debugging or to a user looking
>> at it, the difference matters.  Do you (or anyone else) have any
>> thoughts on this?
>
> We have recommended moving the local table down in the FIB rules:
>
> # ip ru ls
> 1000:   from all lookup [l3mdev-table]
> 32765:  from all lookup local
> 32766:  from all lookup main
> 32767:  from all lookup default
>
> and adding a default route to VRF tables:
>
> # ip ro ls vrf red
> unreachable default  metric 4278198272
> 172.16.2.0/24  proto bgp  metric 20
> nexthop via 169.254.0.1  dev swp3 weight 1 onlink
> nexthop via 169.254.0.1  dev swp4 weight 1 onlink
>
> # ip -6 ro ls vrf red
> 2001:db8:2::/64  proto bgp  metric 20
> nexthop via fe80::202:ff:fe00:e  dev swp3 weight 1
> nexthop via fe80::202:ff:fe00:f  dev swp4 weight 1
> anycast fe80:: dev lo  proto kernel  metric 0  pref medium
> anycast fe80:: dev lo  proto kernel  metric 0  pref medium
> fe80::/64 dev swp3  proto kernel  metric 256  pref medium
> fe80::/64 dev swp4  proto kernel  metric 256  pref medium
> ff00::/8 dev swp3  metric 256  pref medium
> ff00::/8 dev swp4  metric 256  pref medium
> unreachable default dev lo  metric 4278198272  error -101 pref medium
>
> Over the last 2 years we have not seen any negative side effects to this
> and is what you want to have proper VRF separation.
>
> Without a default route lookups will proceed to the next fib rule which
> means a lookup in the next table and barring other PBR rules will be the
> main table. It will lead to wrong lookups.
>
> Here is an example:
>   ip netns add foo
>   ip netns exec foo bash
>   ip li set lo up
>   ip li add red type vrf table 123
>   ip li set red up
>   ip li add dummy1 type dummy
>   ip addr add 10.100.1.1/24 dev dummy1
>   ip li set dummy1 master red
>   ip li set dummy1 up
>   ip li add dummy2 type dummy
>   ip addr add 10.100.1.1/24 dev dummy2
>   ip li set dummy2 up
>   ip ro get 10.100.2.2
>   ip ro get 10.100.2.2 vrf red
>
> # ip ru ls
> 0:  from all lookup local
> 1000:   from all lookup [l3mdev-table]
> 32766:  from all lookup main
> 32767:  from all lookup default
>
> # ip ro ls
> 10.100.1.0/24 dev dummy2 proto kernel scope link src 10.100.1.1
> 10.100.2.0/24 via 10.100.1.2 dev dummy2
>
> # ip ro ls vrf red
> 10.100.1.0/24 dev dummy1 proto kernel scope link src 10.100.1.1
>
> That's the setup. What happens on route lookups?
> # ip ro get vrf red 10.100.2.1
> 10.100.2.1 via 10.100.1.2 dev dummy2 src 10.100.1.1 uid 0
> cache
>
> which is clearly wrong. Let's look at the lookup sequence
>
> # perf record -e fib:* ip ro get vrf red 10.100.2.1
> 10.100.2.1 via 10.100.1.2 dev dummy2 src 10.100.1.1 uid 0
> cache
> [ perf record: Woken up 1 times to write data ]
> [ perf record: Captured and wrote 0.003 MB perf.data (4 samples) ]
>
> #  perf script --fields trace:trace
> table 255 oif 2 iif 1 src 0.0.0.0 dst 10.100.2.1 tos 0 scope 0 flags 4
> table 123 oif 2 iif 1 src 0.0.0.0 dst 10.100.2.1 tos 0 scope 0 flags 4
> table 254 oif 2 iif 1 src 0.0.0.

Re: v6/sit tunnels and VRFs

2018-04-12 Thread Jeff Barnhill
Hi David,

In the slides referenced, you recommend adding an "unreachable
default" route to the end of each VRF route table.  In my testing (for
v4) this results in a change to fib lookup failures such that instead
of ENETUNREACH being returned, EHOSTUNREACH is returned since the fib
finds the unreachable route, versus failing to find a route
altogether.

Have the implications of this been considered?  I don't see a
clean/easy way to achieve the old behavior without affecting non-VRF
routing (e.g., remove the unreachable route and delete the non-VRF
rules).  I'm guessing that programmatically, it may not make much
difference, i.e., the lookup fails, but for debugging or to a user looking
at it, the difference matters.  Do you (or anyone else) have any
thoughts on this?

Thanks,
Jeff


On Sun, Oct 29, 2017 at 11:48 AM, David Ahern <dsah...@gmail.com> wrote:
> On 10/27/17 8:43 PM, Jeff Barnhill wrote:
>> ping v4 loopback...
>>
>> jeff@VM2:~$ ip route list vrf myvrf
>> 127.0.0.0/8 dev myvrf proto kernel scope link src 127.0.0.1
>> 192.168.200.0/24 via 192.168.210.3 dev enp0s8
>> 192.168.210.0/24 dev enp0s8 proto kernel scope link src 192.168.210.2
>>
>> Lookups shown in perf script were for table 255.  Is it necessary to
>> put the l3mdev table first?  If I re-order the tables, it starts
>> working:
>
> Yes, we advise moving the local table down to avoid false hits (e.g.,
> duplicate addresses like this between the default VRF and another VRF).
>
> I covered that and a few other things at OSS 2017. Latest VRF slides for
> users:
>   http://schd.ws/hosted_files/ossna2017/fe/vrf-tutorial-oss.pdf


[PATCH net] net/ipv6: Increment OUTxxx counters after netfilter hook

2018-04-05 Thread Jeff Barnhill
At the end of ip6_forward(), IPSTATS_MIB_OUTFORWDATAGRAMS and
IPSTATS_MIB_OUTOCTETS are incremented immediately before the NF_HOOK call
for NFPROTO_IPV6 / NF_INET_FORWARD.  As a result, these counters get
incremented regardless of whether or not the netfilter hook allows the
packet to continue being processed.  This change increments the counters
in ip6_forward_finish() so that it will not happen if the netfilter hook
chooses to terminate the packet, which is similar to how IPv4 works.

Signed-off-by: Jeff Barnhill <0xeff...@gmail.com>
---
 net/ipv6/ip6_output.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index b8ee50e94af3..2e891d2c30ef 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -375,6 +375,11 @@ static int ip6_forward_proxy_check(struct sk_buff *skb)
 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 struct sk_buff *skb)
 {
+   struct dst_entry *dst = skb_dst(skb);
+
+   __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
+   __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
+
return dst_output(net, sk, skb);
 }
 
@@ -569,8 +574,6 @@ int ip6_forward(struct sk_buff *skb)
 
hdr->hop_limit--;
 
-   __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
-   __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
   net, NULL, skb, skb->dev, dst->dev,
   ip6_forward_finish);
-- 
2.14.1



ip6_forward / NF_HOOK and counters

2018-03-26 Thread Jeff Barnhill
At the end of ip6_forward(), is there a good reason why
IPSTATS_MIB_OUTFORWDATAGRAMS and IPSTATS_MIB_OUTOCTETS are incremented
before the NF_HOOK?  If the hook steals or drops the packet, these
counters still go up, which seems incorrect.

v4/ip_forward() increments these counters in ip_forward_finish().  It
seems that v6 should do it in ip6_forward_finish()?  Thoughts?

Jeff


[PATCH] net: vrf: correct FRA_L3MDEV encode type

2017-11-01 Thread Jeff Barnhill
FRA_L3MDEV is defined as U8, but is being added as a U32 attribute. On
big-endian architectures, this results in the l3mdev entry not being
added to the FIB rules.

Fixes: 1aa6c4f6b8cd8 ("net: vrf: Add l3mdev rules on first device create")
Signed-off-by: Jeff Barnhill <0xeff...@gmail.com>
---
 drivers/net/vrf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 9b243e6f3008..7dc3bcac3506 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -1165,7 +1165,7 @@ static int vrf_fib_rule(const struct net_device *dev, 
__u8 family, bool add_it)
frh->family = family;
frh->action = FR_ACT_TO_TBL;
 
-   if (nla_put_u32(skb, FRA_L3MDEV, 1))
+   if (nla_put_u8(skb, FRA_L3MDEV, 1))
goto nla_put_failure;
 
if (nla_put_u32(skb, FRA_PRIORITY, FIB_RULE_PREF))
-- 
2.14.1
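
For reference, a short userspace illustration (not kernel code) of why the
u32 encoding misreads on big-endian: the 4-byte payload written by
nla_put_u32(skb, FRA_L3MDEV, 1) is 00 00 00 01 there, so reading it back
through a u8 accessor sees the first payload byte, which is 0.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t v32 = 1;
	/* analogous to nla_get_u8() on a payload that was written as a u32 */
	uint8_t first = *(uint8_t *)&v32;

	/* prints 1 on little-endian, 0 on big-endian */
	printf("first byte = %u\n", first);
	return 0;
}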



[PATCH] net: vrf: correct FRA_L3MDEV encode type

2017-10-31 Thread Jeff Barnhill
FRA_L3MDEV is defined as U8, but is being added as a U32 attribute. On
big-endian architectures, this results in the l3mdev entry not being
added to the FIB rules.

Fixes: 1aa6c4f6b8cd8 ("net: vrf: Add l3mdev rules on first device create")
---
 drivers/net/vrf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 9b243e6f3008..7dc3bcac3506 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -1165,7 +1165,7 @@ static int vrf_fib_rule(const struct net_device *dev, 
__u8 family, bool add_it)
frh->family = family;
frh->action = FR_ACT_TO_TBL;
 
-   if (nla_put_u32(skb, FRA_L3MDEV, 1))
+   if (nla_put_u8(skb, FRA_L3MDEV, 1))
goto nla_put_failure;
 
if (nla_put_u32(skb, FRA_PRIORITY, FIB_RULE_PREF))
-- 
2.14.1



Re: v6/sit tunnels and VRFs

2017-10-31 Thread Jeff Barnhill
Thanks, David.  Those slides are extremely helpful.

Also, I ran into a bug that manifested on big endian architecture:

diff --git i/drivers/net/vrf.c w/drivers/net/vrf.c
index b23bb2fae5f8..a5f984689aee 100644
--- i/drivers/net/vrf.c
+++ w/drivers/net/vrf.c
@@ -1130,7 +1130,7 @@ static int vrf_fib_rule(const struct net_device
*dev, __u8 family, bool add_it)
frh->family = family;
frh->action = FR_ACT_TO_TBL;

-   if (nla_put_u32(skb, FRA_L3MDEV, 1))
+   if (nla_put_u8(skb, FRA_L3MDEV, 1))
goto nla_put_failure;

if (nla_put_u32(skb, FRA_PRIORITY, FIB_RULE_PREF))

I was surprised that nlmsg_parse in fib_nl_newrule() didn't pick this
up, but I verified that the received value for this attribute was 0,
not 1 (w/o the patch).

Jeff



On Sun, Oct 29, 2017 at 11:48 AM, David Ahern <dsah...@gmail.com> wrote:
> On 10/27/17 8:43 PM, Jeff Barnhill wrote:
>> ping v4 loopback...
>>
>> jeff@VM2:~$ ip route list vrf myvrf
>> 127.0.0.0/8 dev myvrf proto kernel scope link src 127.0.0.1
>> 192.168.200.0/24 via 192.168.210.3 dev enp0s8
>> 192.168.210.0/24 dev enp0s8 proto kernel scope link src 192.168.210.2
>>
>> Lookups shown in perf script were for table 255.  Is it necessary to
>> put the l3mdev table first?  If I re-order the tables, it starts
>> working:
>
> Yes, we advise moving the local table down to avoid false hits (e.g.,
> duplicate addresses like this between the default VRF and another VRF).
>
> I covered that and a few other things at OSS 2017. Latest VRF slides for
> users:
>   http://schd.ws/hosted_files/ossna2017/fe/vrf-tutorial-oss.pdf


Re: v6/sit tunnels and VRFs

2017-10-27 Thread Jeff Barnhill
Your comments on the tunnel VRF and underlay VRF being different make
sense, and that approach is more flexible.  I think assigning the dev/link to the
same VRF as the tunnel master accomplishes the same thing that I
originally had in mind.

ping v4 loopback...

jeff@VM2:~$ ip route list vrf myvrf
127.0.0.0/8 dev myvrf proto kernel scope link src 127.0.0.1
192.168.200.0/24 via 192.168.210.3 dev enp0s8
192.168.210.0/24 dev enp0s8 proto kernel scope link src 192.168.210.2

Lookups shown in perf script were for table 255.  Is it necessary to
put the l3mdev table first?  If I re-order the tables, it starts
working:

jeff@VM2:~$ ip rule list
0:  from all lookup local
1000:   from all lookup [l3mdev-table]
32766:  from all lookup main
32767:  from all lookup default

jeff@VM2:~$ sudo ip rule del pref 0
jeff@VM2:~$ sudo ip rule add pref 32765 table 255

jeff@VM2:~$ ip rule list
1000:   from all lookup [l3mdev-table]
32765:  from all lookup local
32766:  from all lookup main
32767:  from all lookup default

jeff@VM2:~$ ping -c1 -I myvrf 127.0.0.1
PING 127.0.0.1 (127.0.0.1) from 127.0.0.1 myvrf: 56(84) bytes of data.
64 bytes from 127.0.0.1: icmp_seq=1 ttl=64 time=0.029 ms

--- 127.0.0.1 ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 0.029/0.029/0.029/0.000 ms


On Fri, Oct 27, 2017 at 6:53 PM, David Ahern <dsah...@gmail.com> wrote:
> On 10/27/17 2:59 PM, Jeff Barnhill wrote:
>> w/regards to this comment:
>>You have a remote address with no qualification about which VRF to
>> use for the lookup.
>>
>> I was using this to enslave the tunnel:
>> sudo ip link set jtun vrf myvrf
>>
>> and assumed this would be enough to cause all tunnel traffic to be
>> part of this VRF.
>>
>> You are right about the tunnel link/dev configuration. (re-stating to
>> be sure we are saying the same thing)  I can use either the VRF device
>> or the v4 device and the packet will get sent.  If I use the VRF
>> device, when the reply packet is received, the tunnel is found
>> successfully.  If I use the v4 device, then I need the patch you
>> provided earlier to successfully look up the tunnel.  You mentioned
>> that both should be supported...Back to my previous comment about
>> enslaving the tunnel...shouldn't a tunnel being enslaved to a VRF also
>> be enough to allow sending and receiving on any device in that VRF
>> (that is, not having to configure the link/dev on the tunnel) ?
>
> Sure, it could be done that way. I guess the question is whether we want
> to allow the tunnel device and the underlying device to be in separate VRFs.
>
> I have always taken the approach that every device with an address can
> be in a separate routing domain. Here, that means the tunnel devices can
> be in 1 VRF, and the underlying connection in a separate VRF (or no VRF).
>
>>
>> Fyi...with regards to the v4 ping, adding the loopback address/network
>> didn't work for me.  I'll see if I can get any more info on this and
>> maybe start a new thread.
>>
>> Thanks,
>> Jeff
>>
>>
>> jeff@VM2:~$ ping -I myvrf 127.0.0.1
>> ping: Warning: source address might be selected on device other than myvrf.
>> PING 127.0.0.1 (127.0.0.1) from 127.0.0.1 myvrf: 56(84) bytes of data.
>> ^C
>> --- 127.0.0.1 ping statistics ---
>> 3 packets transmitted, 0 received, 100% packet loss, time 2051ms
>>
>> jeff@VM2:~$ sudo ip addr add dev myvrf 127.0.0.1/8
>> jeff@VM2:~$ ping -I myvrf 127.0.0.1
>> PING 127.0.0.1 (127.0.0.1) from 127.0.0.1 myvrf: 56(84) bytes of data.
>> ^C
>> --- 127.0.0.1 ping statistics ---
>> 7 packets transmitted, 0 received, 100% packet loss, time 6149ms
>>
>> jeff@VM2:~$ ip addr list myvrf
>> 4: myvrf: <NOARP,MASTER,UP,LOWER_UP> mtu 65536 qdisc noqueue state UP
>> group default qlen 1000
>> link/ether 82:2b:08:8f:a9:f3 brd ff:ff:ff:ff:ff:ff
>> inet 127.0.0.1/8 scope host myvrf
>>valid_lft forever preferred_lft forever
>
> This is v4.14 kernel? Something else? The 127.0.0.1 address on vrf
> device has been supported since the v4.9 kernel.
>
> ip ro ls vrf myvrf?
>
>
> perf record -e fib:* -a -g -- ping -I myvrf -c1 -w1 127.0.0.1
> perf script
> --> does it show lookups in the correct table for this address?


Re: v6/sit tunnels and VRFs

2017-10-27 Thread Jeff Barnhill
w/regards to this comment:
   You have a remote address with no qualification about which VRF to
use for the lookup.

I was using this to enslave the tunnel:
sudo ip link set jtun vrf myvrf

and assumed this would be enough to cause all tunnel traffic to be
part of this VRF.

You are right about the tunnel link/dev configuration. (re-stating to
be sure we are saying the same thing)  I can use either the VRF device
or the v4 device and the packet will get sent.  If I use the VRF
device, when the reply packet is received, the tunnel is found
successfully.  If I use the v4 device, then I need the patch you
provided earlier to successfully look up the tunnel.  You mentioned
that both should be supported...Back to my previous comment about
enslaving the tunnel...shouldn't a tunnel being enslaved to a VRF also
be enough to allow sending and receiving on any device in that VRF
(that is, not having to configure the link/dev on the tunnel) ?

Fyi...with regards to the v4 ping, adding the loopback address/network
didn't work for me.  I'll see if I can get any more info on this and
maybe start a new thread.

Thanks,
Jeff


jeff@VM2:~$ ping -I myvrf 127.0.0.1
ping: Warning: source address might be selected on device other than myvrf.
PING 127.0.0.1 (127.0.0.1) from 127.0.0.1 myvrf: 56(84) bytes of data.
^C
--- 127.0.0.1 ping statistics ---
3 packets transmitted, 0 received, 100% packet loss, time 2051ms

jeff@VM2:~$ sudo ip addr add dev myvrf 127.0.0.1/8
jeff@VM2:~$ ping -I myvrf 127.0.0.1
PING 127.0.0.1 (127.0.0.1) from 127.0.0.1 myvrf: 56(84) bytes of data.
^C
--- 127.0.0.1 ping statistics ---
7 packets transmitted, 0 received, 100% packet loss, time 6149ms

jeff@VM2:~$ ip addr list myvrf
4: myvrf: <NOARP,MASTER,UP,LOWER_UP> mtu 65536 qdisc noqueue state UP
group default qlen 1000
link/ether 82:2b:08:8f:a9:f3 brd ff:ff:ff:ff:ff:ff
inet 127.0.0.1/8 scope host myvrf
   valid_lft forever preferred_lft forever

On Fri, Oct 27, 2017 at 12:25 PM, David Ahern <dsah...@gmail.com> wrote:
> On 10/26/17 11:19 PM, Jeff Barnhill wrote:
>> Thanks, David.
>>
>> I corrected the static route, applied the patch, and set the
>> link/output dev on the tunnel and it works now.  Is it required to set
>> the link/output dev?  I was thinking that this should not be required
>> for cases where the outgoing device is not known, for instance on a
>> router or device with multiple interfaces.
>
> In a VRF environment, addresses have to be qualified with a VRF. Running
> the command:
>
> ip tunnel add jtun mode sit remote 192.168.200.1 local 192.168.210.2
>
> You have a remote address with no qualification about which VRF to use
> for the lookup.
>
> Digging into the sit code and how the link parameter is used, the
> existing code works just fine if you use:
>
> sudo ip tunnel add jtun mode sit remote 192.168.200.1 local 192.168.210.2 dev myvrf
>
> The tunnel link parameter is myvrf and it is used for tunnel lookups and
> route lookups to forward the packet. So, really I should allow both
> cases -- dev is a VRF and dev is a link that could be in a vrf.
>
>>
>> Also, what is the expected behavior of loopback addresses in a VRF
>> context?  For instance, if an application were being run under "ip vrf
>> exec" and it tried to use these addresses.
>
> The 127.0.0.1 address needs to exist in the VRF. The one on 'lo' is in
> the default VRF only. VRF devices act as the VRF-local loopback, so you
> can put the IPv4 loopback address on it.
>
>>
>> jeff@VM2:~$ ping -I myvrf 127.0.0.1
>> PING 127.0.0.1 (127.0.0.1) from 127.0.0.1 myvrf: 56(84) bytes of data.
>> ^C
>> --- 127.0.0.1 ping statistics ---
>> 3 packets transmitted, 0 received, 100% packet loss, time 2033ms
>
> If you add the loopback address to the
> ip addr add dev myvrf 127.0.0.1/8
>
> root@kenny-jessie3:~# ip addr add dev myvrf 127.0.0.1/8
> root@kenny-jessie3:~# ping -I myvrf 127.0.0.1
> PING 127.0.0.1 (127.0.0.1) from 127.0.0.1 myvrf: 56(84) bytes of data.
> 64 bytes from 127.0.0.1: icmp_seq=1 ttl=64 time=0.045 ms
> ^C
> --- 127.0.0.1 ping statistics ---
> 1 packets transmitted, 1 received, 0% packet loss, time 0ms
> rtt min/avg/max/mdev = 0.045/0.045/0.045/0.000 ms
>
>
>>
>> jeff@VM2:~$ ping -I myvrf ::1
>> connect: Network is unreachable
>
> I need to add support for ::1/128 on a VRF device.


Re: v6/sit tunnels and VRFs

2017-10-26 Thread Jeff Barnhill
Thanks, David.

I corrected the static route, applied the patch, and set the
link/output dev on the tunnel and it works now.  Is it required to set
the link/output dev?  I was thinking that this should not be required
for cases where the outgoing device is not known, for instance on a
router or device with multiple interfaces.

Also, what is the expected behavior of loopback addresses in a VRF
context?  For instance, if an application were being run under "ip vrf
exec" and it tried to use these addresses.

jeff@VM2:~$ ping -I myvrf 127.0.0.1
PING 127.0.0.1 (127.0.0.1) from 127.0.0.1 myvrf: 56(84) bytes of data.
^C
--- 127.0.0.1 ping statistics ---
3 packets transmitted, 0 received, 100% packet loss, time 2033ms

jeff@VM2:~$ ping -I myvrf ::1
connect: Network is unreachable

Thanks,
Jeff


On Thu, Oct 26, 2017 at 1:24 PM, David Ahern <dsah...@gmail.com> wrote:
> On 10/25/17 9:28 PM, Jeff Barnhill wrote:
>> Thanks, David.
>>
>> VM1:
>> sudo ip addr add 192.168.200.1/24 dev enp0s8 broadcast 192.168.200.255
>> sudo ip link set enp0s8 up
>> sudo ip route add 192.168.210.0/24 nexthop via 192.168.200.3 dev enp0s8
>> sudo ip tunnel add jtun mode sit remote 192.168.210.2 local 192.168.200.1
>> sudo ip -6 addr add 2001::1/64 dev jtun
>> sudo ip link set jtun up
>>
>> VM2:
>> sudo ip addr add 192.168.210.2/24 dev enp0s8 broadcast 192.168.210.255
>> sudo ip link set enp0s8 up
>> sudo ip route add 192.168.200.0/24 nexthop via 192.168.210.3 dev enp0s8
>> sudo ip link add dev myvrf type vrf table 256
>> sudo ip link set myvrf up
>> sudo ip link set enp0s8 vrf myvrf
>
> You lost the static route by doing the enslaving here. When the device
> is added to or removed from a VRF it is cycled specifically to dump
> routes and neighbor entries associated with the prior vrf. Always create
> the vrf and enslave first, then add routes:
>
> sudo ip link add dev myvrf type vrf table 256
> sudo ip link set myvrf up
> sudo ip link set enp0s8 vrf myvrf
>
> sudo ip addr add 192.168.210.2/24 dev enp0s8 broadcast 192.168.210.255
> sudo ip link set enp0s8 up
> sudo ip route add 192.168.200.0/24 nexthop via 192.168.210.3 dev enp0s8
>
> That said, the above works for the wrong reason -- it is not really
> doing VRF based routing. For that to happen, the static route should be
> added to the vrf table:
>
> sudo ip route add vrf myvrf 192.168.200.0/24 nexthop via 192.168.210.3
> dev enp0s8
>
> And ...
>
>> sudo ip tunnel add jtun mode sit remote 192.168.200.1 local 192.168.210.2
>
> you need to specify the link on the tunnel create:
>
> sudo ip tunnel add jtun mode sit remote 192.168.200.1 local 192.168.210.2 dev enp0s8
>
> And ...
>
> The tunnel lookup needs to account for the VRF device switch:
>
> (whitespace damaged on paste)
>
> diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
> index a799f5258614..cf0512054fa7 100644
> --- a/net/ipv6/sit.c
> +++ b/net/ipv6/sit.c
> @@ -632,11 +632,18 @@ static bool packet_is_spoofed(struct sk_buff *skb,
>  static int ipip6_rcv(struct sk_buff *skb)
>  {
> const struct iphdr *iph = ip_hdr(skb);
> +   struct net_device *dev = skb->dev;
> +   struct net *net = dev_net(dev);
> struct ip_tunnel *tunnel;
> int err;
>
> -   tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev,
> -iph->saddr, iph->daddr);
> +   if (netif_is_l3_master(dev)) {
> +   dev = dev_get_by_index_rcu(net, IPCB(skb)->iif);
> +   if (!dev)
> +   goto out;
> +   }
> +
> +   tunnel = ipip6_tunnel_lookup(net, dev, iph->saddr, iph->daddr);
> if (tunnel) {
> struct pcpu_sw_netstats *tstats;
>


Re: v6/sit tunnels and VRFs

2017-10-25 Thread Jeff Barnhill
Thanks, David.

VM1:
sudo ip addr add 192.168.200.1/24 dev enp0s8 broadcast 192.168.200.255
sudo ip link set enp0s8 up
sudo ip route add 192.168.210.0/24 nexthop via 192.168.200.3 dev enp0s8
sudo ip tunnel add jtun mode sit remote 192.168.210.2 local 192.168.200.1
sudo ip -6 addr add 2001::1/64 dev jtun
sudo ip link set jtun up

VM2:
sudo ip addr add 192.168.210.2/24 dev enp0s8 broadcast 192.168.210.255
sudo ip link set enp0s8 up
sudo ip route add 192.168.200.0/24 nexthop via 192.168.210.3 dev enp0s8
sudo ip link add dev myvrf type vrf table 256
sudo ip link set myvrf up
sudo ip link set enp0s8 vrf myvrf
sudo ip tunnel add jtun mode sit remote 192.168.200.1 local 192.168.210.2
sudo ip link set jtun vrf myvrf
sudo ip -6 addr add 2001::2/64 dev jtun
sudo ip link set jtun up

VM3:
sudo ip addr add 192.168.200.3/24 dev enp0s8 broadcast 192.168.200.255
sudo ip addr add 192.168.210.3/24 dev enp0s9 broadcast 192.168.210.255
sudo ip link set enp0s8 up
sudo ip link set enp0s9 up
sudo sysctl net.ipv4.conf.enp0s8.forwarding=1
sudo sysctl net.ipv4.conf.enp0s9.forwarding=1

jeff@VM2:~$ ping -c 3 -I jtun 2001::1
PING 2001::1(2001::1) from 2001::2 jtun: 56 data bytes
From 2001::2 icmp_seq=1 Destination unreachable: Address unreachable
From 2001::2 icmp_seq=2 Destination unreachable: Address unreachable
From 2001::2 icmp_seq=3 Destination unreachable: Address unreachable

--- 2001::1 ping statistics ---
3 packets transmitted, 0 received, +3 errors, 100% packet loss, time 2039ms

jeff@VM2:~$ ping -c 3 -I myvrf 2001::1
ping6: Warning: source address might be selected on device other than myvrf.
PING 2001::1(2001::1) from 2001::2 myvrf: 56 data bytes
From 2001::2 icmp_seq=1 Destination unreachable: Address unreachable
From 2001::2 icmp_seq=2 Destination unreachable: Address unreachable
From 2001::2 icmp_seq=3 Destination unreachable: Address unreachable

--- 2001::1 ping statistics ---
3 packets transmitted, 0 received, +3 errors, 100% packet loss, time 2045ms

Let me know if you have any questions or if you think I've done something wrong.

Thanks,
Jeff


On Wed, Oct 25, 2017 at 5:31 PM, David Ahern <dsah...@gmail.com> wrote:
> On 10/25/17 2:45 PM, Jeff Barnhill wrote:
>> Are v6/sit tunnels working with VRFs?
>>
>> For instance, I have a very simple configuration with three VMs
>> running 4.13.0-16 (Ubuntu Server 17.10) kernels.  VM3 is setup as a
>> router for separation.  VM1 and VM2 have static routes to each other
>> via VM3.  All VMs have v4 interfaces configured.  If I setup a sit
>> tunnel with v6 addrs from V1 to V2, tunneled data flows as expected
>> (verified with ping) and can be seen via tcpdump on VM3.  However, if
>> I create a VRF on VM2 and enslave the v4 interface and tunnel to that
>> VRF, data does not leave VM2 and ping displays "Destination Host
>> Unreachable".  I did verify that basic v4 ping works between VM1 and
>> VM2 with the v4 interface on VM2 enslaved to VRF device.
>>
>> If this should work, I can provide more details with configuration commands.
>
> Please provide configuration details and I'll take a look


v6/sit tunnels and VRFs

2017-10-25 Thread Jeff Barnhill
Are v6/sit tunnels working with VRFs?

For instance, I have a very simple configuration with three VMs
running 4.13.0-16 (Ubuntu Server 17.10) kernels.  VM3 is set up as a
router for separation.  VM1 and VM2 have static routes to each other
via VM3.  All VMs have v4 interfaces configured.  If I set up a sit
tunnel with v6 addrs from VM1 to VM2, tunneled data flows as expected
(verified with ping) and can be seen via tcpdump on VM3.  However, if
I create a VRF on VM2 and enslave the v4 interface and tunnel to that
VRF, data does not leave VM2 and ping displays "Destination Host
Unreachable".  I did verify that basic v4 ping works between VM1 and
VM2 with the v4 interface on VM2 enslaved to VRF device.

If this should work, I can provide more details with configuration commands.

Thanks,
Jeff