From: David Ahern <dsah...@gmail.com>

Another difference between IPv4 and IPv6 is the generation of RTM_DELROUTE
notifications when a device is taken down (admin down) or deleted. IPv4
does not generate a message for routes evicted by the down or delete;
IPv6 does. A NOS at scale really needs to avoid these messages and have
IPv4 and IPv6 behave similarly, relying on userspace to handle link
notifications and evict the routes.

At this point existing user behavior needs to be preserved. Since
notifications are a global action (not per app) the only way to preserve
existing behavior and allow the messages to be skipped is to add a new
sysctl (net/ipv6/route/skip_notify_on_dev_down) which can be set to
disable the notifications.

IPv6 route code already supports the option to skip the message (it is
used for multipath routes for example). Besides the new sysctl we need
to pass the skip_notify setting through the generic fib6_clean and
fib6_walk functions to fib6_clean_node and to set skip_notify on calls
to __ip6_del_rt for the addrconf_ifdown path.

Signed-off-by: David Ahern <dsah...@gmail.com>
---
 Documentation/networking/ip-sysctl.txt |  8 +++++++
 include/net/addrconf.h                 |  3 ++-
 include/net/ip6_fib.h                  |  3 +++
 include/net/ip6_route.h                |  1 +
 include/net/netns/ipv6.h               |  1 +
 net/ipv6/addrconf.c                    | 44 ++++++++++++++++++++++------------
 net/ipv6/anycast.c                     | 10 +++++---
 net/ipv6/ip6_fib.c                     | 20 ++++++++++++----
 net/ipv6/route.c                       | 30 ++++++++++++++++++++++-
 9 files changed, 95 insertions(+), 25 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index 960de8fe3f40..163b5ff1073c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1442,6 +1442,14 @@ max_hbh_length - INTEGER
        header.
        Default: INT_MAX (unlimited)
 
+skip_notify_on_dev_down - BOOLEAN
+       Controls whether an RTM_DELROUTE message is generated for routes
+       removed when a device is taken down or deleted. IPv4 does not
+       generate this message; IPv6 does by default. Setting this sysctl
+       to true skips the message, making IPv4 and IPv6 on par in relying
+       on userspace caches to track link events and evict routes.
+       Default: false (generate message)
+
 IPv6 Fragmentation:
 
 ip6frag_high_thresh - INTEGER
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 6def0351bcc3..ee6292f64c86 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -306,7 +306,8 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex,
 void ipv6_sock_ac_close(struct sock *sk);
 
 int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr);
-int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr);
+int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr,
+                     bool skip_notify);
 void ipv6_ac_destroy_dev(struct inet6_dev *idev);
 bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
                         const struct in6_addr *addr);
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index f06e968f1992..caabfd84a098 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -407,6 +407,9 @@ struct fib6_node *fib6_locate(struct fib6_node *root,
 
 void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void 
*arg),
                    void *arg);
+void fib6_clean_all_skip_notify(struct net *net,
+                               int (*func)(struct fib6_info *, void *arg),
+                               void *arg);
 
 int fib6_add(struct fib6_node *root, struct fib6_info *rt,
             struct nl_info *info, struct netlink_ext_ack *extack);
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index cef186dbd2ce..7c140cb2eeb0 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -104,6 +104,7 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
                  struct netlink_ext_ack *extack);
 int ip6_ins_rt(struct net *net, struct fib6_info *f6i);
 int ip6_del_rt(struct net *net, struct fib6_info *f6i);
+int ip6_del_rt_skip_notify(struct net *net, struct fib6_info *f6i);
 
 void rt6_flush_exceptions(struct fib6_info *f6i);
 void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args,
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index f0e396ab9bec..ef1ed529f33c 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -45,6 +45,7 @@ struct netns_sysctl_ipv6 {
        int max_dst_opts_len;
        int max_hbh_opts_len;
        int seg6_flowlabel;
+       int skip_notify_on_dev_down;    /* int, not bool: sysctl handler stores sizeof(int) bytes */
 };
 
 struct netns_ipv6 {
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 2496b12bf721..cf591cf66884 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -164,7 +164,7 @@ static struct workqueue_struct *addrconf_wq;
 static DECLARE_DELAYED_WORK(addr_chk_work, addrconf_verify_work);
 
 static void addrconf_join_anycast(struct inet6_ifaddr *ifp);
-static void addrconf_leave_anycast(struct inet6_ifaddr *ifp);
+static void addrconf_leave_anycast(struct inet6_ifaddr *ifp, bool skip_notify);
 
 static void addrconf_type_change(struct net_device *dev,
                                 unsigned long event);
@@ -181,7 +181,8 @@ static void addrconf_dad_completed(struct inet6_ifaddr 
*ifp, bool bump_id,
                                   bool send_na);
 static void addrconf_dad_run(struct inet6_dev *idev);
 static void addrconf_rs_timer(struct timer_list *t);
-static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
+static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa,
+                             bool skip_notify);
 static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
 
 static void inet6_prefix_notify(int event, struct inet6_dev *idev,
@@ -779,7 +780,7 @@ static void dev_forward_change(struct inet6_dev *idev)
                if (idev->cnf.forwarding)
                        addrconf_join_anycast(ifa);
                else
-                       addrconf_leave_anycast(ifa);
+                       addrconf_leave_anycast(ifa, false);
        }
        inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
                                     NETCONFA_FORWARDING,
@@ -2141,7 +2142,7 @@ static void addrconf_join_anycast(struct inet6_ifaddr 
*ifp)
 }
 
 /* caller must hold RTNL */
-static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
+static void addrconf_leave_anycast(struct inet6_ifaddr *ifp, bool skip_notify)
 {
        struct in6_addr addr;
 
@@ -2150,7 +2151,7 @@ static void addrconf_leave_anycast(struct inet6_ifaddr 
*ifp)
        ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
        if (ipv6_addr_any(&addr))
                return;
-       __ipv6_dev_ac_dec(ifp->idev, &addr);
+       __ipv6_dev_ac_dec(ifp->idev, &addr, skip_notify);
 }
 
 static int addrconf_ifid_6lowpan(u8 *eui, struct net_device *dev)
@@ -3655,6 +3656,7 @@ static int addrconf_ifdown(struct net_device *dev, int 
how)
 {
        unsigned long event = how ? NETDEV_UNREGISTER : NETDEV_DOWN;
        struct net *net = dev_net(dev);
+       bool skip_notify = net->ipv6.sysctl.skip_notify_on_dev_down;
        struct inet6_dev *idev;
        struct inet6_ifaddr *ifa, *tmp;
        bool keep_addr = false;
@@ -3772,15 +3774,19 @@ static int addrconf_ifdown(struct net_device *dev, int 
how)
 
                spin_unlock_bh(&ifa->lock);
 
-               if (rt)
-                       ip6_del_rt(net, rt);
+               if (rt) {
+                       if (skip_notify)
+                               ip6_del_rt_skip_notify(net, rt);
+                       else
+                               ip6_del_rt(net, rt);
+               }
 
                if (state != INET6_IFADDR_STATE_DEAD) {
-                       __ipv6_ifa_notify(RTM_DELADDR, ifa);
+                       __ipv6_ifa_notify(RTM_DELADDR, ifa, skip_notify);
                        inet6addr_notifier_call_chain(NETDEV_DOWN, ifa);
                } else {
                        if (idev->cnf.forwarding)
-                               addrconf_leave_anycast(ifa);
+                               addrconf_leave_anycast(ifa, skip_notify);
                        addrconf_leave_solict(ifa->idev, &ifa->addr);
                }
 
@@ -5830,7 +5836,8 @@ static void inet6_prefix_notify(int event, struct 
inet6_dev *idev,
                rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err);
 }
 
-static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
+static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp,
+                             bool skip_notify)
 {
        struct net *net = dev_net(ifp->idev->dev);
 
@@ -5858,18 +5865,25 @@ static void __ipv6_ifa_notify(int event, struct 
inet6_ifaddr *ifp)
                break;
        case RTM_DELADDR:
                if (ifp->idev->cnf.forwarding)
-                       addrconf_leave_anycast(ifp);
+                       addrconf_leave_anycast(ifp, skip_notify);
                addrconf_leave_solict(ifp->idev, &ifp->addr);
                if (!ipv6_addr_any(&ifp->peer_addr)) {
                        struct fib6_info *rt;
 
                        rt = addrconf_get_prefix_route(&ifp->peer_addr, 128,
                                                       ifp->idev->dev, 0, 0);
-                       if (rt)
-                               ip6_del_rt(net, rt);
+                       if (rt) {
+                               if (skip_notify)
+                                       ip6_del_rt_skip_notify(net, rt);
+                               else
+                                       ip6_del_rt(net, rt);
+                       }
                }
                if (ifp->rt) {
-                       ip6_del_rt(net, ifp->rt);
+                       if (skip_notify)
+                               ip6_del_rt_skip_notify(net, ifp->rt);
+                       else
+                               ip6_del_rt(net, ifp->rt);
                        ifp->rt = NULL;
                }
                rt_genid_bump_ipv6(net);
@@ -5882,7 +5896,7 @@ static void ipv6_ifa_notify(int event, struct 
inet6_ifaddr *ifp)
 {
        rcu_read_lock_bh();
        if (likely(ifp->idev->dead == 0))
-               __ipv6_ifa_notify(event, ifp);
+               __ipv6_ifa_notify(event, ifp, false);
        rcu_read_unlock_bh();
 }
 
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 4e0ff7031edd..762fb1de58a8 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -299,7 +299,8 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct 
in6_addr *addr)
 /*
  *     device anycast group decrement
  */
-int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr)
+int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr,
+                     bool skip_notify)
 {
        struct ifacaddr6 *aca, *prev_aca;
 
@@ -327,7 +328,10 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct 
in6_addr *addr)
        write_unlock_bh(&idev->lock);
        addrconf_leave_solict(idev, &aca->aca_addr);
 
-       ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
+       if (skip_notify)
+               ip6_del_rt_skip_notify(dev_net(idev->dev), aca->aca_rt);
+       else
+               ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
 
        aca_put(aca);
        return 0;
@@ -340,7 +344,7 @@ static int ipv6_dev_ac_dec(struct net_device *dev, const 
struct in6_addr *addr)
 
        if (!idev)
                return -ENODEV;
-       return __ipv6_dev_ac_dec(idev, addr);
+       return __ipv6_dev_ac_dec(idev, addr, false);
 }
 
 void ipv6_ac_destroy_dev(struct inet6_dev *idev)
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index e14d244c551f..9ba72d94d60f 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -47,6 +47,7 @@ struct fib6_cleaner {
        int (*func)(struct fib6_info *, void *arg);
        int sernum;
        void *arg;
+       bool skip_notify;
 };
 
 #ifdef CONFIG_IPV6_SUBTREES
@@ -1956,6 +1957,7 @@ static int fib6_clean_node(struct fib6_walker *w)
        struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
        struct nl_info info = {
                .nl_net = c->net,
+               .skip_notify = c->skip_notify,
        };
 
        if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
@@ -2007,7 +2009,7 @@ static int fib6_clean_node(struct fib6_walker *w)
 
 static void fib6_clean_tree(struct net *net, struct fib6_node *root,
                            int (*func)(struct fib6_info *, void *arg),
-                           int sernum, void *arg)
+                           int sernum, void *arg, bool skip_notify)
 {
        struct fib6_cleaner c;
 
@@ -2019,13 +2021,14 @@ static void fib6_clean_tree(struct net *net, struct 
fib6_node *root,
        c.sernum = sernum;
        c.arg = arg;
        c.net = net;
+       c.skip_notify = skip_notify;
 
        fib6_walk(net, &c.w);
 }
 
 static void __fib6_clean_all(struct net *net,
                             int (*func)(struct fib6_info *, void *),
-                            int sernum, void *arg)
+                            int sernum, void *arg, bool skip_notify)
 {
        struct fib6_table *table;
        struct hlist_head *head;
@@ -2037,7 +2040,7 @@ static void __fib6_clean_all(struct net *net,
                hlist_for_each_entry_rcu(table, head, tb6_hlist) {
                        spin_lock_bh(&table->tb6_lock);
                        fib6_clean_tree(net, &table->tb6_root,
-                                       func, sernum, arg);
+                                       func, sernum, arg, skip_notify);
                        spin_unlock_bh(&table->tb6_lock);
                }
        }
@@ -2047,14 +2050,21 @@ static void __fib6_clean_all(struct net *net,
 void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
                    void *arg)
 {
-       __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg);
+       __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false);
+}
+
+void fib6_clean_all_skip_notify(struct net *net,
+                               int (*func)(struct fib6_info *, void *),
+                               void *arg)
+{
+       __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true);
 }
 
 static void fib6_flush_trees(struct net *net)
 {
        int new_sernum = fib6_new_sernum(net);
 
-       __fib6_clean_all(net, NULL, new_sernum, NULL);
+       __fib6_clean_all(net, NULL, new_sernum, NULL, false);
 }
 
 /*
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7c38e0e058ae..de161808c540 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3143,6 +3143,16 @@ static int __ip6_del_rt(struct fib6_info *rt, struct 
nl_info *info)
        return err;
 }
 
+int ip6_del_rt_skip_notify(struct net *net, struct fib6_info *rt)
+{
+       struct nl_info info = {
+               .nl_net = net,
+               .skip_notify = true,
+       };
+
+       return __ip6_del_rt(rt, &info);
+}
+
 int ip6_del_rt(struct net *net, struct fib6_info *rt)
 {
        struct nl_info info = { .nl_net = net };
@@ -4026,8 +4036,12 @@ void rt6_sync_down_dev(struct net_device *dev, unsigned 
long event)
                        .event = event,
                },
        };
+       struct net *net = dev_net(dev);
 
-       fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
+       if (net->ipv6.sysctl.skip_notify_on_dev_down)
+               fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
+       else
+               fib6_clean_all(net, fib6_ifdown, &arg);
 }
 
 void rt6_disable_ip(struct net_device *dev, unsigned long event)
@@ -5031,6 +5045,9 @@ int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int 
write,
        return 0;
 }
 
+static int zero;
+static int one = 1;
+
 struct ctl_table ipv6_route_table_template[] = {
        {
                .procname       =       "flush",
@@ -5102,6 +5119,15 @@ struct ctl_table ipv6_route_table_template[] = {
                .mode           =       0644,
                .proc_handler   =       proc_dointvec_ms_jiffies,
        },
+       {
+               .procname       =       "skip_notify_on_dev_down",
+               .data           =       &init_net.ipv6.sysctl.skip_notify_on_dev_down,
+               .maxlen         =       sizeof(int),
+               .mode           =       0644,
+               .proc_handler   =       proc_dointvec_minmax,
+               .extra1         =       &zero,
+               .extra2         =       &one,   /* 0..1 bounds are honoured only by the _minmax handler */
+       },
        { }
 };
 
@@ -5125,6 +5151,7 @@ struct ctl_table * __net_init 
ipv6_route_sysctl_init(struct net *net)
                table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
                table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
                table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
+               table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
 
                /* Don't export sysctls to unprivileged users */
                if (net->user_ns != &init_user_ns)
@@ -5189,6 +5216,7 @@ static int __net_init ip6_route_net_init(struct net *net)
        net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
        net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
        net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
+       net->ipv6.sysctl.skip_notify_on_dev_down = 0;
 
        net->ipv6.ip6_rt_gc_expire = 30*HZ;
 
-- 
2.11.0

Reply via email to