Cc Wei Wang

On Sun, Feb 04, 2018 at 01:21:18PM +0200, Eyal Birger wrote:
> Hi,
> 
> We've encountered a non released device reference upon device
> unregistration which seems to stem from xfrm policy code.
> 
> The setup includes:
> - an underlay device (e.g. eth0) using IPv4
> - an xfrm IPv6 over IPv4 tunnel routed via the underlay device
> - an ipip6 tunnel over the xfrm IPv6 tunnel
> 
> When tearing down the underlay device, after traffic had passed via the ipip6
> tunnel, log messages of the following form are observed:
> 
> unregister_netdevice: waiting for eth0 to become free. Usage count = 2

Looks like this happened when the dst garbage collection code was
removed. I could not point to a commit that introduced it so I
did a bisection and this pointed to:

commit 9514528d92d4cbe086499322370155ed69f5d06c
ipv6: call dst_dev_put() properly

With this commit we leak the one refcount and some further commit
leaked the second one.

> 
> The below synthetic script reproduces this consistently on a fresh ubuntu vm
> running net-next v4.15-6066-ge9522a5:
> ---------------------------------------------------------
> #!/bin/bash
> 
> ipsec_underlay_dst=192.168.6.1
> ipsec_underlay_src=192.168.5.2
> ipv6_pfx=1234
> local_ipv6_addr="$ipv6_pfx::1"
> remote_ipv6_addr="$ipv6_pfx::2"
> 
> # create dummy ipsec underlay
> ip l add dev dummy1 type dummy
> ip l set dev dummy1 up
> ip r add "$ipsec_underlay_dst/32" dev dummy1
> ip -6 r add "$ipv6_pfx::/16" dev dummy1
> 
> ip a add dev dummy1 "$local_ipv6_addr/128"
> ip a add dev dummy1 "$ipsec_underlay_src/24"
> 
> # add xfrm policy and state
> ip x p add src "$local_ipv6_addr/128" dst "$ipv6_pfx::/16" dir out tmpl src 
> "$ipsec_underlay_src" dst "$ipsec_underlay_dst" proto esp reqid 1 mode tunnel
> ip x s add src "$ipsec_underlay_src" dst "$ipsec_underlay_dst" proto esp spi 
> 0xcd440ce6 reqid 1 mode tunnel auth-trunc 'hmac(sha1)' 
> 0x34a546d309031628962b814ef073aff1a638ad21 96 enc 'cbc(aes)' 
> 0xf31e14149c328297fe7925ad7448420e encap espinudp 4500 4500 0.0.0.0
> 
> # add 4o6 tunnel
> ip l add tnl46 type ip6tnl mode ipip6 local "$local_ipv6_addr" remote 
> "$remote_ipv6_addr"
> ip l set dev tnl46 up
> ip r add 10.64.0.0/10 dev tnl46 
> 
> # pass traffic so route is cached
> ping -w 1 -c 1 10.64.0.1
> 
> # remove dummy underlay
> ip l del dummy1
> ---------------------------------------------------------
> 
> Analysis:
> 
> ip6_tunnel holds a dst_cache which caches its underlay dst objects.
> When devices are unregistered, non-xfrm dst objects are invlidated by their
> original creators (ipv4/ipv6/...) and thus are wiped from dst_cache.
> 
> xfrm created routes otoh are not tracked by xfrm, and are not invalidated upon
> device unregistration, thus hold the device upon unregistration.
> 
> The following rough sketch patch illustrates an approach overcoming this
> issue:
> ---------------------------------------------------------
> >From e188dc5295e3500bc59e8780049840afa2eb3e24 Mon Sep 17 00:00:00 2001
> From: Eyal Birger <e...@metanetworks.com>
> Date: Sun, 4 Feb 2018 13:08:02 +0200
> Subject: [PATCH] net: xfrm_policy: invalidate xfrm_dsts on device
>  unregistration
> 
> ---
>  include/net/xfrm.h     | 10 ++-----
>  net/xfrm/xfrm_policy.c | 75 
> ++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 78 insertions(+), 7 deletions(-)
> 
> diff --git a/include/net/xfrm.h b/include/net/xfrm.h
> index 7d20776..c1c8493 100644
> --- a/include/net/xfrm.h
> +++ b/include/net/xfrm.h
> @@ -994,6 +994,8 @@ struct xfrm_dst {
>       u32 child_mtu_cached;
>       u32 route_cookie;
>       u32 path_cookie;
> +     struct list_head xfrm_dst_gc;
> +     struct xfrm_dst_list *xfrm_dst_gc_list;
>  };
>  
>  static inline struct dst_entry *xfrm_dst_path(const struct dst_entry *dst)
> @@ -1025,13 +1027,7 @@ static inline void xfrm_dst_set_child(struct xfrm_dst 
> *xdst, struct dst_entry *c
>       xdst->child = child;
>  }
>  
> -static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
> -{
> -     xfrm_pols_put(xdst->pols, xdst->num_pols);
> -     dst_release(xdst->route);
> -     if (likely(xdst->u.dst.xfrm))
> -             xfrm_state_put(xdst->u.dst.xfrm);
> -}
> +void xfrm_dst_destroy(struct xfrm_dst *xdst);
>  #endif
>  
>  void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev);
> diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
> index 7a23078..d446641 100644
> --- a/net/xfrm/xfrm_policy.c
> +++ b/net/xfrm/xfrm_policy.c
> @@ -94,6 +94,58 @@ __xfrm6_selector_match(const struct xfrm_selector *sel, 
> const struct flowi *fl)
>               (fl6->flowi6_oif == sel->ifindex || !sel->ifindex);
>  }
>  
> +struct xfrm_dst_list {
> +     spinlock_t              lock;
> +     struct list_head        head;
> +};
> +
> +static DEFINE_PER_CPU_ALIGNED(struct xfrm_dst_list, xfrm_dst_gc_list);
> +
> +static void xfrm_add_dst_list(struct xfrm_dst *xdst)
> +{
> +     struct xfrm_dst_list *xl = raw_cpu_ptr(&xfrm_dst_gc_list);
> +
> +     xdst->xfrm_dst_gc_list = xl;
> +
> +     spin_lock_bh(&xl->lock);
> +     list_add_tail(&xdst->xfrm_dst_gc, &xl->head);
> +     spin_unlock_bh(&xl->lock);
> +}
> +
> +void xfrm_dst_destroy(struct xfrm_dst *xdst)
> +{
> +     xfrm_pols_put(xdst->pols, xdst->num_pols);
> +     dst_release(xdst->route);
> +     if (likely(xdst->u.dst.xfrm))
> +             xfrm_state_put(xdst->u.dst.xfrm);
> +     if (!list_empty(&xdst->xfrm_dst_gc)) {
> +             struct xfrm_dst_list *xl = xdst->xfrm_dst_gc_list;
> +
> +             spin_lock_bh(&xl->lock);
> +             list_del(&xdst->xfrm_dst_gc);
> +             spin_unlock_bh(&xl->lock);
> +     }
> +}
> +EXPORT_SYMBOL_GPL(xfrm_dst_destroy);
> +
> +void xfrm_flush_dev(struct net_device *dev)
> +{
> +     struct xfrm_dst *xdst;
> +     int cpu;
> +
> +     for_each_possible_cpu(cpu) {
> +             struct xfrm_dst_list *xl = &per_cpu(xfrm_dst_gc_list, cpu);
> +
> +             spin_lock_bh(&xl->lock);
> +             list_for_each_entry(xdst, &xl->head, xfrm_dst_gc) {
> +                     if (xdst->u.dst.dev != dev)
> +                             continue;
> +                     dst_dev_put(&xdst->u.dst);
> +             }
> +             spin_unlock_bh(&xl->lock);
> +     }
> +}
> +
>  bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi 
> *fl,
>                        unsigned short family)
>  {
> @@ -1581,6 +1633,8 @@ static struct dst_entry *xfrm_bundle_create(struct 
> xfrm_policy *policy,
>               }
>  
>               bundle[i] = xdst;
> +             xfrm_add_dst_list(xdst);
> +
>               if (!xdst_prev)
>                       xdst0 = xdst;
>               else
> @@ -2984,8 +3038,22 @@ static struct pernet_operations __net_initdata 
> xfrm_net_ops = {
>       .exit = xfrm_net_exit,
>  };
>  
> +static int xfrm_netdev_event(struct notifier_block *this, unsigned long 
> event, void *ptr)
> +{
> +     struct net_device *dev = netdev_notifier_info_to_dev(ptr);
> +
> +     if (event == NETDEV_UNREGISTER)
> +             xfrm_flush_dev(dev);
> +     return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block xfrm_netdev_notifier = {
> +     .notifier_call = xfrm_netdev_event,
> +};
> +
>  void __init xfrm_init(void)
>  {
> +     int cpu;
>       int i;
>  
>       xfrm_pcpu_work = kmalloc_array(NR_CPUS, sizeof(*xfrm_pcpu_work),
> @@ -2998,6 +3066,13 @@ void __init xfrm_init(void)
>       register_pernet_subsys(&xfrm_net_ops);
>       seqcount_init(&xfrm_policy_hash_generation);
>       xfrm_input_init();
> +     for_each_possible_cpu(cpu) {
> +             struct xfrm_dst_list *xl = &per_cpu(xfrm_dst_gc_list, cpu);
> +
> +             INIT_LIST_HEAD(&xl->head);
> +             spin_lock_init(&xl->lock);
> +     }
> +     register_netdevice_notifier(&xfrm_netdev_notifier);
>  }
>  
>  #ifdef CONFIG_AUDITSYSCALL
> -- 
> 2.7.4
> 
> ---------------------------------------------------------
> 
> This approach has the unfortunate side effects of adding a spin lock for the
> tracked list, as well as increasing struct xfrm_dst.

Reintroducing garbage collection is probably not a so good idea. I think
the patch below should fix it a bit less intrusive.


Subject: [PATCH RFC] xfrm: Fix netdev refcount leak when flushing the percpu 
dst cache.

The dst garbage collection code is removed, so we need to call
dst_dev_put() on cached dst entries before we release them.
Otherwise we leak the refcount to the netdev.

Signed-off-by: Steffen Klassert <steffen.klass...@secunet.com>
---
 net/xfrm/xfrm_policy.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 7a23078132cf..7836b7601b49 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1715,8 +1715,10 @@ static int xfrm_expand_policies(const struct flowi *fl, 
u16 family,
 static void xfrm_last_dst_update(struct xfrm_dst *xdst, struct xfrm_dst *old)
 {
        this_cpu_write(xfrm_last_dst, xdst);
-       if (old)
+       if (old) {
+               dst_dev_put(&old->u.dst);
                dst_release(&old->u.dst);
+       }
 }
 
 static void __xfrm_pcpu_work_fn(void)
@@ -1787,6 +1789,7 @@ void xfrm_policy_cache_flush(void)
                old = per_cpu(xfrm_last_dst, cpu);
                if (old && !xfrm_bundle_ok(old)) {
                        per_cpu(xfrm_last_dst, cpu) = NULL;
+                       dst_dev_put(&old->u.dst);
                        dst_release(&old->u.dst);
                }
                rcu_read_unlock();
-- 
2.14.1

Reply via email to