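After the patch 'ipv6: Only create RTF_CACHE routes after encountering pmtu exception', we need to compensate for the performance hit (bouncing dst->__refcnt). This patch does so by keeping a per-cpu copy of each route (rt6i_pcpu, flagged RTF_PCPU) and returning that copy from ip6_pol_route() instead of taking a reference on the shared rt6_info.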
Signed-off-by: Martin KaFai Lau <ka...@fb.com> Reviewed-by: Hannes Frederic Sowa <han...@stressinduktion.org> Cc: Steffen Klassert <steffen.klass...@secunet.com> --- include/net/ip6_fib.h | 8 +++ include/net/ip6_route.h | 2 +- include/uapi/linux/ipv6_route.h | 1 + net/ipv6/ip6_fib.c | 25 ++++++- net/ipv6/ip6_tunnel.c | 2 +- net/ipv6/route.c | 150 +++++++++++++++++++++++++++++++++++----- net/ipv6/tcp_ipv6.c | 3 +- net/ipv6/xfrm6_policy.c | 6 +- net/netfilter/ipvs/ip_vs_xmit.c | 2 +- net/sctp/ipv6.c | 2 +- 10 files changed, 171 insertions(+), 30 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index e000180..4b1fc9b 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -121,6 +121,7 @@ struct rt6_info { struct rt6key rt6i_prefsrc; struct inet6_dev *rt6i_idev; + struct rt6_info __rcu * __percpu *rt6i_pcpu; u32 rt6i_metric; u32 rt6i_pmtu; @@ -159,6 +160,13 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) rt0->rt6i_flags |= RTF_EXPIRES; } +static inline u32 rt6_get_cookie(const struct rt6_info *rt) +{ + if (rt->rt6i_flags & RTF_PCPU) + rt = (struct rt6_info *)(rt->dst.from); + return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; +} + static inline void ip6_rt_put(struct rt6_info *rt) { /* dst_release() accepts a NULL parameter. diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 0e4d170..397dd3a 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -145,7 +145,7 @@ static inline void __ip6_dst_store(struct sock *sk, struct dst_entry *dst, #ifdef CONFIG_IPV6_SUBTREES np->saddr_cache = saddr; #endif - np->dst_cookie = rt->rt6i_node ? 
rt->rt6i_node->fn_sernum : 0; + np->dst_cookie = rt6_get_cookie(rt); } static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h index 2be7bd1..f6598d1 100644 --- a/include/uapi/linux/ipv6_route.h +++ b/include/uapi/linux/ipv6_route.h @@ -34,6 +34,7 @@ #define RTF_PREF(pref) ((pref) << 27) #define RTF_PREF_MASK 0x18000000 +#define RTF_PCPU 0x40000000 #define RTF_LOCAL 0x80000000 diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 96dbfff..bf12be7 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -154,10 +154,33 @@ static void node_free(struct fib6_node *fn) kmem_cache_free(fib6_node_kmem, fn); } +static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) +{ + int cpu; + + if (!non_pcpu_rt->rt6i_pcpu) + return; + + for_each_possible_cpu(cpu) { + struct rt6_info **ppcpu_rt; + struct rt6_info *pcpu_rt; + + ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu); + pcpu_rt = rcu_dereference_protected(*ppcpu_rt, + lockdep_is_held(&non_pcpu_rt->rt6i_table->tb6_lock)); + if (pcpu_rt) { + dst_free(&pcpu_rt->dst); + *ppcpu_rt = NULL; + } + } +} + static void rt6_release(struct rt6_info *rt) { - if (atomic_dec_and_test(&rt->rt6i_ref)) + if (atomic_dec_and_test(&rt->rt6i_ref)) { + rt6_free_pcpu(rt); dst_free(&rt->dst); + } } static void fib6_link_table(struct net *net, struct fib6_table *tb) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 5cafd92..2e67b66 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -151,7 +151,7 @@ EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *) dst; - t->dst_cookie = rt->rt6i_node ? 
rt->rt6i_node->fn_sernum : 0; + t->dst_cookie = rt6_get_cookie(rt); dst_release(t->dst_cache); t->dst_cache = dst; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index cb3c585..29227a0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -108,11 +108,18 @@ static struct rt6_info *rt6_get_route_info(struct net *net, const struct in6_addr *gwaddr, int ifindex); #endif +static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt) +{ + return dst_metrics_write_ptr(rt->dst.from); +} + static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old) { struct rt6_info *rt = (struct rt6_info *)dst; - if (rt->rt6i_flags & RTF_CACHE) + if (rt->rt6i_flags & RTF_PCPU) + return rt6_pcpu_cow_metrics(rt); + else if (rt->rt6i_flags & RTF_CACHE) return NULL; else return dst_cow_metrics_generic(dst, old); @@ -252,10 +259,10 @@ static const struct rt6_info ip6_blk_hole_entry_template = { #endif /* allocate dst with ip6_dst_ops */ -static inline struct rt6_info *ip6_dst_alloc(struct net *net, - struct net_device *dev, - int flags, - struct fib6_table *table) +static struct rt6_info *__ip6_dst_alloc(struct net *net, + struct net_device *dev, + int flags, + struct fib6_table *table) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0, DST_OBSOLETE_FORCE_CHK, flags); @@ -269,6 +276,34 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net, return rt; } +static struct rt6_info *ip6_dst_alloc(struct net *net, + struct net_device *dev, + int flags, + struct fib6_table *table) +{ + struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table); + + if (rt) { + rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC); + if (rt->rt6i_pcpu) { + int cpu; + + for_each_possible_cpu(cpu) { + struct rt6_info **p; + + p = per_cpu_ptr(rt->rt6i_pcpu, cpu); + /* no one shares rt */ + *p = NULL; + } + } else { + dst_destroy((struct dst_entry *)rt); + return NULL; + } + } + + return rt; +} + static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = 
(struct rt6_info *)dst; @@ -277,6 +312,9 @@ static void ip6_dst_destroy(struct dst_entry *dst) dst_destroy_metrics_generic(dst); + if (rt->rt6i_pcpu) + free_percpu(rt->rt6i_pcpu); + if (idev) { rt->rt6i_idev = NULL; in6_dev_put(idev); @@ -870,11 +908,69 @@ static struct rt6_info *ip6_pmtu_rt_cache_alloc(struct rt6_info *ort, return rt; } +static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) +{ + struct rt6_info *pcpu_rt; + + pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev), + rt->dst.dev, rt->dst.flags, + rt->rt6i_table); + + if (!pcpu_rt) + return NULL; + ip6_rt_copy_init(pcpu_rt, rt, NULL); + pcpu_rt->rt6i_protocol = rt->rt6i_protocol; + pcpu_rt->rt6i_flags |= RTF_PCPU; + return pcpu_rt; +} + +static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) +{ + struct rt6_info *pcpu_rt, *orig, *prev, **p; + struct net *net = dev_net(rt->dst.dev); + + if (rt == net->ipv6.ip6_null_entry || rt->rt6i_flags & RTF_CACHE) + goto done; + + rcu_read_lock(); + p = raw_cpu_ptr(rt->rt6i_pcpu); + orig = rcu_dereference_check(*p, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + if (orig) { + rt6_dst_from_metrics_check(orig); + dst_hold(&orig->dst); + rcu_read_unlock(); + return orig; + } + rcu_read_unlock(); + + pcpu_rt = ip6_rt_pcpu_alloc(rt); + if (!pcpu_rt) { + rt = net->ipv6.ip6_null_entry; + goto done; + } + dst_hold(&pcpu_rt->dst); + + prev = cmpxchg(p, orig, pcpu_rt); + if (prev == orig) { + if (orig) + call_rcu(&orig->dst.rcu_head, dst_rcu_free); + } else { + pcpu_rt->dst.flags |= DST_NOCACHE; + } + return pcpu_rt; + +done: + rt6_dst_from_metrics_check(rt); + dst_hold(&rt->dst); + return rt; +} + static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { struct fib6_node *fn, *saved_fn; - struct rt6_info *rt; + struct rt6_info *rt, *pcpu_rt; int strict = 0; strict |= flags & RT6_LOOKUP_F_IFACE; @@ -902,14 +998,13 @@ redo_rt6_select: } } - dst_hold(&rt->dst); + pcpu_rt = rt6_get_pcpu_route(rt); 
read_unlock_bh(&table->tb6_lock); - rt6_dst_from_metrics_check(rt); rt->dst.lastuse = jiffies; rt->dst.__use++; - return rt; + return pcpu_rt; } static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, @@ -1020,6 +1115,26 @@ static void rt6_dst_from_metrics_check(struct rt6_info *rt) dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true); } +static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) +{ + if (!rt->rt6i_node || rt->rt6i_node->fn_sernum != cookie) + return NULL; + + if (rt6_check_expired(rt)) + return NULL; + + return &rt->dst; +} + +static struct dst_entry *rt6_pcpu_check(struct rt6_info *rt, u32 cookie) +{ + if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && + rt6_check((struct rt6_info *)(rt->dst.from), cookie)) + return &rt->dst; + else + return NULL; +} + static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) { struct rt6_info *rt; @@ -1030,15 +1145,12 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. 
*/ - if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) - return NULL; - - if (rt6_check_expired(rt)) - return NULL; - rt6_dst_from_metrics_check(rt); - return dst; + if (rt->rt6i_flags & RTF_PCPU) + return rt6_pcpu_check(rt, cookie); + else + return rt6_check(rt, cookie); } static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) @@ -1943,11 +2055,11 @@ static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort, { struct rt6_info *rt; - if (ort->rt6i_flags & RTF_CACHE) + if (ort->rt6i_flags & (RTF_PCPU | RTF_CACHE)) ort = (struct rt6_info *)ort->dst.from; - rt = ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, - 0, ort->rt6i_table); + rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, + 0, ort->rt6i_table); if (!rt) return NULL; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 042a645..b416305 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -99,8 +99,7 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) dst_hold(dst); sk->sk_rx_dst = dst; inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; - if (rt->rt6i_node) - inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; + inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt); } } diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 6ae256b..ed0583c 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -76,8 +76,7 @@ static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, { if (dst->ops->family == AF_INET6) { struct rt6_info *rt = (struct rt6_info *)dst; - if (rt->rt6i_node) - path->path_cookie = rt->rt6i_node->fn_sernum; + path->path_cookie = rt6_get_cookie(rt); } path->u.rt6.rt6i_nfheader_len = nfheader_len; @@ -105,8 +104,7 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, RTF_LOCAL); xdst->u.rt6.rt6i_metric = rt->rt6i_metric; xdst->u.rt6.rt6i_node = rt->rt6i_node; - if (rt->rt6i_node) - xdst->route_cookie = rt->rt6i_node->fn_sernum; + xdst->route_cookie = 
rt6_get_cookie(rt); xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway; xdst->u.rt6.rt6i_dst = rt->rt6i_dst; xdst->u.rt6.rt6i_src = rt->rt6i_src; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 38f8627..5eff9f6 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -435,7 +435,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, goto err_unreach; } rt = (struct rt6_info *) dst; - cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; + cookie = rt6_get_cookie(rt); __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 9fa13f6..d012834 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -331,7 +331,7 @@ out: rt = (struct rt6_info *)dst; t->dst = dst; - t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; + t->dst_cookie = rt6_get_cookie(rt); pr_debug("rt6_dst:%pI6/%d rt6_src:%pI6\n", &rt->rt6i_dst.addr, rt->rt6i_dst.plen, &fl6->saddr); -- 1.8.1 -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html