From: Shaohua Li <s...@fb.com> In a syn flooding test, the fib6_table rwlock is a significant bottleneck. While converting the rwlock to RCU sounds straightforward, it is very challenging, if possible at all. A percpu spinlock is quite trivial for this problem since updating the routing table is a rare event. In my test, the server receives around 1.5 Mpps in a syn flooding test without the patch on a dual-socket, 56-CPU system. With the patch, the server receives around 3.8 Mpps, and perf report doesn't show the locking issue.
Cc: Wei Wang <wei...@google.com> Signed-off-by: Shaohua Li <s...@fb.com> --- include/net/ip6_fib.h | 51 +++++++++++++++++++++++++++++++++- net/ipv6/addrconf.c | 8 +++--- net/ipv6/ip6_fib.c | 76 ++++++++++++++++++++++++++++----------------------- net/ipv6/route.c | 54 ++++++++++++++++++------------------ 4 files changed, 123 insertions(+), 66 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 1a88008..3c000ce 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -229,13 +229,62 @@ struct rt6_statistics { struct fib6_table { struct hlist_node tb6_hlist; u32 tb6_id; - rwlock_t tb6_lock; + spinlock_t __percpu *percpu_tb6_lock; struct fib6_node tb6_root; struct inet_peer_base tb6_peers; unsigned int flags; #define RT6_TABLE_HAS_DFLT_ROUTER BIT(0) }; +static inline void fib6_table_read_lock_bh(struct fib6_table *table) +{ + preempt_disable(); + spin_lock_bh(this_cpu_ptr(table->percpu_tb6_lock)); +} + +static inline void fib6_table_read_unlock_bh(struct fib6_table *table) +{ + spin_unlock_bh(this_cpu_ptr(table->percpu_tb6_lock)); + preempt_enable(); +} + +static inline void fib6_table_read_lock(struct fib6_table *table) +{ + preempt_disable(); + spin_lock(this_cpu_ptr(table->percpu_tb6_lock)); +} + +static inline void fib6_table_read_unlock(struct fib6_table *table) +{ + spin_unlock(this_cpu_ptr(table->percpu_tb6_lock)); + preempt_enable(); +} + +static inline void fib6_table_write_lock_bh(struct fib6_table *table) +{ + int i; + + spin_lock_bh(per_cpu_ptr(table->percpu_tb6_lock, 0)); + for_each_possible_cpu(i) { + if (i == 0) + continue; + spin_lock_nest_lock(per_cpu_ptr(table->percpu_tb6_lock, i), + per_cpu_ptr(table->percpu_tb6_lock, 0)); + } +} + +static inline void fib6_table_write_unlock_bh(struct fib6_table *table) +{ + int i; + + for_each_possible_cpu(i) { + if (i == 0) + continue; + spin_unlock(per_cpu_ptr(table->percpu_tb6_lock, i)); + } + spin_unlock_bh(per_cpu_ptr(table->percpu_tb6_lock, 0)); +} + #define RT6_TABLE_UNSPEC 
RT_TABLE_UNSPEC #define RT6_TABLE_MAIN RT_TABLE_MAIN #define RT6_TABLE_DFLT RT6_TABLE_MAIN diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3c46e95..428512b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2313,7 +2313,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, if (!table) return NULL; - read_lock_bh(&table->tb6_lock); + fib6_table_read_lock_bh(table); fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0); if (!fn) goto out; @@ -2330,7 +2330,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, break; } out: - read_unlock_bh(&table->tb6_lock); + fib6_table_read_unlock_bh(table); return rt; } @@ -5929,7 +5929,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) struct fib6_table *table = rt->rt6i_table; int cpu; - read_lock(&table->tb6_lock); + fib6_table_read_lock(table); addrconf_set_nopolicy(ifa->rt, val); if (rt->rt6i_pcpu) { for_each_possible_cpu(cpu) { @@ -5939,7 +5939,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) addrconf_set_nopolicy(*rtp, val); } } - read_unlock(&table->tb6_lock); + fib6_table_read_unlock(table); } spin_unlock(&ifa->lock); } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index ebb299c..16ee1cc 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -194,8 +194,16 @@ static void fib6_link_table(struct net *net, struct fib6_table *tb) * Initialize table lock at a single place to give lockdep a key, * tables aren't visible prior to being linked to the list. 
*/ - rwlock_init(&tb->tb6_lock); - + for_each_possible_cpu(h) { + /* + * make sure the first lock and other locks have different + * lockdep map, so we can treat the first lock as nested lock + */ + if (h == 0) + spin_lock_init(per_cpu_ptr(tb->percpu_tb6_lock, h)); + else + spin_lock_init(per_cpu_ptr(tb->percpu_tb6_lock, h)); + } h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* @@ -205,23 +213,34 @@ static void fib6_link_table(struct net *net, struct fib6_table *tb) hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); } -#ifdef CONFIG_IPV6_MULTIPLE_TABLES - -static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) +static struct fib6_table *fib6_alloc_table(struct net *net, u32 id, gfp_t gfp) { struct fib6_table *table; - table = kzalloc(sizeof(*table), GFP_ATOMIC); - if (table) { + table = kzalloc(sizeof(*table), gfp); + if (!table) + return NULL; + table->percpu_tb6_lock = alloc_percpu_gfp(struct spinlock, gfp); + if (table->percpu_tb6_lock) { table->tb6_id = id; table->tb6_root.leaf = net->ipv6.ip6_null_entry; table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); + } else { + kfree(table); + return NULL; } return table; } +static void fib6_free_table(struct fib6_table *table) +{ + free_percpu(table->percpu_tb6_lock); + kfree(table); +} + +#ifdef CONFIG_IPV6_MULTIPLE_TABLES struct fib6_table *fib6_new_table(struct net *net, u32 id) { struct fib6_table *tb; @@ -232,7 +251,7 @@ struct fib6_table *fib6_new_table(struct net *net, u32 id) if (tb) return tb; - tb = fib6_alloc_table(net, id); + tb = fib6_alloc_table(net, id, GFP_ATOMIC); if (tb) fib6_link_table(net, tb); @@ -366,9 +385,9 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, w->count = 0; w->skip = 0; - read_lock_bh(&table->tb6_lock); + fib6_table_read_lock_bh(table); res = fib6_walk(net, w); - read_unlock_bh(&table->tb6_lock); + fib6_table_read_unlock_bh(table); if (res > 0) { cb->args[4] = 1; cb->args[5] = 
w->root->fn_sernum; @@ -383,9 +402,9 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, } else w->skip = 0; - read_lock_bh(&table->tb6_lock); + fib6_table_read_lock_bh(table); res = fib6_walk_continue(w); - read_unlock_bh(&table->tb6_lock); + fib6_table_read_unlock_bh(table); if (res <= 0) { fib6_walker_unlink(net, w); cb->args[4] = 0; @@ -1710,10 +1729,10 @@ static void __fib6_clean_all(struct net *net, for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { - write_lock_bh(&table->tb6_lock); + fib6_table_write_lock_bh(table); fib6_clean_tree(net, &table->tb6_root, func, false, sernum, arg); - write_unlock_bh(&table->tb6_lock); + fib6_table_write_unlock_bh(table); } } rcu_read_unlock(); @@ -1856,27 +1875,16 @@ static int __net_init fib6_net_init(struct net *net) if (!net->ipv6.fib_table_hash) goto out_rt6_stats; - net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), - GFP_KERNEL); + net->ipv6.fib6_main_tbl = fib6_alloc_table(net, RT6_TABLE_MAIN, + GFP_KERNEL); if (!net->ipv6.fib6_main_tbl) goto out_fib_table_hash; - net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; - net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; - net->ipv6.fib6_main_tbl->tb6_root.fn_flags = - RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; - inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); - #ifdef CONFIG_IPV6_MULTIPLE_TABLES - net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), - GFP_KERNEL); + net->ipv6.fib6_local_tbl = fib6_alloc_table(net, RT6_TABLE_LOCAL, + GFP_KERNEL); if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; - net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; - net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; - net->ipv6.fib6_local_tbl->tb6_root.fn_flags = - RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; - inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); #endif fib6_tables_init(net); @@ -1884,7 +1892,7 @@ 
static int __net_init fib6_net_init(struct net *net) #ifdef CONFIG_IPV6_MULTIPLE_TABLES out_fib6_main_tbl: - kfree(net->ipv6.fib6_main_tbl); + fib6_free_table(net->ipv6.fib6_main_tbl); #endif out_fib_table_hash: kfree(net->ipv6.fib_table_hash); @@ -1901,10 +1909,10 @@ static void fib6_net_exit(struct net *net) #ifdef CONFIG_IPV6_MULTIPLE_TABLES inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers); - kfree(net->ipv6.fib6_local_tbl); + fib6_free_table(net->ipv6.fib6_local_tbl); #endif inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers); - kfree(net->ipv6.fib6_main_tbl); + fib6_free_table(net->ipv6.fib6_main_tbl); kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); } @@ -2067,9 +2075,9 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) iter_table: ipv6_route_check_sernum(iter); - read_lock(&iter->tbl->tb6_lock); + fib6_table_read_lock(iter->tbl); r = fib6_walk_continue(&iter->w); - read_unlock(&iter->tbl->tb6_lock); + fib6_table_read_unlock(iter->tbl); if (r > 0) { if (v) ++*pos; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4d30c96..a31e0de 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -877,7 +877,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_node *fn; struct rt6_info *rt; - read_lock_bh(&table->tb6_lock); + fib6_table_read_lock_bh(table); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: rt = fn->leaf; @@ -890,7 +890,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net, goto restart; } dst_use(&rt->dst, jiffies); - read_unlock_bh(&table->tb6_lock); + fib6_table_read_unlock_bh(table); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); @@ -944,9 +944,9 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, struct fib6_table *table; table = rt->rt6i_table; - write_lock_bh(&table->tb6_lock); + fib6_table_write_lock_bh(table); err = fib6_add(&table->tb6_root, rt, info, mxc, extack); - 
write_unlock_bh(&table->tb6_lock); + fib6_table_write_unlock_bh(table); return err; } @@ -1044,7 +1044,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) return net->ipv6.ip6_null_entry; } - read_lock_bh(&table->tb6_lock); + fib6_table_read_lock_bh(table); if (rt->rt6i_pcpu) { p = this_cpu_ptr(rt->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); @@ -1065,7 +1065,7 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) } dst_hold(&pcpu_rt->dst); rt6_dst_from_metrics_check(pcpu_rt); - read_unlock_bh(&table->tb6_lock); + fib6_table_read_unlock_bh(table); return pcpu_rt; } @@ -1081,7 +1081,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, if (net->ipv6.devconf_all->forwarding == 0) strict |= RT6_LOOKUP_F_REACHABLE; - read_lock_bh(&table->tb6_lock); + fib6_table_read_lock_bh(table); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; @@ -1108,7 +1108,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) { dst_use(&rt->dst, jiffies); - read_unlock_bh(&table->tb6_lock); + fib6_table_read_unlock_bh(table); rt6_dst_from_metrics_check(rt); @@ -1125,7 +1125,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, struct rt6_info *uncached_rt; dst_use(&rt->dst, jiffies); - read_unlock_bh(&table->tb6_lock); + fib6_table_read_unlock_bh(table); uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); dst_release(&rt->dst); @@ -1153,14 +1153,14 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, pcpu_rt = rt6_get_pcpu_route(rt); if (pcpu_rt) { - read_unlock_bh(&table->tb6_lock); + fib6_table_read_unlock_bh(table); } else { /* We have to do the read_unlock first * because rt6_make_pcpu_route() may trigger * ip6_dst_gc() which will take the write_lock. 
*/ dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); + fib6_table_read_unlock_bh(table); pcpu_rt = rt6_make_pcpu_route(rt); dst_release(&rt->dst); } @@ -1503,7 +1503,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, * routes. */ - read_lock_bh(&table->tb6_lock); + fib6_table_read_lock_bh(table); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { @@ -1536,7 +1536,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, out: dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); + fib6_table_read_unlock_bh(table); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; @@ -2135,9 +2135,9 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) } table = rt->rt6i_table; - write_lock_bh(&table->tb6_lock); + fib6_table_write_lock_bh(table); err = fib6_del(rt, info); - write_unlock_bh(&table->tb6_lock); + fib6_table_write_unlock_bh(table); out: ip6_rt_put(rt); @@ -2163,7 +2163,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) if (rt == net->ipv6.ip6_null_entry) goto out_put; table = rt->rt6i_table; - write_lock_bh(&table->tb6_lock); + fib6_table_write_lock_bh(table); if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) { struct rt6_info *sibling, *next_sibling; @@ -2193,7 +2193,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) err = fib6_del(rt, info); out_unlock: - write_unlock_bh(&table->tb6_lock); + fib6_table_write_unlock_bh(table); out_put: ip6_rt_put(rt); @@ -2218,7 +2218,7 @@ static int ip6_route_del(struct fib6_config *cfg, return err; } - read_lock_bh(&table->tb6_lock); + fib6_table_read_lock_bh(table); fn = fib6_locate(&table->tb6_root, &cfg->fc_dst, cfg->fc_dst_len, @@ -2241,7 +2241,7 @@ static int ip6_route_del(struct fib6_config *cfg, if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol) continue; dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); + 
fib6_table_read_unlock_bh(table); /* if gateway was specified only delete the one hop */ if (cfg->fc_flags & RTF_GATEWAY) @@ -2250,7 +2250,7 @@ static int ip6_route_del(struct fib6_config *cfg, return __ip6_del_rt_siblings(rt, cfg); } } - read_unlock_bh(&table->tb6_lock); + fib6_table_read_unlock_bh(table); return err; } @@ -2429,7 +2429,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, if (!table) return NULL; - read_lock_bh(&table->tb6_lock); + fib6_table_read_lock_bh(table); fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0); if (!fn) goto out; @@ -2445,7 +2445,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net, break; } out: - read_unlock_bh(&table->tb6_lock); + fib6_table_read_unlock_bh(table); return rt; } @@ -2490,7 +2490,7 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev if (!table) return NULL; - read_lock_bh(&table->tb6_lock); + fib6_table_read_lock_bh(table); for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { if (dev == rt->dst.dev && ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && @@ -2499,7 +2499,7 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev } if (rt) dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); + fib6_table_read_unlock_bh(table); return rt; } @@ -2536,17 +2536,17 @@ static void __rt6_purge_dflt_routers(struct fib6_table *table) struct rt6_info *rt; restart: - read_lock_bh(&table->tb6_lock); + fib6_table_read_lock_bh(table); for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); + fib6_table_read_unlock_bh(table); ip6_del_rt(rt); goto restart; } } - read_unlock_bh(&table->tb6_lock); + fib6_table_read_unlock_bh(table); table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; } -- 2.9.3