Re: [RFC PATCH] openvswitch: use percpu flow stats

2016-08-19 Thread David Miller
From: Eric Dumazet 
Date: Fri, 19 Aug 2016 21:07:52 -0700

> Here is an example of a system I had in the past.
> 
> 8 cpus  (num_possible_cpus() returns 8)
> 
> Cpus were numbered : 0, 1, 2, 3, 8, 9, 10, 11  : (nr_cpu_ids = 12)
> 
> I am pretty sure they are still alive.
> 
> Since you want an array indexed by cpu numbers, you need to size it by
> nr_cpu_ids, otherwise array[11] will crash / access garbage.
> 
> This is why you find many uses of nr_cpu_ids in the Linux tree, rather
> than num_possible_cpus(), to size arrays.

My bad, I misunderstood what nr_cpu_ids means.  You are absolutely right.


Re: [RFC PATCH] openvswitch: use percpu flow stats

2016-08-19 Thread Eric Dumazet
On Fri, 2016-08-19 at 18:09 -0700, David Miller wrote:
> From: Eric Dumazet 
> Date: Fri, 19 Aug 2016 12:56:56 -0700
> 
> > On Fri, 2016-08-19 at 16:47 -0300, Thadeu Lima de Souza Cascardo wrote:
> >> Instead of using flow stats per NUMA node, use it per CPU. When using
> >> megaflows, the stats lock can be a bottleneck in scalability.
> > 
> > ...
> > 
> >>  
> >>flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
> >> - + (nr_node_ids
> >> + + (num_possible_cpus()
> >>  * sizeof(struct flow_stats *)),
> >>   0, 0, NULL);
> >>if (flow_cache == NULL)
> > 
> > This is bogus.
> > 
> > Please use nr_cpu_ids instead of num_possible_cpus().
> 
> Then how would hot-plugged cpus be handled properly?
> 
> The only other way is you'd have to free all objects in the current
> flow_cache, destroy it, then make a new kmem_cache and reallocate
> all of the existing objects and hook them back up every time a cpu
> up event occurs.
> 
> That doesn't make any sense at all.
> 
> Therefore, the size of the objects coming out of "sw_flow" has
> to accommodate all possible cpu indexes.

Here is an example of a system I had in the past.

8 cpus  (num_possible_cpus() returns 8)

Cpus were numbered : 0, 1, 2, 3, 8, 9, 10, 11  : (nr_cpu_ids = 12)

I am pretty sure they are still alive.

Since you want an array indexed by cpu numbers, you need to size it by
nr_cpu_ids, otherwise array[11] will crash / access garbage.

This is why you find many uses of nr_cpu_ids in the Linux tree, rather
than num_possible_cpus(), to size arrays.

# git grep -n nr_cpu_ids -- net
net/bridge/netfilter/ebtables.c:900:vmalloc(nr_cpu_ids * 
sizeof(*(newinfo->chainstack)));
net/bridge/netfilter/ebtables.c:1132:   countersize = 
COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids;
net/bridge/netfilter/ebtables.c:1185:   countersize = 
COUNTER_OFFSET(repl->nentries) * nr_cpu_ids;
net/bridge/netfilter/ebtables.c:2200:   countersize = 
COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids;
net/core/dev.c:3483:if (next_cpu < nr_cpu_ids) {
net/core/dev.c:3588: *   - Current CPU is unset (>= nr_cpu_ids).
net/core/dev.c:3596:(tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
net/core/dev.c:3603:if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
net/core/dev.c:3651:if (rflow->filter == filter_id && cpu < 
nr_cpu_ids &&
net/core/neighbour.c:2737:  for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
net/core/neighbour.c:2751:  for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
net/core/net-procfs.c:122:  while (*pos < nr_cpu_ids)
net/core/sysctl_net_core.c:70:  rps_cpu_mask = 
roundup_pow_of_two(nr_cpu_ids) - 1;
net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c:360:  for (cpu = 
*pos-1; cpu < nr_cpu_ids; ++cpu) {
net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c:375:  for (cpu = 
*pos; cpu < nr_cpu_ids; ++cpu) {
net/ipv4/route.c:254:   for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
net/ipv4/route.c:267:   for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
net/ipv6/icmp.c:918:kzalloc(nr_cpu_ids * sizeof(struct sock *), 
GFP_KERNEL);
net/netfilter/nf_conntrack_netlink.c:1265:  for (cpu = cb->args[0]; cpu < 
nr_cpu_ids; cpu++) {
net/netfilter/nf_conntrack_netlink.c:2003:  if (cb->args[0] == nr_cpu_ids)
net/netfilter/nf_conntrack_netlink.c:2006:  for (cpu = cb->args[0]; cpu < 
nr_cpu_ids; cpu++) {
net/netfilter/nf_conntrack_netlink.c:3211:  if (cb->args[0] == nr_cpu_ids)
net/netfilter/nf_conntrack_netlink.c:3214:  for (cpu = cb->args[0]; cpu < 
nr_cpu_ids; cpu++) {
net/netfilter/nf_conntrack_standalone.c:309:for (cpu = *pos-1; cpu < 
nr_cpu_ids; ++cpu) {
net/netfilter/nf_conntrack_standalone.c:324:for (cpu = *pos; cpu < 
nr_cpu_ids; ++cpu) {
net/netfilter/nf_synproxy_core.c:255:   for (cpu = *pos - 1; cpu < nr_cpu_ids; 
cpu++) {
net/netfilter/nf_synproxy_core.c:270:   for (cpu = *pos; cpu < nr_cpu_ids; 
cpu++) {
net/netfilter/x_tables.c:1064:  size = sizeof(void **) * nr_cpu_ids;
net/sunrpc/svc.c:167:   unsigned int maxpools = nr_cpu_ids;





Re: [RFC PATCH] openvswitch: use percpu flow stats

2016-08-19 Thread David Miller
From: Eric Dumazet 
Date: Fri, 19 Aug 2016 12:56:56 -0700

> On Fri, 2016-08-19 at 16:47 -0300, Thadeu Lima de Souza Cascardo wrote:
>> Instead of using flow stats per NUMA node, use it per CPU. When using
>> megaflows, the stats lock can be a bottleneck in scalability.
> 
> ...
> 
>>  
>>  flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
>> -   + (nr_node_ids
>> +   + (num_possible_cpus()
>>* sizeof(struct flow_stats *)),
>> 0, 0, NULL);
>>  if (flow_cache == NULL)
> 
> This is bogus.
> 
> Please use nr_cpu_ids instead of num_possible_cpus().

Then how would hot-plugged cpus be handled properly?

The only other way is you'd have to free all objects in the current
flow_cache, destroy it, then make a new kmem_cache and reallocate
all of the existing objects and hook them back up every time a cpu
up event occurs.

That doesn't make any sense at all.

Therefore, the size of the objects coming out of "sw_flow" has
to accommodate all possible cpu indexes.


Re: [RFC PATCH] openvswitch: use percpu flow stats

2016-08-19 Thread Eric Dumazet
On Fri, 2016-08-19 at 16:47 -0300, Thadeu Lima de Souza Cascardo wrote:
> Instead of using flow stats per NUMA node, use it per CPU. When using
> megaflows, the stats lock can be a bottleneck in scalability.

...

>  
>   flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
> -+ (nr_node_ids
> ++ (num_possible_cpus()
> * sizeof(struct flow_stats *)),
>  0, 0, NULL);
>   if (flow_cache == NULL)

This is bogus.

Please use nr_cpu_ids instead of num_possible_cpus().





[RFC PATCH] openvswitch: use percpu flow stats

2016-08-19 Thread Thadeu Lima de Souza Cascardo
Instead of using flow stats per NUMA node, use it per CPU. When using
megaflows, the stats lock can be a bottleneck in scalability.

On a E5-2690 12-core system, usual throughput went from ~4Mpps to ~15Mpps
when forwarding between two 40GbE ports with a single flow configured on
the datapath.
---
 net/openvswitch/flow.c   | 39 +--
 net/openvswitch/flow.h   |  4 ++--
 net/openvswitch/flow_table.c | 21 +++--
 3 files changed, 34 insertions(+), 30 deletions(-)

diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 0ea128e..90ae248 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -72,32 +73,33 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 
tcp_flags,
 {
struct flow_stats *stats;
int node = numa_node_id();
+   int cpu = get_cpu();
int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
 
-   stats = rcu_dereference(flow->stats[node]);
+   stats = rcu_dereference(flow->stats[cpu]);
 
-   /* Check if already have node-specific stats. */
+   /* Check if already have CPU-specific stats. */
if (likely(stats)) {
spin_lock(&stats->lock);
/* Mark if we write on the pre-allocated stats. */
-   if (node == 0 && unlikely(flow->stats_last_writer != node))
-   flow->stats_last_writer = node;
+   if (cpu == 0 && unlikely(flow->stats_last_writer != cpu))
+   flow->stats_last_writer = cpu;
} else {
stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */
spin_lock(&stats->lock);
 
-   /* If the current NUMA-node is the only writer on the
+   /* If the current CPU is the only writer on the
 * pre-allocated stats keep using them.
 */
-   if (unlikely(flow->stats_last_writer != node)) {
+   if (unlikely(flow->stats_last_writer != cpu)) {
/* A previous locker may have already allocated the
-* stats, so we need to check again.  If node-specific
+* stats, so we need to check again.  If CPU-specific
 * stats were already allocated, we update the pre-
 * allocated stats as we have already locked them.
 */
-   if (likely(flow->stats_last_writer != NUMA_NO_NODE)
-   && likely(!rcu_access_pointer(flow->stats[node]))) {
-   /* Try to allocate node-specific stats. */
+   if (likely(flow->stats_last_writer != -1)
+   && likely(!rcu_access_pointer(flow->stats[cpu]))) {
+   /* Try to allocate CPU-specific stats. */
struct flow_stats *new_stats;
 
new_stats =
@@ -114,12 +116,12 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 
tcp_flags,
new_stats->tcp_flags = tcp_flags;
spin_lock_init(&new_stats->lock);
 
-   rcu_assign_pointer(flow->stats[node],
+   rcu_assign_pointer(flow->stats[cpu],
   new_stats);
goto unlock;
}
}
-   flow->stats_last_writer = node;
+   flow->stats_last_writer = cpu;
}
}
 
@@ -129,6 +131,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 
tcp_flags,
stats->tcp_flags |= tcp_flags;
 unlock:
spin_unlock(&stats->lock);
+   put_cpu();
 }
 
 /* Must be called with rcu_read_lock or ovs_mutex. */
@@ -136,14 +139,14 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
struct ovs_flow_stats *ovs_stats,
unsigned long *used, __be16 *tcp_flags)
 {
-   int node;
+   int cpu;
 
*used = 0;
*tcp_flags = 0;
memset(ovs_stats, 0, sizeof(*ovs_stats));
 
-   for_each_node(node) {
-   struct flow_stats *stats = 
rcu_dereference_ovsl(flow->stats[node]);
+   for_each_possible_cpu(cpu) {
+   struct flow_stats *stats = 
rcu_dereference_ovsl(flow->stats[cpu]);
 
if (stats) {
/* Local CPU may write on non-local stats, so we must
@@ -163,10 +166,10 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
 /* Called with ovs_mutex. */
 void ovs_flow_stats_clear(struct sw_flow *flow)
 {
-   int node;
+   int cpu;
 
-   for_each_node(node) {
-   struct flow_stats