On 05/01/2017 23:27, Joe Stringer wrote:
On 25 December 2016 at 03:39, Paul Blakey <pa...@mellanox.com> wrote:
While dumping flows, dump flows that were offloaded to
netdev and parse them back to dpif flow.

Signed-off-by: Paul Blakey <pa...@mellanox.com>
Reviewed-by: Roi Dayan <r...@mellanox.com>
---
  lib/dpif-netlink.c | 179 +++++++++++++++++++++++++++++++++++++++++++++++++++++
  1 file changed, 179 insertions(+)

diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
index 36f2888..3d8940e 100644
--- a/lib/dpif-netlink.c
+++ b/lib/dpif-netlink.c
@@ -38,6 +38,7 @@
  #include "flow.h"
  #include "fat-rwlock.h"
  #include "netdev.h"
+#include "netdev-provider.h"
  #include "netdev-linux.h"
  #include "netdev-vport.h"
  #include "netlink-conntrack.h"
@@ -55,6 +56,7 @@
  #include "unaligned.h"
  #include "util.h"
  #include "openvswitch/vlog.h"
+#include "openvswitch/match.h"

  VLOG_DEFINE_THIS_MODULE(dpif_netlink);
  #ifdef _WIN32
@@ -68,6 +70,8 @@ enum { MAX_PORTS = USHRT_MAX };
   * missing if we have old headers. */
  #define ETH_FLAG_LRO      (1 << 15)    /* LRO is enabled */

+#define FLOW_DUMP_MAX_BATCH 50
+
  struct dpif_netlink_dp {
      /* Generic Netlink header. */
      uint8_t cmd;
@@ -1355,6 +1359,10 @@ struct dpif_netlink_flow_dump {
      struct dpif_flow_dump up;
      struct nl_dump nl_dump;
      atomic_int status;
+    struct netdev_flow_dump **netdev_dumps;
+    int netdev_num;
+    int netdev_given;
+    struct ovs_mutex netdev_lock;
Could you add a brief comment above these variables that describes
their use? (It's also common in OVS code to mention that, e.g.,
netdev_lock protects the following elements.)

If there's a more descriptive name than "netdev_num", like
netdev_max_dumps or something, then please use that instead. At a
glance, "given" and "num" don't provide much context about how they
relate to each other or to the dump.
sure, thanks.

  };
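For illustration, one possible shape for that comment block (a sketch
only; "netdev_max_dumps" is the reviewer's suggested name, and whether
'netdev_given' is the only field guarded by 'netdev_lock' is an
assumption):

    struct netdev_flow_dump **netdev_dumps; /* One flow dump per netdev. */
    int netdev_max_dumps;           /* Number of dumps in 'netdev_dumps'. */
    struct ovs_mutex netdev_lock;   /* Guards the following. */
    int netdev_given;               /* Next dump to hand out to a thread. */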

  static struct dpif_netlink_flow_dump *
@@ -1363,6 +1371,34 @@ dpif_netlink_flow_dump_cast(struct dpif_flow_dump *dump)
      return CONTAINER_OF(dump, struct dpif_netlink_flow_dump, up);
  }

+static void start_netdev_dump(const struct dpif *dpif_,
+                              struct dpif_netlink_flow_dump *dump) {
+
+    if (!netdev_flow_api_enabled) {
+        dump->netdev_num = 0;
+        return;
+    }
Typically, for style, we still place all variable declarations at the
top of a function, in "christmas tree" order (long lines to short
lines), before functional code like this.
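i.e., something along these lines (just a sketch, keeping the names
from this patch):

static void
start_netdev_dump(const struct dpif *dpif_,
                  struct dpif_netlink_flow_dump *dump)
{
    struct netdev_list_element *element;
    struct ovs_list port_list;
    int ports;
    int i;

    if (!netdev_flow_api_enabled) {
        dump->netdev_num = 0;
        return;
    }

    ports = netdev_hmap_port_get_list(dpif_->dpif_class, &port_list);
    ...
}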

+
+    struct netdev_list_element *element;
+    struct ovs_list port_list;
+    int ports = netdev_hmap_port_get_list(dpif_->dpif_class, &port_list);
+    int i = 0;
+
+    dump->netdev_dumps =
+        ports ? xzalloc(sizeof(struct netdev_flow_dump *) * ports) : 0;
Can this be sizeof(dump->netdev_dumps)?
Do you mean sizeof(*dump->netdev_dumps) or sizeof(dump->netdev_dumps[0])? If so, yes.

+    dump->netdev_num = ports;
+    dump->netdev_given = 0;
+
+    LIST_FOR_EACH(element, node, &port_list) {
+        dump->netdev_dumps[i] = netdev_flow_dump_create(element->netdev);
+        dump->netdev_dumps[i]->port = element->port_no;
+        i++;
+    }
As a matter of style, it's easier to see that this loop is bounded by
'ports' (and that number is correct) if it's structured as

for (i = 0; i < ports; i++) {
element = get_next_node;
...
}

Also, it seems that even if the netdev doesn't support flow_dump, we
allocate a netdev_flow_dump and add it to netdev_dumps here; perhaps
we could/should skip such netdevs instead?
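Something like the following, as a sketch; netdev_flow_dump_supported()
is a made-up name for whatever check the netdev-provider API in this
series can offer:

    i = 0;
    LIST_FOR_EACH (element, node, &port_list) {
        struct netdev *netdev = element->netdev;

        if (!netdev_flow_dump_supported(netdev)) { /* hypothetical helper */
            continue;
        }
        dump->netdev_dumps[i] = netdev_flow_dump_create(netdev);
        dump->netdev_dumps[i]->port = element->port_no;
        i++;
    }
    dump->netdev_num = i;  /* Count only the dumps actually created. */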

+    netdev_port_list_del(&port_list);
+
+    ovs_mutex_init(&dump->netdev_lock);
I don't see a corresponding ovs_mutex_destroy() call for this.
Good catch, thanks.
+}
+
  static struct dpif_flow_dump *
  dpif_netlink_flow_dump_create(const struct dpif *dpif_, bool terse)
  {
@@ -1387,6 +1423,8 @@ dpif_netlink_flow_dump_create(const struct dpif *dpif_, bool terse)
      atomic_init(&dump->status, 0);
      dump->up.terse = terse;

+    start_netdev_dump(dpif_, dump);
+
      return &dump->up;
  }

@@ -1397,6 +1435,16 @@ dpif_netlink_flow_dump_destroy(struct dpif_flow_dump *dump_)
      unsigned int nl_status = nl_dump_done(&dump->nl_dump);
      int dump_status;

+    if (netdev_flow_api_enabled) {
+        for (int i = 0; i < dump->netdev_num; i++) {
+            int err = netdev_flow_dump_destroy(dump->netdev_dumps[i]);
+            if (err != 0 && err != EOPNOTSUPP) {
+                VLOG_ERR("failed dumping netdev: %s", ovs_strerror(err));
+            }
+        }
+        free(dump->netdev_dumps);
+    }
You don't really need to check for netdev_flow_api_enabled here;
netdev_num will be 0 if it is disabled, so that for loop turns into a
no-op; then you could initialize dump->netdev_dumps to NULL in that
case and unconditionally free it. It's a bit simpler to read the code
if you don't have to think about whether or not hardware offloads are
enabled.
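i.e., roughly (a sketch; this also folds in the ovs_mutex_destroy()
mentioned earlier, which assumes 'netdev_lock' gets initialized
unconditionally in start_netdev_dump()):

    for (int i = 0; i < dump->netdev_num; i++) {
        int err = netdev_flow_dump_destroy(dump->netdev_dumps[i]);

        if (err != 0 && err != EOPNOTSUPP) {
            VLOG_ERR("failed dumping netdev: %s", ovs_strerror(err));
        }
    }
    free(dump->netdev_dumps);   /* NULL when offloads are disabled. */
    ovs_mutex_destroy(&dump->netdev_lock);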

+
      /* No other thread has access to 'dump' at this point. */
      atomic_read_relaxed(&dump->status, &dump_status);
      free(dump);
@@ -1410,6 +1458,11 @@ struct dpif_netlink_flow_dump_thread {
      struct dpif_flow_stats stats;
      struct ofpbuf nl_flows;     /* Always used to store flows. */
      struct ofpbuf *nl_actions;  /* Used if kernel does not supply actions. */
+    struct odputil_keybuf keybuf[FLOW_DUMP_MAX_BATCH];
+    struct odputil_keybuf maskbuf[FLOW_DUMP_MAX_BATCH];
+    struct odputil_keybuf actbuf[FLOW_DUMP_MAX_BATCH];
+    int netdev_cur_dump;
+    bool netdev_done;
I wonder if it's worthwhile to reuse 'nl_flows' to store all of these
netlink-formatted key/mask/acts instead of having these keybufs? It
seems that it is currently unused for the first half of the
dpif_netlink_flow_dump while the flows are being dumped from the
netdev.

Regardless of the above question, I also question whether
FLOW_DUMP_MAX_BATCH is too big for dumping from the kernel. How many
tc flows will we really get from the kernel at once?

  };

  static struct dpif_netlink_flow_dump_thread *
@@ -1429,6 +1482,8 @@ dpif_netlink_flow_dump_thread_create(struct dpif_flow_dump *dump_)
      thread->dump = dump;
      ofpbuf_init(&thread->nl_flows, NL_DUMP_BUFSIZE);
      thread->nl_actions = NULL;
+    thread->netdev_cur_dump = 0;
+    thread->netdev_done = !(thread->netdev_cur_dump < dump->netdev_num);

      return &thread->up;
  }
@@ -1466,6 +1521,90 @@ dpif_netlink_flow_to_dpif_flow(struct dpif *dpif, struct dpif_flow *dpif_flow,
      dpif_netlink_flow_get_stats(datapath_flow, &dpif_flow->stats);
  }

+static void
+dpif_netlink_advance_netdev_dump(struct dpif_netlink_flow_dump_thread *thread)
+{
+    struct dpif_netlink_flow_dump *dump = thread->dump;
+
+    ovs_mutex_lock(&dump->netdev_lock);
+    /* if we haven't finished (dumped everything) */
+    if (dump->netdev_given < dump->netdev_num) {
+        /* if we are the first to find that given dump is finished
+         * (for race condition, e.g 3 finish dump 0 at the same time) */
Why is there a race condition here if this is executed under netdev_lock?
The design is such that all threads work together on the dumps in
order, from the first to the last (at first they all work on dump 0).
When one thread finds that the current dump is finished, they all move
on to the next one. As the comment tries to explain, if 3 (or 2+)
threads are working on the first dump, dump 0 (thread->netdev_cur_dump
== 0), and finish it at the same time, they all call the advance
function. The first one to take the lock advances the shared 'given'
dump, which signifies the highest dump we have given out (all lower
dumps having finished). The rest then enter, and we check whether the
dump they found to be finished is higher than the one just given; if
not, they catch up, so now all of them work on dump 1.

The race is that if 2 or more threads worked on the same dump and
finished at the same time, and we just increased netdev_given without
checking (thread->cur == given) for each of them, we would have
increased 'given' twice and skipped one dump.


+        if (thread->netdev_cur_dump == dump->netdev_given) {
+            thread->netdev_cur_dump = ++dump->netdev_given;
+            /* did we just finish the last dump? done. */
+            if (dump->netdev_given == dump->netdev_num) {
+                thread->netdev_done = true;
+            }
+        } else {
+            /* otherwise, we are behind, catch up */
+            thread->netdev_cur_dump = dump->netdev_given;
+        }
+    } else {
+        /* some other thread finished */
+        thread->netdev_done = true;
+    }
+    ovs_mutex_unlock(&dump->netdev_lock);
+}
+
+static struct odp_support netdev_flow_support = {
+    .max_mpls_depth = SIZE_MAX,
+    .recirc = false,
+    .ct_state = false,
+    .ct_zone = false,
+    .ct_mark = false,
+    .ct_label = false,
+};
+
+static int
+dpif_netlink_netdev_match_to_dpif_flow(struct match *match,
+                                       struct ofpbuf *key_buf,
+                                       struct ofpbuf *mask_buf,
+                                       struct nlattr *actions,
+                                       struct dpif_flow_stats *stats,
+                                       ovs_u128 *ufid,
+                                       struct dpif_flow *flow,
+                                       bool terse OVS_UNUSED)
+{
+
+    struct odp_flow_key_parms odp_parms = {
+        .flow = &match->flow,
+        .mask = &match->wc.masks,
+        .support = netdev_flow_support,
There's also 'key_buf' field in parms that may be needed.

+    };
+    size_t offset;
+
+    memset(flow, 0, sizeof *flow);
+
+    /* Key */
+    offset = key_buf->size;
+    flow->key = ofpbuf_tail(key_buf);
+    odp_flow_key_from_flow(&odp_parms, key_buf);
+    flow->key_len = key_buf->size - offset;
+
+    /* Mask */
+    offset = mask_buf->size;
+    flow->mask = ofpbuf_tail(mask_buf);
+    odp_parms.key_buf = key_buf;
+    odp_flow_key_from_mask(&odp_parms, mask_buf);
+    flow->mask_len = mask_buf->size - offset;
+
+    /* Actions */
+    flow->actions = nl_attr_get(actions);
+    flow->actions_len = nl_attr_get_size(actions);
+
+    /* Stats */
+    memcpy(&flow->stats, stats, sizeof *stats);
+
+    /* UFID */
+    flow->ufid_present = true;
+    flow->ufid = *ufid;
+
+    flow->pmd_id = PMD_ID_NULL;
+    return 0;
+}
+
  static int
  dpif_netlink_flow_dump_next(struct dpif_flow_dump_thread *thread_,
                              struct dpif_flow *flows, int max_flows)
@@ -1475,11 +1614,51 @@ dpif_netlink_flow_dump_next(struct dpif_flow_dump_thread *thread_,
      struct dpif_netlink_flow_dump *dump = thread->dump;
      struct dpif_netlink *dpif = dpif_netlink_cast(thread->up.dpif);
      int n_flows;
+    int i = 0;

      ofpbuf_delete(thread->nl_actions);
      thread->nl_actions = NULL;

      n_flows = 0;
+
+    while (!thread->netdev_done && n_flows < max_flows
+           && i < FLOW_DUMP_MAX_BATCH) {
+        struct odputil_keybuf *maskbuf = &thread->maskbuf[i];
+        struct odputil_keybuf *keybuf = &thread->keybuf[i];
+        struct odputil_keybuf *actbuf = &thread->actbuf[i];
+        struct ofpbuf key, mask, act;
+        struct dpif_flow *f = &flows[n_flows];
+        int cur = thread->netdev_cur_dump;
+        struct netdev_flow_dump *netdev_dump = dump->netdev_dumps[cur];
+        struct match match;
+        struct nlattr *actions;
+        struct dpif_flow_stats stats;
+        ovs_u128 ufid;
+        bool has_next;
+
+        ofpbuf_use_stack(&key, keybuf, sizeof *keybuf);
+        ofpbuf_use_stack(&act, actbuf, sizeof *actbuf);
+        ofpbuf_use_stack(&mask, maskbuf, sizeof *maskbuf);
+        has_next = netdev_flow_dump_next(netdev_dump, &match,
+                                        &actions, &stats,
+                                        &ufid,
+                                        &thread->nl_flows,
+                                        &act);
+        if (has_next) {
+            dpif_netlink_netdev_match_to_dpif_flow(&match,
+                                                   &key, &mask,
+                                                   actions,
+                                                   &stats,
+                                                   &ufid,
+                                                   f,
+                                                   dump->up.terse);
+            n_flows++;
+            i++;
Seems like 'i' and 'n_flows' are trying to achieve the same objective.
Can we just drop 'i'?

+        } else {
+            dpif_netlink_advance_netdev_dump(thread);
+        }
+    }
+
      while (!n_flows
             || (n_flows < max_flows && thread->nl_flows.size)) {
          struct dpif_netlink_flow datapath_flow;
--
1.8.3.1

