The Open vSwitch kernel module uses the upcall mechanism to send packets from kernel space to user space when a packet misses in the kernel space flow table. The upcall sends packets via a Netlink socket. Currently, a Netlink socket is created for every vport, so there is a 1:1 mapping between a vport and a Netlink socket. When a packet is received by a vport, if it needs to be sent to user space, it is sent via the corresponding Netlink socket.
This mechanism, with various iterations of the corresponding user space code, has seen some limitations and issues: * On systems with a large number of vports, there is correspondingly a large number of Netlink sockets which can limit scaling. (https://bugzilla.redhat.com/show_bug.cgi?id=1526306) * Packet reordering on upcalls. (https://bugzilla.redhat.com/show_bug.cgi?id=1844576) * A thundering herd issue. (https://bugzilla.redhat.com/show_bug.cgi?id=1834444) This patch introduces an alternative, feature-negotiated, upcall mode using a per-cpu dispatch rather than a per-vport dispatch. In this mode, the Netlink socket to be used for the upcall is selected based on the CPU of the thread that is executing the upcall. In this way, it resolves the issues above as: a) The number of Netlink sockets scales with the number of CPUs rather than the number of vports. b) Ordering per-flow is maintained as packets are distributed to CPUs based on mechanisms such as RSS and flows are distributed to a single user space thread. c) Packets from a flow can only wake up one user space thread. Reported-at: https://bugzilla.redhat.com/1844576 Signed-off-by: Mark Gray <mark.d.g...@redhat.com> --- .../linux/compat/include/linux/openvswitch.h | 7 + lib/dpif-netdev.c | 1 + lib/dpif-netlink.c | 405 +++++++++++++++--- lib/dpif-provider.h | 10 + lib/dpif.c | 17 + lib/dpif.h | 1 + ofproto/ofproto-dpif-upcall.c | 51 ++- ofproto/ofproto.c | 12 - 8 files changed, 430 insertions(+), 74 deletions(-) diff --git a/datapath/linux/compat/include/linux/openvswitch.h b/datapath/linux/compat/include/linux/openvswitch.h index 875de20250ce..f29265df055e 100644 --- a/datapath/linux/compat/include/linux/openvswitch.h +++ b/datapath/linux/compat/include/linux/openvswitch.h @@ -89,6 +89,8 @@ enum ovs_datapath_cmd { * set on the datapath port (for OVS_ACTION_ATTR_MISS). Only valid on * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should * not be sent. 
+ * OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when + * OVS_DP_F_DISPATCH_UPCALL_PER_CPU feature is set. * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the * datapath. Always present in notifications. * @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for the @@ -105,6 +107,8 @@ enum ovs_datapath_attr { OVS_DP_ATTR_MEGAFLOW_STATS, /* struct ovs_dp_megaflow_stats */ OVS_DP_ATTR_USER_FEATURES, /* OVS_DP_F_* */ OVS_DP_ATTR_PAD, + OVS_DP_ATTR_PAD2, + OVS_DP_ATTR_PER_CPU_PIDS, /* Netlink PIDS to receive upcalls */ __OVS_DP_ATTR_MAX }; @@ -146,6 +150,9 @@ struct ovs_vport_stats { /* Allow tc offload recirc sharing */ #define OVS_DP_F_TC_RECIRC_SHARING (1 << 2) +/* Allow per-cpu dispatch of upcalls */ +#define OVS_DP_F_DISPATCH_UPCALL_PER_CPU (1 << 3) + /* Fixed logical ports. */ #define OVSP_LOCAL ((__u32)0) diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 251788b04965..24e6911dd4ff 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -8488,6 +8488,7 @@ const struct dpif_class dpif_netdev_class = { dpif_netdev_operate, NULL, /* recv_set */ NULL, /* handlers_set */ + NULL, /* handlers_get */ dpif_netdev_set_config, dpif_netdev_queue_to_priority, NULL, /* recv */ diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c index 2ded5fdd01b3..349897e70632 100644 --- a/lib/dpif-netlink.c +++ b/lib/dpif-netlink.c @@ -80,6 +80,9 @@ enum { MAX_PORTS = USHRT_MAX }; #define FLOW_DUMP_MAX_BATCH 50 #define OPERATE_MAX_OPS 50 +#define DISPATCH_MODE_PER_CPU(dpif) ((dpif)->user_features & \ + OVS_DP_F_DISPATCH_UPCALL_PER_CPU) + #ifndef EPOLLEXCLUSIVE #define EPOLLEXCLUSIVE (1u << 28) #endif @@ -98,6 +101,8 @@ struct dpif_netlink_dp { const struct ovs_dp_stats *stats; /* OVS_DP_ATTR_STATS. 
*/ const struct ovs_dp_megaflow_stats *megaflow_stats; /* OVS_DP_ATTR_MEGAFLOW_STATS.*/ + const uint32_t *upcall_pids; /* OVS_DP_ATTR_PER_CPU_PIDS */ + uint32_t n_upcall_pids; }; static void dpif_netlink_dp_init(struct dpif_netlink_dp *); @@ -178,11 +183,16 @@ struct dpif_windows_vport_sock { #endif struct dpif_handler { + /* per-vport dispatch mode */ struct epoll_event *epoll_events; int epoll_fd; /* epoll fd that includes channel socks. */ int n_events; /* Num events returned by epoll_wait(). */ int event_offset; /* Offset into 'epoll_events'. */ + /* per-cpu dispatch mode */ + struct nl_sock *sock; /* Each handler thread holds one netlink + socket. */ + #ifdef _WIN32 /* Pool of sockets. */ struct dpif_windows_vport_sock *vport_sock_pool; @@ -201,6 +211,8 @@ struct dpif_netlink { struct fat_rwlock upcall_lock; struct dpif_handler *handlers; uint32_t n_handlers; /* Num of upcall handlers. */ + + /* Per-vport dispatch mode */ struct dpif_channel *channels; /* Array of channels for each port. */ int uc_array_size; /* Size of 'handler->channels' and */ /* 'handler->epoll_events'. 
*/ @@ -241,8 +253,12 @@ static int open_dpif(const struct dpif_netlink_dp *, struct dpif **); static uint32_t dpif_netlink_port_get_pid(const struct dpif *, odp_port_t port_no); static void dpif_netlink_handler_uninit(struct dpif_handler *handler); -static int dpif_netlink_refresh_channels(struct dpif_netlink *, - uint32_t n_handlers); +static int dpif_netlink_refresh_handlers_vport_dispatch(struct dpif_netlink *, + uint32_t n_handlers); +static void destroy_all_channels(struct dpif_netlink *); +static int dpif_netlink_refresh_handlers_cpu_dispatch(struct dpif_netlink *); +static void destroy_all_handlers(struct dpif_netlink *); + static void dpif_netlink_vport_to_ofpbuf(const struct dpif_netlink_vport *, struct ofpbuf *); static int dpif_netlink_vport_from_ofpbuf(struct dpif_netlink_vport *, @@ -357,11 +373,34 @@ dpif_netlink_open(const struct dpif_class *class OVS_UNUSED, const char *name, dp_request.cmd = OVS_DP_CMD_SET; } + /* The Open vSwitch kernel module has two modes for dispatching upcalls: + * per-vport and per-cpu. + * + * When dispatching upcalls per-vport, the kernel will + * send the upcall via a Netlink socket that has been selected based on the + * vport that received the packet that is causing the upcall. + * + * When dispatching upcall per-cpu, the kernel will send the upcall via + * a Netlink socket that has been selected based on the cpu that received + * the packet that is causing the upcall. + * + * First we test to see if the kernel module supports per-cpu dispatching + * (the preferred method). If it does not support per-cpu dispatching, we + * fall back to the per-vport dispatch mode. 
+ */ dp_request.user_features |= OVS_DP_F_UNALIGNED; - dp_request.user_features |= OVS_DP_F_VPORT_PIDS; + dp_request.user_features |= OVS_DP_F_DISPATCH_UPCALL_PER_CPU; error = dpif_netlink_dp_transact(&dp_request, &dp, &buf); if (error) { - return error; + dp_request.user_features &= ~OVS_DP_F_DISPATCH_UPCALL_PER_CPU; + dp_request.user_features |= OVS_DP_F_VPORT_PIDS; + error = dpif_netlink_dp_transact(&dp_request, &dp, &buf); + if (error) { + return error; + } + VLOG_INFO("Dispatch mode(per-vport)"); + } else { + VLOG_INFO("Dispatch mode:(per-cpu)"); } error = open_dpif(&dp, dpifp); @@ -609,6 +648,24 @@ destroy_all_channels(struct dpif_netlink *dpif) dpif->uc_array_size = 0; } +static void +destroy_all_handlers(struct dpif_netlink *dpif) + OVS_REQ_WRLOCK(dpif->upcall_lock) +{ + int i = 0; + + if (!dpif->handlers) { + return; + } + for (i = 0; i < dpif->n_handlers; i++) { + struct dpif_handler *handler = &dpif->handlers[i]; + close_nl_sock(handler->sock); + } + free(dpif->handlers); + dpif->handlers = NULL; + dpif->n_handlers = 0; +} + static void dpif_netlink_close(struct dpif *dpif_) { @@ -617,7 +674,11 @@ dpif_netlink_close(struct dpif *dpif_) nl_sock_destroy(dpif->port_notifier); fat_rwlock_wrlock(&dpif->upcall_lock); - destroy_all_channels(dpif); + if (DISPATCH_MODE_PER_CPU(dpif)) { + destroy_all_handlers(dpif); + } else { + destroy_all_channels(dpif); + } fat_rwlock_unlock(&dpif->upcall_lock); fat_rwlock_destroy(&dpif->upcall_lock); @@ -641,11 +702,14 @@ dpif_netlink_run(struct dpif *dpif_) { struct dpif_netlink *dpif = dpif_netlink_cast(dpif_); - if (dpif->refresh_channels) { - dpif->refresh_channels = false; - fat_rwlock_wrlock(&dpif->upcall_lock); - dpif_netlink_refresh_channels(dpif, dpif->n_handlers); - fat_rwlock_unlock(&dpif->upcall_lock); + if (!DISPATCH_MODE_PER_CPU(dpif)) { + if (dpif->refresh_channels) { + dpif->refresh_channels = false; + fat_rwlock_wrlock(&dpif->upcall_lock); + dpif_netlink_refresh_handlers_vport_dispatch(dpif, + 
dpif->n_handlers); + fat_rwlock_unlock(&dpif->upcall_lock); + } } return false; } @@ -681,6 +745,41 @@ dpif_netlink_get_stats(const struct dpif *dpif_, struct dpif_dp_stats *stats) return error; } +static int +dpif_netlink_set_handler_pids(struct dpif *dpif_, const uint32_t *upcall_pids, + uint32_t n_upcall_pids) +{ + struct dpif_netlink *dpif = dpif_netlink_cast(dpif_); + struct dpif_netlink_dp request, reply; + struct ofpbuf *bufp; + int error; + int n_cores; + + n_cores = count_cpu_cores(); + ovs_assert(n_cores == n_upcall_pids); + VLOG_DBG("Dispatch mode(per-cpu): Number of CPUs is %d", n_cores); + + dpif_netlink_dp_init(&request); + request.cmd = OVS_DP_CMD_SET; + request.name = dpif_->base_name; + request.dp_ifindex = dpif->dp_ifindex; + request.user_features = dpif->user_features | + OVS_DP_F_DISPATCH_UPCALL_PER_CPU; + + request.upcall_pids = upcall_pids; + request.n_upcall_pids = n_cores; + + error = dpif_netlink_dp_transact(&request, &reply, &bufp); + if (!error) { + dpif->user_features = reply.user_features; + ofpbuf_delete(bufp); + if (!(dpif->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)) { + return -EOPNOTSUPP; + } + } + return error; +} + static int dpif_netlink_set_features(struct dpif *dpif_, uint32_t new_features) { @@ -741,7 +840,7 @@ get_vport_type(const struct dpif_netlink_vport *vport) return "erspan"; case OVS_VPORT_TYPE_IP6ERSPAN: - return "ip6erspan"; + return "ip6erspan"; case OVS_VPORT_TYPE_IP6GRE: return "ip6gre"; @@ -807,10 +906,16 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, const char *name, uint32_t upcall_pids = 0; int error = 0; - if (dpif->handlers) { - error = create_nl_sock(dpif, &sock); - if (error) { - return error; + /* per-cpu dispatch mode does not require a socket per vport */ + if (!DISPATCH_MODE_PER_CPU(dpif)) { + if (dpif->handlers) { + error = create_nl_sock(dpif, &sock); + if (error) { + return error; + } + } + if (sock) { + upcall_pids = nl_sock_pid(sock); } } @@ -821,9 +926,6 @@ 
dpif_netlink_port_add__(struct dpif_netlink *dpif, const char *name, request.name = name; request.port_no = *port_nop; - if (sock) { - upcall_pids = nl_sock_pid(sock); - } request.n_upcall_pids = 1; request.upcall_pids = &upcall_pids; @@ -845,19 +947,21 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, const char *name, goto exit; } - error = vport_add_channel(dpif, *port_nop, sock); - if (error) { - VLOG_INFO("%s: could not add channel for port %s", - dpif_name(&dpif->dpif), name); - - /* Delete the port. */ - dpif_netlink_vport_init(&request); - request.cmd = OVS_VPORT_CMD_DEL; - request.dp_ifindex = dpif->dp_ifindex; - request.port_no = *port_nop; - dpif_netlink_vport_transact(&request, NULL, NULL); - close_nl_sock(sock); - goto exit; + if (!DISPATCH_MODE_PER_CPU(dpif)) { + error = vport_add_channel(dpif, *port_nop, sock); + if (error) { + VLOG_INFO("%s: could not add channel for port %s", + dpif_name(&dpif->dpif), name); + + /* Delete the port. */ + dpif_netlink_vport_init(&request); + request.cmd = OVS_VPORT_CMD_DEL; + request.dp_ifindex = dpif->dp_ifindex; + request.port_no = *port_nop; + dpif_netlink_vport_transact(&request, NULL, NULL); + close_nl_sock(sock); + goto exit; + } } exit: @@ -1115,6 +1219,11 @@ dpif_netlink_port_get_pid(const struct dpif *dpif_, odp_port_t port_no) const struct dpif_netlink *dpif = dpif_netlink_cast(dpif_); uint32_t ret; + /* In per-cpu dispatch mode, vports do not have an associated PID */ + if (DISPATCH_MODE_PER_CPU(dpif)) { + return 0; + } + fat_rwlock_rdlock(&dpif->upcall_lock); ret = dpif_netlink_port_get_pid__(dpif, port_no); fat_rwlock_unlock(&dpif->upcall_lock); @@ -2326,12 +2435,51 @@ dpif_netlink_handler_uninit(struct dpif_handler *handler) } #endif +static int +dpif_netlink_refresh_handlers_cpu_dispatch(struct dpif_netlink *dpif) + OVS_REQ_WRLOCK(dpif->upcall_lock) +{ + int handler_id; + int error = 0; + uint32_t n_handlers; + uint32_t *upcall_pids; + + n_handlers = count_cpu_cores(); + if (dpif->n_handlers != 
n_handlers) { + VLOG_DBG("Dispatch mode(per-cpu): initializing %d handlers", + n_handlers); + destroy_all_handlers(dpif); + upcall_pids = xzalloc(n_handlers * sizeof *upcall_pids); + dpif->handlers = xzalloc(n_handlers * sizeof *dpif->handlers); + for (handler_id = 0; handler_id < n_handlers; handler_id++) { + struct dpif_handler *handler = &dpif->handlers[handler_id]; + error = create_nl_sock(dpif, &handler->sock); + if (error) { + VLOG_ERR("Dispatch mode(per-cpu): Cannot create socket for" + "handler %d", handler_id); + continue; + } + upcall_pids[handler_id] = nl_sock_pid(handler->sock); + VLOG_DBG("Dispatch mode(per-cpu): " + "handler %d has Netlink PID of %u", + handler_id, upcall_pids[handler_id]); + } + + dpif->n_handlers = n_handlers; + error = dpif_netlink_set_handler_pids(&dpif->dpif, upcall_pids, + n_handlers); + free(upcall_pids); + } + return error; +} + /* Synchronizes 'channels' in 'dpif->handlers' with the set of vports * currently in 'dpif' in the kernel, by adding a new set of channels for * any kernel vport that lacks one and deleting any channels that have no * backing kernel vports. 
*/ static int -dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers) +dpif_netlink_refresh_handlers_vport_dispatch(struct dpif_netlink *dpif, + uint32_t n_handlers) OVS_REQ_WRLOCK(dpif->upcall_lock) { unsigned long int *keep_channels; @@ -2458,7 +2606,7 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers) } static int -dpif_netlink_recv_set__(struct dpif_netlink *dpif, bool enable) +dpif_netlink_recv_set_vport_dispatch(struct dpif_netlink *dpif, bool enable) OVS_REQ_WRLOCK(dpif->upcall_lock) { if ((dpif->handlers != NULL) == enable) { @@ -2467,7 +2615,21 @@ dpif_netlink_recv_set__(struct dpif_netlink *dpif, bool enable) destroy_all_channels(dpif); return 0; } else { - return dpif_netlink_refresh_channels(dpif, 1); + return dpif_netlink_refresh_handlers_vport_dispatch(dpif, 1); + } +} + +static int +dpif_netlink_recv_set_cpu_dispatch(struct dpif_netlink *dpif, bool enable) + OVS_REQ_WRLOCK(dpif->upcall_lock) +{ + if ((dpif->handlers != NULL) == enable) { + return 0; + } else if (!enable) { + destroy_all_handlers(dpif); + return 0; + } else { + return dpif_netlink_refresh_handlers_cpu_dispatch(dpif); } } @@ -2478,7 +2640,11 @@ dpif_netlink_recv_set(struct dpif *dpif_, bool enable) int error; fat_rwlock_wrlock(&dpif->upcall_lock); - error = dpif_netlink_recv_set__(dpif, enable); + if (DISPATCH_MODE_PER_CPU(dpif)) { + error = dpif_netlink_recv_set_cpu_dispatch(dpif, enable); + } else { + error = dpif_netlink_recv_set_vport_dispatch(dpif, enable); + } fat_rwlock_unlock(&dpif->upcall_lock); return error; @@ -2500,13 +2666,31 @@ dpif_netlink_handlers_set(struct dpif *dpif_, uint32_t n_handlers) fat_rwlock_wrlock(&dpif->upcall_lock); if (dpif->handlers) { - error = dpif_netlink_refresh_channels(dpif, n_handlers); + if (DISPATCH_MODE_PER_CPU(dpif)) { + error = dpif_netlink_refresh_handlers_cpu_dispatch(dpif); + } else { + error = dpif_netlink_refresh_handlers_vport_dispatch(dpif, + n_handlers); + } } 
fat_rwlock_unlock(&dpif->upcall_lock); return error; } +static bool +dpif_netlink_number_handlers_required(struct dpif *dpif_, uint32_t *n_handlers) +{ + struct dpif_netlink *dpif = dpif_netlink_cast(dpif_); + + if (DISPATCH_MODE_PER_CPU(dpif)) { + *n_handlers = count_cpu_cores(); + return true; + } + + return false; +} + static int dpif_netlink_queue_to_priority(const struct dpif *dpif OVS_UNUSED, uint32_t queue_id, uint32_t *priority) @@ -2669,8 +2853,59 @@ dpif_netlink_recv_windows(struct dpif_netlink *dpif, uint32_t handler_id, } #else static int -dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id, - struct dpif_upcall *upcall, struct ofpbuf *buf) +dpif_netlink_recv_cpu_dispatch(struct dpif_netlink *dpif, uint32_t handler_id, + struct dpif_upcall *upcall, struct ofpbuf *buf) + OVS_REQ_RDLOCK(dpif->upcall_lock) +{ + struct dpif_handler *handler; + int read_tries = 0; + + if (!dpif->handlers || handler_id >= dpif->n_handlers) { + return EAGAIN; + } + + handler = &dpif->handlers[handler_id]; + + for (;;) { + int dp_ifindex; + int error; + + if (++read_tries > 50) { + return EAGAIN; + } + error = nl_sock_recv(handler->sock, buf, NULL, false); + if (error == ENOBUFS) { + /* ENOBUFS typically means that we've received so many + * packets that the buffer overflowed. Try again + * immediately because there's almost certainly a packet + * waiting for us. 
*/ + report_loss(dpif, NULL, 0, handler_id); + continue; + } + + if (error) { + if (error == EAGAIN) { + break; + } + return error; + } + + error = parse_odp_packet(buf, upcall, &dp_ifindex); + if (!error && dp_ifindex == dpif->dp_ifindex) { + return 0; + } else if (error) { + return error; + } + } + + return EAGAIN; +} + +static int +dpif_netlink_recv_vport_dispatch(struct dpif_netlink *dpif, + uint32_t handler_id, + struct dpif_upcall *upcall, + struct ofpbuf *buf) OVS_REQ_RDLOCK(dpif->upcall_lock) { struct dpif_handler *handler; @@ -2755,18 +2990,24 @@ dpif_netlink_recv(struct dpif *dpif_, uint32_t handler_id, #ifdef _WIN32 error = dpif_netlink_recv_windows(dpif, handler_id, upcall, buf); #else - error = dpif_netlink_recv__(dpif, handler_id, upcall, buf); + if (DISPATCH_MODE_PER_CPU(dpif)) { + error = dpif_netlink_recv_cpu_dispatch(dpif, handler_id, upcall, buf); + } else { + error = dpif_netlink_recv_vport_dispatch(dpif, + handler_id, upcall, buf); + } #endif fat_rwlock_unlock(&dpif->upcall_lock); return error; } +#ifdef _WIN32 static void -dpif_netlink_recv_wait__(struct dpif_netlink *dpif, uint32_t handler_id) +dpif_netlink_recv_wait_windows(struct dpif_netlink *dpif, uint32_t handler_id) OVS_REQ_RDLOCK(dpif->upcall_lock) { -#ifdef _WIN32 + uint32_t i; struct dpif_windows_vport_sock *sock_pool = dpif->handlers[handler_id].vport_sock_pool; @@ -2779,13 +3020,31 @@ dpif_netlink_recv_wait__(struct dpif_netlink *dpif, uint32_t handler_id) for (i = 0; i < VPORT_SOCK_POOL_SIZE; i++) { nl_sock_wait(sock_pool[i].nl_sock, POLLIN); } -#else +} +#endif + +static void +dpif_netlink_recv_wait_vport_dispatch(struct dpif_netlink *dpif, + uint32_t handler_id) + OVS_REQ_RDLOCK(dpif->upcall_lock) +{ if (dpif->handlers && handler_id < dpif->n_handlers) { struct dpif_handler *handler = &dpif->handlers[handler_id]; poll_fd_wait(handler->epoll_fd, POLLIN); } -#endif +} + +static void +dpif_netlink_recv_wait_cpu_dispatch(struct dpif_netlink *dpif, + uint32_t handler_id) + 
OVS_REQ_RDLOCK(dpif->upcall_lock) +{ + if (dpif->handlers && handler_id < dpif->n_handlers) { + struct dpif_handler *handler = &dpif->handlers[handler_id]; + + poll_fd_wait(nl_sock_fd(handler->sock), POLLIN); + } } static void @@ -2794,12 +3053,20 @@ dpif_netlink_recv_wait(struct dpif *dpif_, uint32_t handler_id) struct dpif_netlink *dpif = dpif_netlink_cast(dpif_); fat_rwlock_rdlock(&dpif->upcall_lock); - dpif_netlink_recv_wait__(dpif, handler_id); +#ifdef _WIN32 + dpif_netlink_recv_wait_windows(dpif, handler_id); +#else + if (DISPATCH_MODE_PER_CPU(dpif)) { + dpif_netlink_recv_wait_cpu_dispatch(dpif, handler_id); + } else { + dpif_netlink_recv_wait_vport_dispatch(dpif, handler_id); + } +#endif fat_rwlock_unlock(&dpif->upcall_lock); } static void -dpif_netlink_recv_purge__(struct dpif_netlink *dpif) +dpif_netlink_recv_purge_vport_dispatch(struct dpif_netlink *dpif) OVS_REQ_WRLOCK(dpif->upcall_lock) { if (dpif->handlers) { @@ -2815,13 +3082,31 @@ dpif_netlink_recv_purge__(struct dpif_netlink *dpif) } } +static void +dpif_netlink_recv_purge_cpu_dispatch(struct dpif_netlink *dpif) + OVS_REQ_WRLOCK(dpif->upcall_lock) +{ + int handler_id; + + if (dpif->handlers) { + for (handler_id = 0; handler_id < dpif->n_handlers; handler_id++) { + struct dpif_handler *handler = &dpif->handlers[handler_id]; + nl_sock_drain(handler->sock); + } + } +} + static void dpif_netlink_recv_purge(struct dpif *dpif_) { struct dpif_netlink *dpif = dpif_netlink_cast(dpif_); fat_rwlock_wrlock(&dpif->upcall_lock); - dpif_netlink_recv_purge__(dpif); + if (DISPATCH_MODE_PER_CPU(dpif)) { + dpif_netlink_recv_purge_cpu_dispatch(dpif); + } else { + dpif_netlink_recv_purge_vport_dispatch(dpif); + } fat_rwlock_unlock(&dpif->upcall_lock); } @@ -3974,6 +4259,7 @@ const struct dpif_class dpif_netlink_class = { dpif_netlink_operate, dpif_netlink_recv_set, dpif_netlink_handlers_set, + dpif_netlink_number_handlers_required, NULL, /* set_config */ dpif_netlink_queue_to_priority, dpif_netlink_recv, @@ -4337,6 
+4623,11 @@ dpif_netlink_dp_to_ofpbuf(const struct dpif_netlink_dp *dp, struct ofpbuf *buf) nl_msg_put_u32(buf, OVS_DP_ATTR_USER_FEATURES, dp->user_features); } + if (dp->upcall_pids) { + nl_msg_put_unspec(buf, OVS_DP_ATTR_PER_CPU_PIDS, dp->upcall_pids, + sizeof *dp->upcall_pids * dp->n_upcall_pids); + } + /* Skip OVS_DP_ATTR_STATS since we never have a reason to serialize it. */ } @@ -4642,7 +4933,6 @@ dpif_netlink_flow_get_stats(const struct dpif_netlink_flow *flow, stats->used = flow->used ? get_32aligned_u64(flow->used) : 0; stats->tcp_flags = flow->tcp_flags ? *flow->tcp_flags : 0; } - /* Logs information about a packet that was recently lost in 'ch' (in * 'dpif_'). */ static void @@ -4656,13 +4946,18 @@ report_loss(struct dpif_netlink *dpif, struct dpif_channel *ch, uint32_t ch_idx, return; } - ds_init(&s); - if (ch->last_poll != LLONG_MIN) { - ds_put_format(&s, " (last polled %lld ms ago)", - time_msec() - ch->last_poll); - } + if (DISPATCH_MODE_PER_CPU(dpif)) { + VLOG_WARN("%s: lost packet on handler %u", + dpif_name(&dpif->dpif), handler_id); + } else { + ds_init(&s); + if (ch->last_poll != LLONG_MIN) { + ds_put_format(&s, " (last polled %lld ms ago)", + time_msec() - ch->last_poll); + } - VLOG_WARN("%s: lost packet on port channel %u of handler %u%s", - dpif_name(&dpif->dpif), ch_idx, handler_id, ds_cstr(&s)); - ds_destroy(&s); + VLOG_WARN("%s: lost packet on port channel %u of handler %u%s", + dpif_name(&dpif->dpif), ch_idx, handler_id, ds_cstr(&s)); + ds_destroy(&s); + } } diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h index b817fceac698..32711f19d1f6 100644 --- a/lib/dpif-provider.h +++ b/lib/dpif-provider.h @@ -357,6 +357,16 @@ struct dpif_class { * */ int (*handlers_set)(struct dpif *dpif, uint32_t n_handlers); + /* Queries 'dpif' to see if a certain number of handlers are required by + * the implementation. + * + * If a certain number of handlers are required, returns 'true' and sets + * 'n_handlers' to that number of handler threads. 
+ * + * If not, returns 'false'. + */ + bool (*number_handlers_required)(struct dpif *dpif, uint32_t *n_handlers); + /* Pass custom configuration options to the datapath. The implementation * might postpone applying the changes until run() is called. */ int (*set_config)(struct dpif *dpif, const struct smap *other_config); diff --git a/lib/dpif.c b/lib/dpif.c index 26e8bfb7db98..511383514d5b 100644 --- a/lib/dpif.c +++ b/lib/dpif.c @@ -1489,6 +1489,23 @@ dpif_handlers_set(struct dpif *dpif, uint32_t n_handlers) return error; } +/* Checks if a certain number of handlers are required. + * + * If a certain number of handlers are required, returns 'true' and sets + * 'n_handlers' to that number of handler threads. + * + * If not, returns 'false' + */ +bool +dpif_number_handlers_required(struct dpif *dpif, uint32_t *n_handlers) { + bool ret = false; + + if (dpif->dpif_class->number_handlers_required) { + ret = dpif->dpif_class->number_handlers_required(dpif, n_handlers); + } + return ret; +} + void dpif_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb, void *aux) { diff --git a/lib/dpif.h b/lib/dpif.h index f9728e67393b..7c322d20e6c7 100644 --- a/lib/dpif.h +++ b/lib/dpif.h @@ -873,6 +873,7 @@ void dpif_register_upcall_cb(struct dpif *, upcall_callback *, void *aux); int dpif_recv_set(struct dpif *, bool enable); int dpif_handlers_set(struct dpif *, uint32_t n_handlers); +bool dpif_number_handlers_required(struct dpif *, uint32_t *n_handlers); int dpif_set_config(struct dpif *, const struct smap *cfg); int dpif_port_set_config(struct dpif *, odp_port_t, const struct smap *cfg); int dpif_recv(struct dpif *, uint32_t handler_id, struct dpif_upcall *, diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index 88406fea1391..45c22d20c183 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -636,24 +636,61 @@ udpif_set_threads(struct udpif *udpif, uint32_t n_handlers_, uint32_t n_revalidators_) { 
ovs_assert(udpif); - ovs_assert(n_handlers_ && n_revalidators_); + uint32_t n_handlers_requested; + uint32_t n_revalidators_requested; + bool forced = false; + + if (dpif_number_handlers_required(udpif->dpif, &n_handlers_requested)) { + forced = true; + if (!n_revalidators_) { + n_revalidators_requested = n_handlers_requested / 4 + 1; + } else { + n_revalidators_requested = n_revalidators_; + } + } else { + int threads = MAX(count_cpu_cores(), 2); + + n_revalidators_requested = MAX(n_revalidators_, 0); + n_handlers_requested = MAX(n_handlers_, 0); - if (udpif->n_handlers != n_handlers_ - || udpif->n_revalidators != n_revalidators_) { + if (!n_revalidators_requested) { + n_revalidators_requested = n_handlers_requested + ? MAX(threads - (int) n_handlers_requested, 1) + : threads / 4 + 1; + } + + if (!n_handlers_requested) { + n_handlers_requested = MAX(threads - + (int) n_revalidators_requested, 1); + } + } + + if (udpif->n_handlers != n_handlers_requested + || udpif->n_revalidators != n_revalidators_requested) { + if (forced) { + VLOG_INFO("Overriding n-handler-threads to %u, setting " + "n-revalidator-threads to %u", n_handlers_requested, + n_revalidators_requested); + } else { + VLOG_INFO("Setting n-handler-threads to %u, setting " + "n-revalidator-threads to %u", n_handlers_requested, + n_revalidators_requested); + } udpif_stop_threads(udpif, true); } if (!udpif->handlers && !udpif->revalidators) { + VLOG_INFO("Starting %u threads", n_handlers_requested + + n_revalidators_requested); int error; - - error = dpif_handlers_set(udpif->dpif, n_handlers_); + error = dpif_handlers_set(udpif->dpif, n_handlers_requested); if (error) { VLOG_ERR("failed to configure handlers in dpif %s: %s", dpif_name(udpif->dpif), ovs_strerror(error)); return; } - - udpif_start_threads(udpif, n_handlers_, n_revalidators_); + udpif_start_threads(udpif, n_handlers_requested, + n_revalidators_requested); } } diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c index 69e5834e766c..1501eca9a989 
100644 --- a/ofproto/ofproto.c +++ b/ofproto/ofproto.c @@ -792,20 +792,8 @@ ofproto_type_set_config(const char *datapath_type, const struct smap *cfg) void ofproto_set_threads(int n_handlers_, int n_revalidators_) { - int threads = MAX(count_cpu_cores(), 2); - n_revalidators = MAX(n_revalidators_, 0); n_handlers = MAX(n_handlers_, 0); - - if (!n_revalidators) { - n_revalidators = n_handlers - ? MAX(threads - (int) n_handlers, 1) - : threads / 4 + 1; - } - - if (!n_handlers) { - n_handlers = MAX(threads - (int) n_revalidators, 1); - } } void -- 2.27.0 _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev