On 01/07/2021 21:51, Flavio Leitner wrote: > > Hi Mark, > > I've not tested this yet. > See some comments below. > > On Wed, Jun 30, 2021 at 05:56:11AM -0400, Mark Gray wrote: >> The Open vSwitch kernel module uses the upcall mechanism to send >> packets from kernel space to user space when it misses in the kernel >> space flow table. The upcall sends packets via a Netlink socket. >> Currently, a Netlink socket is created for every vport. In this way, >> there is a 1:1 mapping between a vport and a Netlink socket. >> When a packet is received by a vport, if it needs to be sent to >> user space, it is sent via the corresponding Netlink socket. >> >> This mechanism, with various iterations of the corresponding user >> space code, has seen some limitations and issues: >> >> * On systems with a large number of vports, there is correspondingly >> a large number of Netlink sockets which can limit scaling. >> (https://bugzilla.redhat.com/show_bug.cgi?id=1526306) >> * Packet reordering on upcalls. >> (https://bugzilla.redhat.com/show_bug.cgi?id=1844576) >> * A thundering herd issue. >> (https://bugzilla.redhat.com/show_bug.cgi?id=1834444) >> >> This patch introduces an alternative, feature-negotiated, upcall >> mode using a per-cpu dispatch rather than a per-vport dispatch. >> >> In this mode, the Netlink socket to be used for the upcall is >> selected based on the CPU of the thread that is executing the upcall. >> In this way, it resolves the issues above as: >> >> a) The number of Netlink sockets scales with the number of CPUs >> rather than the number of vports. >> b) Ordering per-flow is maintained as packets are distributed to >> CPUs based on mechanisms such as RSS and flows are distributed >> to a single user space thread. >> c) Packets from a flow can only wake up one user space thread. 
>> >> Reported-at: https://bugzilla.redhat.com/1844576 >> Signed-off-by: Mark Gray <mark.d.g...@redhat.com> >> --- >> >> Notes: >> v1 - Reworked based on Flavio's comments: >> * change DISPATCH_MODE_PER_CPU() to inline function >> * add `ovs-appctl` command to check dispatch mode for datapaths >> * fixed issue with userspace actions (tested using `ovs-ofctl >> monitor br0 65534 -P nxt_packet_in`) >> * update documentation as requested >> >> .../linux/compat/include/linux/openvswitch.h | 7 + >> lib/dpif-netdev.c | 1 + >> lib/dpif-netlink.c | 456 ++++++++++++++++-- >> lib/dpif-provider.h | 32 +- >> lib/dpif.c | 17 + >> lib/dpif.h | 1 + >> ofproto/ofproto-dpif-upcall.c | 51 +- >> ofproto/ofproto.c | 12 - >> vswitchd/vswitch.xml | 23 +- >> 9 files changed, 504 insertions(+), 96 deletions(-) >> >> diff --git a/datapath/linux/compat/include/linux/openvswitch.h >> b/datapath/linux/compat/include/linux/openvswitch.h >> index 875de20250ce..f29265df055e 100644 >> --- a/datapath/linux/compat/include/linux/openvswitch.h >> +++ b/datapath/linux/compat/include/linux/openvswitch.h >> @@ -89,6 +89,8 @@ enum ovs_datapath_cmd { >> * set on the datapath port (for OVS_ACTION_ATTR_MISS). Only valid on >> * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should >> * not be sent. >> + * OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when >> + * OVS_DP_F_DISPATCH_UPCALL_PER_CPU feature is set. >> * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the >> * datapath. Always present in notifications. 
>> * @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for >> the >> @@ -105,6 +107,8 @@ enum ovs_datapath_attr { >> OVS_DP_ATTR_MEGAFLOW_STATS, /* struct ovs_dp_megaflow_stats */ >> OVS_DP_ATTR_USER_FEATURES, /* OVS_DP_F_* */ >> OVS_DP_ATTR_PAD, >> + OVS_DP_ATTR_PAD2, >> + OVS_DP_ATTR_PER_CPU_PIDS, /* Netlink PIDS to receive upcalls */ >> __OVS_DP_ATTR_MAX >> }; >> >> @@ -146,6 +150,9 @@ struct ovs_vport_stats { >> /* Allow tc offload recirc sharing */ >> #define OVS_DP_F_TC_RECIRC_SHARING (1 << 2) >> >> +/* Allow per-cpu dispatch of upcalls */ >> +#define OVS_DP_F_DISPATCH_UPCALL_PER_CPU (1 << 3) >> + >> /* Fixed logical ports. */ >> #define OVSP_LOCAL ((__u32)0) >> >> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c >> index c5ab35d2a5a5..b2c2baadf4f3 100644 >> --- a/lib/dpif-netdev.c >> +++ b/lib/dpif-netdev.c >> @@ -8562,6 +8562,7 @@ const struct dpif_class dpif_netdev_class = { >> dpif_netdev_operate, >> NULL, /* recv_set */ >> NULL, /* handlers_set */ >> + NULL, /* number_handlers_required */ >> dpif_netdev_set_config, >> dpif_netdev_queue_to_priority, >> NULL, /* recv */ >> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c >> index f92905dd83fd..2399879aea3e 100644 >> --- a/lib/dpif-netlink.c >> +++ b/lib/dpif-netlink.c >> @@ -98,6 +98,8 @@ struct dpif_netlink_dp { >> const struct ovs_dp_stats *stats; /* OVS_DP_ATTR_STATS. */ >> const struct ovs_dp_megaflow_stats *megaflow_stats; >> /* OVS_DP_ATTR_MEGAFLOW_STATS.*/ >> + const uint32_t *upcall_pids; /* OVS_DP_ATTR_PER_CPU_PIDS */ >> + uint32_t n_upcall_pids; >> }; >> >> static void dpif_netlink_dp_init(struct dpif_netlink_dp *); >> @@ -113,6 +115,10 @@ static int dpif_netlink_dp_get(const struct dpif *, >> static int >> dpif_netlink_set_features(struct dpif *dpif_, uint32_t new_features); >> >> +static void >> +dpif_netlink_unixctl_dispatch_mode(struct unixctl_conn *conn, int argc, >> + const char *argv[], void *aux); >> + >> struct dpif_netlink_flow { >> /* Generic Netlink header. 
*/ >> uint8_t cmd; >> @@ -178,11 +184,16 @@ struct dpif_windows_vport_sock { >> #endif >> >> struct dpif_handler { >> + /* per-vport dispatch mode */ >> struct epoll_event *epoll_events; >> int epoll_fd; /* epoll fd that includes channel socks. >> */ >> int n_events; /* Num events returned by epoll_wait(). */ >> int event_offset; /* Offset into 'epoll_events'. */ >> >> + /* per-cpu dispatch mode */ >> + struct nl_sock *sock; /* Each handler thread holds one netlink >> + socket. */ >> + >> #ifdef _WIN32 >> /* Pool of sockets. */ >> struct dpif_windows_vport_sock *vport_sock_pool; >> @@ -201,6 +212,8 @@ struct dpif_netlink { >> struct fat_rwlock upcall_lock; >> struct dpif_handler *handlers; >> uint32_t n_handlers; /* Num of upcall handlers. */ >> + >> + /* Per-vport dispatch mode */ >> struct dpif_channel *channels; /* Array of channels for each port. */ >> int uc_array_size; /* Size of 'handler->channels' and */ >> /* 'handler->epoll_events'. */ >> @@ -241,8 +254,12 @@ static int open_dpif(const struct dpif_netlink_dp *, >> struct dpif **); >> static uint32_t dpif_netlink_port_get_pid(const struct dpif *, >> odp_port_t port_no); >> static void dpif_netlink_handler_uninit(struct dpif_handler *handler); >> -static int dpif_netlink_refresh_channels(struct dpif_netlink *, >> - uint32_t n_handlers); >> +static int dpif_netlink_refresh_handlers_vport_dispatch(struct dpif_netlink >> *, >> + uint32_t >> n_handlers); >> +static void destroy_all_channels(struct dpif_netlink *); >> +static int dpif_netlink_refresh_handlers_cpu_dispatch(struct dpif_netlink >> *); >> +static void destroy_all_handlers(struct dpif_netlink *); >> + >> static void dpif_netlink_vport_to_ofpbuf(const struct dpif_netlink_vport *, >> struct ofpbuf *); >> static int dpif_netlink_vport_from_ofpbuf(struct dpif_netlink_vport *, >> @@ -294,6 +311,11 @@ dpif_netlink_cast(const struct dpif *dpif) >> return CONTAINER_OF(dpif, struct dpif_netlink, dpif); >> } >> >> +static inline bool >> 
+dpif_netlink_upcall_per_cpu(const struct dpif_netlink *dpif) { >> + return !!((dpif)->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU); >> +} >> + >> static int >> dpif_netlink_enumerate(struct sset *all_dps, >> const struct dpif_class *dpif_class OVS_UNUSED) >> @@ -357,11 +379,35 @@ dpif_netlink_open(const struct dpif_class *class >> OVS_UNUSED, const char *name, >> dp_request.cmd = OVS_DP_CMD_SET; >> } >> >> + /* The Open vSwitch kernel module has two modes for dispatching upcalls: >> + * per-vport and per-cpu. >> + * >> + * When dispatching upcalls per-vport, the kernel will >> + * send the upcall via a Netlink socket that has been selected based on >> the >> + * vport that received the packet that is causing the upcall. >> + * >> + * When dispatching upcall per-cpu, the kernel will send the upcall via >> + * a Netlink socket that has been selected based on the cpu that >> received >> + * the packet that is causing the upcall. >> + * >> + * First we test to see if the kernel module supports per-cpu >> dispatching >> + * (the preferred method). If it does not support per-cpu dispatching, >> we >> + * fall back to the per-vport dispatch mode. 
>> + */ >> dp_request.user_features |= OVS_DP_F_UNALIGNED; >> - dp_request.user_features |= OVS_DP_F_VPORT_PIDS; >> + dp_request.user_features &= ~OVS_DP_F_VPORT_PIDS; >> + dp_request.user_features |= OVS_DP_F_DISPATCH_UPCALL_PER_CPU; >> error = dpif_netlink_dp_transact(&dp_request, &dp, &buf); >> if (error) { >> - return error; >> + dp_request.user_features &= ~OVS_DP_F_DISPATCH_UPCALL_PER_CPU; >> + dp_request.user_features |= OVS_DP_F_VPORT_PIDS; >> + error = dpif_netlink_dp_transact(&dp_request, &dp, &buf); >> + if (error) { >> + return error; >> + } >> + VLOG_INFO("Dispatch mode(per-vport)"); >> + } else { >> + VLOG_INFO("Dispatch mode:(per-cpu)"); >> } >> >> error = open_dpif(&dp, dpifp); >> @@ -609,6 +655,24 @@ destroy_all_channels(struct dpif_netlink *dpif) >> dpif->uc_array_size = 0; >> } >> >> +static void >> +destroy_all_handlers(struct dpif_netlink *dpif) >> + OVS_REQ_WRLOCK(dpif->upcall_lock) >> +{ >> + int i = 0; >> + >> + if (!dpif->handlers) { >> + return; >> + } >> + for (i = 0; i < dpif->n_handlers; i++) { >> + struct dpif_handler *handler = &dpif->handlers[i]; >> + close_nl_sock(handler->sock); >> + } >> + free(dpif->handlers); >> + dpif->handlers = NULL; >> + dpif->n_handlers = 0; >> +} >> + >> static void >> dpif_netlink_close(struct dpif *dpif_) >> { >> @@ -617,7 +681,11 @@ dpif_netlink_close(struct dpif *dpif_) >> nl_sock_destroy(dpif->port_notifier); >> >> fat_rwlock_wrlock(&dpif->upcall_lock); >> - destroy_all_channels(dpif); >> + if (dpif_netlink_upcall_per_cpu(dpif)) { >> + destroy_all_handlers(dpif); >> + } else { >> + destroy_all_channels(dpif); >> + } >> fat_rwlock_unlock(&dpif->upcall_lock); >> >> fat_rwlock_destroy(&dpif->upcall_lock); >> @@ -641,11 +709,14 @@ dpif_netlink_run(struct dpif *dpif_) >> { >> struct dpif_netlink *dpif = dpif_netlink_cast(dpif_); >> >> - if (dpif->refresh_channels) { >> - dpif->refresh_channels = false; >> - fat_rwlock_wrlock(&dpif->upcall_lock); >> - dpif_netlink_refresh_channels(dpif, dpif->n_handlers); 
>> - fat_rwlock_unlock(&dpif->upcall_lock); >> + if (!dpif_netlink_upcall_per_cpu(dpif)) { >> + if (dpif->refresh_channels) { >> + dpif->refresh_channels = false; >> + fat_rwlock_wrlock(&dpif->upcall_lock); >> + dpif_netlink_refresh_handlers_vport_dispatch(dpif, >> + dpif->n_handlers); >> + fat_rwlock_unlock(&dpif->upcall_lock); >> + } >> } >> return false; >> } >> @@ -681,6 +752,41 @@ dpif_netlink_get_stats(const struct dpif *dpif_, struct >> dpif_dp_stats *stats) >> return error; >> } >> >> +static int >> +dpif_netlink_set_handler_pids(struct dpif *dpif_, const uint32_t >> *upcall_pids, >> + uint32_t n_upcall_pids) >> +{ >> + struct dpif_netlink *dpif = dpif_netlink_cast(dpif_); >> + struct dpif_netlink_dp request, reply; >> + struct ofpbuf *bufp; >> + int error; >> + int n_cores; >> + >> + n_cores = count_cpu_cores(); >> + ovs_assert(n_cores == n_upcall_pids); >> + VLOG_DBG("Dispatch mode(per-cpu): Number of CPUs is %d", n_cores); >> + >> + dpif_netlink_dp_init(&request); >> + request.cmd = OVS_DP_CMD_SET; >> + request.name = dpif_->base_name; >> + request.dp_ifindex = dpif->dp_ifindex; >> + request.user_features = dpif->user_features | >> + OVS_DP_F_DISPATCH_UPCALL_PER_CPU; >> + >> + request.upcall_pids = upcall_pids; >> + request.n_upcall_pids = n_cores; >> + >> + error = dpif_netlink_dp_transact(&request, &reply, &bufp); >> + if (!error) { >> + dpif->user_features = reply.user_features; >> + ofpbuf_delete(bufp); >> + if (!(dpif->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)) { > > nit: perhaps use dpif_netlink_upcall_per_cpu() ?
I must have missed that one > > >> + return -EOPNOTSUPP; >> + } >> + } >> + return error; >> +} >> + >> static int >> dpif_netlink_set_features(struct dpif *dpif_, uint32_t new_features) >> { >> @@ -741,7 +847,7 @@ get_vport_type(const struct dpif_netlink_vport *vport) >> return "erspan"; >> >> case OVS_VPORT_TYPE_IP6ERSPAN: >> - return "ip6erspan"; >> + return "ip6erspan"; >> >> case OVS_VPORT_TYPE_IP6GRE: >> return "ip6gre"; >> @@ -807,10 +913,16 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, >> const char *name, >> uint32_t upcall_pids = 0; >> int error = 0; >> >> - if (dpif->handlers) { >> - error = create_nl_sock(dpif, &sock); >> - if (error) { >> - return error; >> + /* per-cpu dispatch mode does not require a socket per vport */ >> + if (!dpif_netlink_upcall_per_cpu(dpif)) { >> + if (dpif->handlers) { >> + error = create_nl_sock(dpif, &sock); >> + if (error) { >> + return error; >> + } >> + } >> + if (sock) { >> + upcall_pids = nl_sock_pid(sock); >> } >> } >> >> @@ -821,9 +933,6 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, const >> char *name, >> request.name = name; >> >> request.port_no = *port_nop; >> - if (sock) { >> - upcall_pids = nl_sock_pid(sock); >> - } >> request.n_upcall_pids = 1; >> request.upcall_pids = &upcall_pids; >> >> @@ -845,19 +954,21 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, >> const char *name, >> goto exit; >> } >> >> - error = vport_add_channel(dpif, *port_nop, sock); >> - if (error) { >> - VLOG_INFO("%s: could not add channel for port %s", >> - dpif_name(&dpif->dpif), name); >> - >> - /* Delete the port. 
*/ >> - dpif_netlink_vport_init(&request); >> - request.cmd = OVS_VPORT_CMD_DEL; >> - request.dp_ifindex = dpif->dp_ifindex; >> - request.port_no = *port_nop; >> - dpif_netlink_vport_transact(&request, NULL, NULL); >> - close_nl_sock(sock); >> - goto exit; >> + if (!dpif_netlink_upcall_per_cpu(dpif)) { >> + error = vport_add_channel(dpif, *port_nop, sock); >> + if (error) { >> + VLOG_INFO("%s: could not add channel for port %s", >> + dpif_name(&dpif->dpif), name); >> + >> + /* Delete the port. */ >> + dpif_netlink_vport_init(&request); >> + request.cmd = OVS_VPORT_CMD_DEL; >> + request.dp_ifindex = dpif->dp_ifindex; >> + request.port_no = *port_nop; >> + dpif_netlink_vport_transact(&request, NULL, NULL); >> + close_nl_sock(sock); >> + goto exit; >> + } >> } >> >> exit: >> @@ -1115,6 +1226,15 @@ dpif_netlink_port_get_pid(const struct dpif *dpif_, >> odp_port_t port_no) >> const struct dpif_netlink *dpif = dpif_netlink_cast(dpif_); >> uint32_t ret; >> >> + /* In per-cpu dispatch mode, vports do not have an associated PID */ >> + if (dpif_netlink_upcall_per_cpu(dpif)) { >> + /* In per-cpu dispatch mode, this will be ignored as kernel space >> will >> + * select the PID before sending to user space. We set to 0xFFFFFFFF >> + * as 0 is rejected by kernel space as an invalid PID. >> + */ >> + return 0xFFFFFFFF; > > Can we have a define like below? > > /* This PID is not used by the kernel datapath when using dispatch per CPU, > * but it is required to be set (not zero). 
*/ > #define DPIF_NETLINK_PER_CPU_PID UINT32_MAX > > >> + } >> + >> fat_rwlock_rdlock(&dpif->upcall_lock); >> ret = dpif_netlink_port_get_pid__(dpif, port_no); >> fat_rwlock_unlock(&dpif->upcall_lock); >> @@ -2326,12 +2446,51 @@ dpif_netlink_handler_uninit(struct dpif_handler >> *handler) >> } >> #endif >> >> +static int >> +dpif_netlink_refresh_handlers_cpu_dispatch(struct dpif_netlink *dpif) >> + OVS_REQ_WRLOCK(dpif->upcall_lock) >> +{ >> + int handler_id; >> + int error = 0; >> + uint32_t n_handlers; >> + uint32_t *upcall_pids; >> + >> + n_handlers = count_cpu_cores(); >> + if (dpif->n_handlers != n_handlers) { >> + VLOG_DBG("Dispatch mode(per-cpu): initializing %d handlers", >> + n_handlers); >> + destroy_all_handlers(dpif); >> + upcall_pids = xzalloc(n_handlers * sizeof *upcall_pids); >> + dpif->handlers = xzalloc(n_handlers * sizeof *dpif->handlers); >> + for (handler_id = 0; handler_id < n_handlers; handler_id++) { >> + struct dpif_handler *handler = &dpif->handlers[handler_id]; >> + error = create_nl_sock(dpif, &handler->sock); >> + if (error) { >> + VLOG_ERR("Dispatch mode(per-cpu): Cannot create socket for" >> + "handler %d", handler_id); >> + continue; >> + } >> + upcall_pids[handler_id] = nl_sock_pid(handler->sock); >> + VLOG_DBG("Dispatch mode(per-cpu): " >> + "handler %d has Netlink PID of %u", >> + handler_id, upcall_pids[handler_id]); >> + } >> + >> + dpif->n_handlers = n_handlers; >> + error = dpif_netlink_set_handler_pids(&dpif->dpif, upcall_pids, >> + n_handlers); >> + free(upcall_pids); >> + } >> + return error; >> +} >> + >> /* Synchronizes 'channels' in 'dpif->handlers' with the set of vports >> * currently in 'dpif' in the kernel, by adding a new set of channels for >> * any kernel vport that lacks one and deleting any channels that have no >> * backing kernel vports. 
*/ >> static int >> -dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t >> n_handlers) >> +dpif_netlink_refresh_handlers_vport_dispatch(struct dpif_netlink *dpif, >> + uint32_t n_handlers) >> OVS_REQ_WRLOCK(dpif->upcall_lock) >> { >> unsigned long int *keep_channels; >> @@ -2458,7 +2617,7 @@ dpif_netlink_refresh_channels(struct dpif_netlink >> *dpif, uint32_t n_handlers) >> } >> >> static int >> -dpif_netlink_recv_set__(struct dpif_netlink *dpif, bool enable) >> +dpif_netlink_recv_set_vport_dispatch(struct dpif_netlink *dpif, bool enable) >> OVS_REQ_WRLOCK(dpif->upcall_lock) >> { >> if ((dpif->handlers != NULL) == enable) { >> @@ -2467,7 +2626,21 @@ dpif_netlink_recv_set__(struct dpif_netlink *dpif, >> bool enable) >> destroy_all_channels(dpif); >> return 0; >> } else { >> - return dpif_netlink_refresh_channels(dpif, 1); >> + return dpif_netlink_refresh_handlers_vport_dispatch(dpif, 1); >> + } >> +} >> + >> +static int >> +dpif_netlink_recv_set_cpu_dispatch(struct dpif_netlink *dpif, bool enable) >> + OVS_REQ_WRLOCK(dpif->upcall_lock) >> +{ >> + if ((dpif->handlers != NULL) == enable) { >> + return 0; >> + } else if (!enable) { >> + destroy_all_handlers(dpif); >> + return 0; >> + } else { >> + return dpif_netlink_refresh_handlers_cpu_dispatch(dpif); >> } >> } >> >> @@ -2478,7 +2651,11 @@ dpif_netlink_recv_set(struct dpif *dpif_, bool enable) >> int error; >> >> fat_rwlock_wrlock(&dpif->upcall_lock); >> - error = dpif_netlink_recv_set__(dpif, enable); >> + if (dpif_netlink_upcall_per_cpu(dpif)) { >> + error = dpif_netlink_recv_set_cpu_dispatch(dpif, enable); >> + } else { >> + error = dpif_netlink_recv_set_vport_dispatch(dpif, enable); >> + } >> fat_rwlock_unlock(&dpif->upcall_lock); >> >> return error; >> @@ -2500,13 +2677,31 @@ dpif_netlink_handlers_set(struct dpif *dpif_, >> uint32_t n_handlers) >> >> fat_rwlock_wrlock(&dpif->upcall_lock); >> if (dpif->handlers) { >> - error = dpif_netlink_refresh_channels(dpif, n_handlers); >> + if 
(dpif_netlink_upcall_per_cpu(dpif)) { >> + error = dpif_netlink_refresh_handlers_cpu_dispatch(dpif); >> + } else { >> + error = dpif_netlink_refresh_handlers_vport_dispatch(dpif, >> + >> n_handlers); >> + } >> } >> fat_rwlock_unlock(&dpif->upcall_lock); >> >> return error; >> } >> >> +static bool >> +dpif_netlink_number_handlers_required(struct dpif *dpif_, uint32_t >> *n_handlers) >> +{ >> + struct dpif_netlink *dpif = dpif_netlink_cast(dpif_); >> + >> + if (dpif_netlink_upcall_per_cpu(dpif)) { >> + *n_handlers = count_cpu_cores(); >> + return true; >> + } >> + >> + return false; >> +} >> + >> static int >> dpif_netlink_queue_to_priority(const struct dpif *dpif OVS_UNUSED, >> uint32_t queue_id, uint32_t *priority) >> @@ -2669,8 +2864,59 @@ dpif_netlink_recv_windows(struct dpif_netlink *dpif, >> uint32_t handler_id, >> } >> #else >> static int >> -dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id, >> - struct dpif_upcall *upcall, struct ofpbuf *buf) >> +dpif_netlink_recv_cpu_dispatch(struct dpif_netlink *dpif, uint32_t >> handler_id, >> + struct dpif_upcall *upcall, struct ofpbuf >> *buf) >> + OVS_REQ_RDLOCK(dpif->upcall_lock) >> +{ >> + struct dpif_handler *handler; >> + int read_tries = 0; >> + >> + if (!dpif->handlers || handler_id >= dpif->n_handlers) { >> + return EAGAIN; >> + } >> + >> + handler = &dpif->handlers[handler_id]; >> + >> + for (;;) { >> + int dp_ifindex; >> + int error; >> + >> + if (++read_tries > 50) { >> + return EAGAIN; >> + } >> + error = nl_sock_recv(handler->sock, buf, NULL, false); >> + if (error == ENOBUFS) { >> + /* ENOBUFS typically means that we've received so many >> + * packets that the buffer overflowed. Try again >> + * immediately because there's almost certainly a packet >> + * waiting for us. 
*/ >> + report_loss(dpif, NULL, 0, handler_id); >> + continue; >> + } >> + >> + if (error) { >> + if (error == EAGAIN) { >> + break; >> + } >> + return error; >> + } >> + >> + error = parse_odp_packet(buf, upcall, &dp_ifindex); >> + if (!error && dp_ifindex == dpif->dp_ifindex) { >> + return 0; >> + } else if (error) { >> + return error; >> + } >> + } >> + >> + return EAGAIN; >> +} >> + >> +static int >> +dpif_netlink_recv_vport_dispatch(struct dpif_netlink *dpif, >> + uint32_t handler_id, >> + struct dpif_upcall *upcall, >> + struct ofpbuf *buf) >> OVS_REQ_RDLOCK(dpif->upcall_lock) >> { >> struct dpif_handler *handler; >> @@ -2755,18 +3001,24 @@ dpif_netlink_recv(struct dpif *dpif_, uint32_t >> handler_id, >> #ifdef _WIN32 >> error = dpif_netlink_recv_windows(dpif, handler_id, upcall, buf); >> #else >> - error = dpif_netlink_recv__(dpif, handler_id, upcall, buf); >> + if (dpif_netlink_upcall_per_cpu(dpif)) { >> + error = dpif_netlink_recv_cpu_dispatch(dpif, handler_id, upcall, >> buf); >> + } else { >> + error = dpif_netlink_recv_vport_dispatch(dpif, >> + handler_id, upcall, buf); >> + } >> #endif >> fat_rwlock_unlock(&dpif->upcall_lock); >> >> return error; >> } >> >> +#ifdef _WIN32 >> static void >> -dpif_netlink_recv_wait__(struct dpif_netlink *dpif, uint32_t handler_id) >> +dpif_netlink_recv_wait_windows(struct dpif_netlink *dpif, uint32_t >> handler_id) >> OVS_REQ_RDLOCK(dpif->upcall_lock) >> { >> -#ifdef _WIN32 >> + >> uint32_t i; >> struct dpif_windows_vport_sock *sock_pool = >> dpif->handlers[handler_id].vport_sock_pool; >> @@ -2779,13 +3031,31 @@ dpif_netlink_recv_wait__(struct dpif_netlink *dpif, >> uint32_t handler_id) >> for (i = 0; i < VPORT_SOCK_POOL_SIZE; i++) { >> nl_sock_wait(sock_pool[i].nl_sock, POLLIN); >> } >> -#else >> +} >> +#endif >> + >> +static void >> +dpif_netlink_recv_wait_vport_dispatch(struct dpif_netlink *dpif, >> + uint32_t handler_id) >> + OVS_REQ_RDLOCK(dpif->upcall_lock) >> +{ >> if (dpif->handlers && handler_id < 
dpif->n_handlers) { >> struct dpif_handler *handler = &dpif->handlers[handler_id]; >> >> poll_fd_wait(handler->epoll_fd, POLLIN); >> } >> -#endif >> +} >> + >> +static void >> +dpif_netlink_recv_wait_cpu_dispatch(struct dpif_netlink *dpif, >> + uint32_t handler_id) >> + OVS_REQ_RDLOCK(dpif->upcall_lock) >> +{ >> + if (dpif->handlers && handler_id < dpif->n_handlers) { >> + struct dpif_handler *handler = &dpif->handlers[handler_id]; >> + >> + poll_fd_wait(nl_sock_fd(handler->sock), POLLIN); >> + } >> } >> >> static void >> @@ -2794,12 +3064,20 @@ dpif_netlink_recv_wait(struct dpif *dpif_, uint32_t >> handler_id) >> struct dpif_netlink *dpif = dpif_netlink_cast(dpif_); >> >> fat_rwlock_rdlock(&dpif->upcall_lock); >> - dpif_netlink_recv_wait__(dpif, handler_id); >> +#ifdef _WIN32 >> + dpif_netlink_recv_wait_windows(dpif, handler_id); >> +#else >> + if (dpif_netlink_upcall_per_cpu(dpif)) { >> + dpif_netlink_recv_wait_cpu_dispatch(dpif, handler_id); >> + } else { >> + dpif_netlink_recv_wait_vport_dispatch(dpif, handler_id); >> + } >> +#endif >> fat_rwlock_unlock(&dpif->upcall_lock); >> } >> >> static void >> -dpif_netlink_recv_purge__(struct dpif_netlink *dpif) >> +dpif_netlink_recv_purge_vport_dispatch(struct dpif_netlink *dpif) >> OVS_REQ_WRLOCK(dpif->upcall_lock) >> { >> if (dpif->handlers) { >> @@ -2815,13 +3093,31 @@ dpif_netlink_recv_purge__(struct dpif_netlink *dpif) >> } >> } >> >> +static void >> +dpif_netlink_recv_purge_cpu_dispatch(struct dpif_netlink *dpif) >> + OVS_REQ_WRLOCK(dpif->upcall_lock) >> +{ >> + int handler_id; >> + >> + if (dpif->handlers) { >> + for (handler_id = 0; handler_id < dpif->n_handlers; handler_id++) { >> + struct dpif_handler *handler = &dpif->handlers[handler_id]; >> + nl_sock_drain(handler->sock); >> + } >> + } >> +} >> + >> static void >> dpif_netlink_recv_purge(struct dpif *dpif_) >> { >> struct dpif_netlink *dpif = dpif_netlink_cast(dpif_); >> >> fat_rwlock_wrlock(&dpif->upcall_lock); >> - dpif_netlink_recv_purge__(dpif); >> + if 
(dpif_netlink_upcall_per_cpu(dpif)) { >> + dpif_netlink_recv_purge_cpu_dispatch(dpif); >> + } else { >> + dpif_netlink_recv_purge_vport_dispatch(dpif); >> + } >> fat_rwlock_unlock(&dpif->upcall_lock); >> } >> >> @@ -3978,6 +4274,7 @@ const struct dpif_class dpif_netlink_class = { >> dpif_netlink_operate, >> dpif_netlink_recv_set, >> dpif_netlink_handlers_set, >> + dpif_netlink_number_handlers_required, >> NULL, /* set_config */ >> dpif_netlink_queue_to_priority, >> dpif_netlink_recv, >> @@ -4065,6 +4362,9 @@ dpif_netlink_init(void) >> >> ovs_tunnels_out_of_tree = dpif_netlink_rtnl_probe_oot_tunnels(); >> >> + unixctl_command_register("dpif-netlink/dispatch-mode", "", 0, 0, >> + dpif_netlink_unixctl_dispatch_mode, NULL); >> + > > Can we please create the documentation? > I think this will require a new file (lib/dpif-netlink-unixctl.man) > with a small section for this new command. I added this new section to ovs-vswitchd.8 > > > >> ovsthread_once_done(&once); >> } >> >> @@ -4341,6 +4641,11 @@ dpif_netlink_dp_to_ofpbuf(const struct >> dpif_netlink_dp *dp, struct ofpbuf *buf) >> nl_msg_put_u32(buf, OVS_DP_ATTR_USER_FEATURES, dp->user_features); >> } >> >> + if (dp->upcall_pids) { >> + nl_msg_put_unspec(buf, OVS_DP_ATTR_PER_CPU_PIDS, dp->upcall_pids, >> + sizeof *dp->upcall_pids * dp->n_upcall_pids); >> + } >> + >> /* Skip OVS_DP_ATTR_STATS since we never have a reason to serialize it. 
>> */ >> } >> >> @@ -4660,13 +4965,58 @@ report_loss(struct dpif_netlink *dpif, struct >> dpif_channel *ch, uint32_t ch_idx, >> return; >> } >> >> - ds_init(&s); >> - if (ch->last_poll != LLONG_MIN) { >> - ds_put_format(&s, " (last polled %lld ms ago)", >> - time_msec() - ch->last_poll); >> + if (dpif_netlink_upcall_per_cpu(dpif)) { >> + VLOG_WARN("%s: lost packet on handler %u", >> + dpif_name(&dpif->dpif), handler_id); >> + } else { >> + ds_init(&s); >> + if (ch->last_poll != LLONG_MIN) { >> + ds_put_format(&s, " (last polled %lld ms ago)", >> + time_msec() - ch->last_poll); >> + } >> + >> + VLOG_WARN("%s: lost packet on port channel %u of handler %u%s", >> + dpif_name(&dpif->dpif), ch_idx, handler_id, ds_cstr(&s)); > > nit: indentation issue. Thanks > >> + ds_destroy(&s); >> + } >> +} >> + >> +static void >> +dpif_netlink_unixctl_dispatch_mode(struct unixctl_conn *conn, >> + int argc OVS_UNUSED, >> + const char *argv[] OVS_UNUSED, >> + void *aux OVS_UNUSED) >> +{ >> + struct ds reply = DS_EMPTY_INITIALIZER; >> + struct nl_dump dump; >> + uint64_t reply_stub[NL_DUMP_BUFSIZE / 8]; >> + struct ofpbuf msg, buf; >> + int error; >> + >> + error = dpif_netlink_init(); >> + if (error) { >> + return; >> + } >> + >> + ofpbuf_use_stub(&buf, reply_stub, sizeof reply_stub); >> + dpif_netlink_dp_dump_start(&dump); >> + while (nl_dump_next(&dump, &msg, &buf)) { >> + struct dpif_netlink_dp dp; >> + if (!dpif_netlink_dp_from_ofpbuf(&dp, &msg)) { >> + ds_put_format(&reply, "%s: ", dp.name); >> + if (dp.user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU) { >> + ds_put_format(&reply, "per-cpu dispatch mode"); >> + } else { >> + ds_put_format(&reply, "per-vport dispatch mode"); >> + } >> + ds_put_format(&reply, "\n"); >> + } >> + } >> + ofpbuf_uninit(&buf); >> + error = nl_dump_done(&dump); >> + if (!error) { >> + unixctl_command_reply(conn, ds_cstr(&reply)); >> } >> >> - VLOG_WARN("%s: lost packet on port channel %u of handler %u%s", >> - dpif_name(&dpif->dpif), ch_idx, handler_id, 
ds_cstr(&s)); >> - ds_destroy(&s); >> + ds_destroy(&reply); >> } >> diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h >> index b817fceac698..21068534bf4d 100644 >> --- a/lib/dpif-provider.h >> +++ b/lib/dpif-provider.h >> @@ -336,26 +336,28 @@ struct dpif_class { >> * updating flows as necessary if it does this. */ >> int (*recv_set)(struct dpif *dpif, bool enable); >> >> - /* Refreshes the poll loops and Netlink sockets associated to each port, >> - * when the number of upcall handlers (upcall receiving thread) is >> changed >> - * to 'n_handlers' and receiving packets for 'dpif' is enabled by >> + /* Attempts to refresh the poll loops and Netlink sockets used for >> handling >> + * upcalls when the number of upcall handlers (upcall receiving thread) >> is >> + * changed to 'n_handlers' and receiving packets for 'dpif' is enabled >> by >> * recv_set(). >> * >> - * Since multiple upcall handlers can read upcalls simultaneously from >> - * 'dpif', each port can have multiple Netlink sockets, one per upcall >> - * handler. So, handlers_set() is responsible for the following tasks: >> + * A dpif implementation may choose to ignore 'n_handlers' while >> returning >> + * success. >> * >> - * When receiving upcall is enabled, extends or creates the >> - * configuration to support: >> - * >> - * - 'n_handlers' Netlink sockets for each port. >> + * The method for distribution of upcalls between handler threads is >> + * specific to the dpif implementation. >> + */ >> + int (*handlers_set)(struct dpif *dpif, uint32_t n_handlers); >> + >> + /* Queries 'dpif' to see if a certain number of handlers are required by >> + * the implementation. >> * >> - * - 'n_handlers' poll loops, one for each upcall handler. >> + * If a certain number of handlers are required, returns 'true' and sets >> + * 'n_handlers' to that number of handler threads. >> * >> - * - registering the Netlink sockets for the same upcall handler >> to >> - * the corresponding poll loop. 
>> - * */ >> - int (*handlers_set)(struct dpif *dpif, uint32_t n_handlers); >> + * If not, returns 'false'. >> + */ >> + bool (*number_handlers_required)(struct dpif *dpif, uint32_t >> *n_handlers); >> >> /* Pass custom configuration options to the datapath. The >> implementation >> * might postpone applying the changes until run() is called. */ >> diff --git a/lib/dpif.c b/lib/dpif.c >> index 26e8bfb7db98..511383514d5b 100644 >> --- a/lib/dpif.c >> +++ b/lib/dpif.c >> @@ -1489,6 +1489,23 @@ dpif_handlers_set(struct dpif *dpif, uint32_t >> n_handlers) >> return error; >> } >> >> +/* Checks if a certain number of handlers are required. >> + * >> + * If a certain number of handlers are required, returns 'true' and sets >> + * 'n_handlers' to that number of handler threads. >> + * >> + * If not, returns 'false' >> + */ >> +bool >> +dpif_number_handlers_required(struct dpif *dpif, uint32_t *n_handlers) { >> + bool ret = false; >> + >> + if (dpif->dpif_class->number_handlers_required) { >> + ret = dpif->dpif_class->number_handlers_required(dpif, n_handlers); >> + } >> + return ret; >> +} >> + >> void >> dpif_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb, void >> *aux) >> { >> diff --git a/lib/dpif.h b/lib/dpif.h >> index f9728e67393b..7c322d20e6c7 100644 >> --- a/lib/dpif.h >> +++ b/lib/dpif.h >> @@ -873,6 +873,7 @@ void dpif_register_upcall_cb(struct dpif *, >> upcall_callback *, void *aux); >> >> int dpif_recv_set(struct dpif *, bool enable); >> int dpif_handlers_set(struct dpif *, uint32_t n_handlers); >> +bool dpif_number_handlers_required(struct dpif *, uint32_t *n_handlers); >> int dpif_set_config(struct dpif *, const struct smap *cfg); >> int dpif_port_set_config(struct dpif *, odp_port_t, const struct smap *cfg); >> int dpif_recv(struct dpif *, uint32_t handler_id, struct dpif_upcall *, >> diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c >> index d22f7f07361f..f9145bf23185 100644 >> --- a/ofproto/ofproto-dpif-upcall.c >> 
+++ b/ofproto/ofproto-dpif-upcall.c >> @@ -636,24 +636,61 @@ udpif_set_threads(struct udpif *udpif, uint32_t >> n_handlers_, >> uint32_t n_revalidators_) >> { >> ovs_assert(udpif); >> - ovs_assert(n_handlers_ && n_revalidators_); >> + uint32_t n_handlers_requested; >> + uint32_t n_revalidators_requested; >> + bool forced = false; >> + >> + if (dpif_number_handlers_required(udpif->dpif, &n_handlers_requested)) { >> + forced = true; >> + if (!n_revalidators_) { >> + n_revalidators_requested = n_handlers_requested / 4 + 1; >> + } else { >> + n_revalidators_requested = n_revalidators_; >> + } >> + } else { >> + int threads = MAX(count_cpu_cores(), 2); >> + >> + n_revalidators_requested = MAX(n_revalidators_, 0); >> + n_handlers_requested = MAX(n_handlers_, 0); >> >> - if (udpif->n_handlers != n_handlers_ >> - || udpif->n_revalidators != n_revalidators_) { >> + if (!n_revalidators_requested) { >> + n_revalidators_requested = n_handlers_requested >> + ? MAX(threads - (int) n_handlers_requested, 1) >> + : threads / 4 + 1; >> + } >> + >> + if (!n_handlers_requested) { >> + n_handlers_requested = MAX(threads - >> + (int) n_revalidators_requested, 1); >> + } >> + } >> + >> + if (udpif->n_handlers != n_handlers_requested >> + || udpif->n_revalidators != n_revalidators_requested) { >> + if (forced) { >> + VLOG_INFO("Overriding n-handler-threads to %u, setting " >> + "n-revalidator-threads to %u", n_handlers_requested, >> + n_revalidators_requested); >> + } else { >> + VLOG_INFO("Setting n-handler-threads to %u, setting " >> + "n-revalidator-threads to %u", n_handlers_requested, >> + n_revalidators_requested); >> + } >> udpif_stop_threads(udpif, true); >> } >> >> if (!udpif->handlers && !udpif->revalidators) { >> + VLOG_INFO("Starting %u threads", n_handlers_requested + >> + n_revalidators_requested); >> int error; >> - >> - error = dpif_handlers_set(udpif->dpif, n_handlers_); >> + error = dpif_handlers_set(udpif->dpif, n_handlers_requested); >> if (error) { >> VLOG_ERR("failed 
to configure handlers in dpif %s: %s", >> dpif_name(udpif->dpif), ovs_strerror(error)); >> return; >> } >> - >> - udpif_start_threads(udpif, n_handlers_, n_revalidators_); >> + udpif_start_threads(udpif, n_handlers_requested, >> + n_revalidators_requested); >> } >> } >> >> diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c >> index 53002f082b52..bd6103b1c88c 100644 >> --- a/ofproto/ofproto.c >> +++ b/ofproto/ofproto.c >> @@ -792,20 +792,8 @@ ofproto_type_set_config(const char *datapath_type, >> const struct smap *cfg) >> void >> ofproto_set_threads(int n_handlers_, int n_revalidators_) >> { >> - int threads = MAX(count_cpu_cores(), 2); >> - >> n_revalidators = MAX(n_revalidators_, 0); >> n_handlers = MAX(n_handlers_, 0); >> - >> - if (!n_revalidators) { >> - n_revalidators = n_handlers >> - ? MAX(threads - (int) n_handlers, 1) >> - : threads / 4 + 1; >> - } >> - >> - if (!n_handlers) { >> - n_handlers = MAX(threads - (int) n_revalidators, 1); >> - } >> } >> >> void >> diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml >> index 4597a215d936..329c00e40a58 100644 >> --- a/vswitchd/vswitch.xml >> +++ b/vswitchd/vswitch.xml >> @@ -544,9 +544,9 @@ >> <column name="other_config" key="n-handler-threads" >> type='{"type": "integer", "minInteger": 1}'> >> <p> >> - Specifies the number of threads for software datapaths to use for >> - handling new flows. The default the number of online CPU cores >> minus >> - the number of revalidators. >> + Attempts to specify the number of threads for software datapaths >> to >> + use for handling new flows. Some datapaths may choose to ignore >> + this and it will be set to a sensible option for the datapath >> type. >> </p> >> <p> >> This configuration is per datapath. 
If you have more than one >> @@ -560,12 +560,17 @@ >> <column name="other_config" key="n-revalidator-threads" >> type='{"type": "integer", "minInteger": 1}'> >> <p> >> - Specifies the number of threads for software datapaths to use for >> - revalidating flows in the datapath. Typically, there is a direct >> - correlation between the number of revalidator threads, and the >> number >> - of flows allowed in the datapath. The default is the number of >> cpu >> - cores divided by four plus one. If >> <code>n-handler-threads</code> is >> - set, the default changes to the number of cpu cores minus the >> number >> + Attempts to specify the number of threads for software datapaths >> to >> + use for revalidating flows in the datapath. Some datapaths may >> + choose to ignore this and it will be set to a sensible option for >> the >> + datapath type. >> + </p> >> + <p> >> + Typically, there is a direct correlation between the >> + number of revalidator threads, and the number of flows allowed in >> the >> + datapath. The default is the number of cpu cores divided by four >> + plus one. If <code>n-handler-threads</code> is set, the default >> + changes to the number of cpu cores minus the number >> of handler threads. >> </p> >> <p> >> -- >> 2.27.0 >> > _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev