Hi Aaron,
Thanks for the patch. I ran some basic tests here and they passed. I could see only one handler thread becoming active with a single upcall. See my comment below. On Tue, Jul 21, 2020 at 07:27:41PM -0400, Aaron Conole wrote: > Currently, the channel handlers are polled globally. On some > systems, this causes a thundering herd issue where multiple > handler threads become active, only to do no work and immediately > sleep. > > The approach here is to push the netlink socket channels to discreet > handler threads to process, rather than polling on every thread. > This will eliminate the need to wake multiple threads. > > To check: > > ip netns add left > ip netns add right > ip link add center-left type veth peer name left0 > ip link add center-right type veth peer name right0 > ip link set left0 netns left > ip link set right0 netns right > ip link set center-left up > ip link set center-right up > ip -n left ip link set left0 up > ip -n left ip addr add 172.31.110.10/24 dev left0 > ip -n right ip link set right0 up > ip -n right ip addr add 172.31.110.11/24 dev right0 > > ovs-vsctl add-br br0 > ovs-vsctl add-port br0 center-right > ovs-vsctl add-port br0 center-left > > # in one terminal > perf record -e sched:sched_wakeup,irq:softirq_entry -ag > > # in a separate terminal > ip netns exec left arping -I left0 -c 1 172.31.110.11 > > # in the perf terminal after exiting > perf script > > Look for the number of 'handler' threads which were made active. > > Suggested-by: Ben Pfaff <b...@ovn.org> > Reported-by: David Ahern <dsah...@gmail.com> > Reported-at: > https://mail.openvswitch.org/pipermail/ovs-dev/2019-December/365857.html > Cc: Matteo Croce <technobo...@gmail.com> > Cc: Flavio Leitner <f...@sysclose.org> > Fixes: 69c51582f ("dpif-netlink: don't allocate per thread netlink sockets") > Signed-off-by: Aaron Conole <acon...@redhat.com> > --- > v2: Oops - forgot to commit my whitespace cleanups. > > lib/dpif-netlink.c | 289 ++++++++++++++++++++++++++++----------------- > 1 file changed, 179 insertions(+), 110 deletions(-) > > diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c > index 7da4fb54d9..71d2805427 100644 > --- a/lib/dpif-netlink.c > +++ b/lib/dpif-netlink.c > @@ -183,6 +183,10 @@ struct dpif_handler { > int n_events; /* Num events returned by epoll_wait(). */ > int event_offset; /* Offset into 'epoll_events'. */ > > + struct dpif_channel *channels; /* Array of channels for each port. */ > + uint32_t *port_idx_map; /* Port idx to channel idx. */ > + size_t n_channels; > + > #ifdef _WIN32 > /* Pool of sockets. */ > struct dpif_windows_vport_sock *vport_sock_pool; > @@ -202,8 +206,6 @@ struct dpif_netlink { > struct dpif_handler *handlers; > uint32_t n_handlers; /* Num of upcall handlers. */ > struct dpif_channel *channels; /* Array of channels for each port. */ Shouldn't the above channels be removed? > - int uc_array_size; /* Size of 'handler->channels' and */ > - /* 'handler->epoll_events'. */ > > /* Change notification. */ > struct nl_sock *port_notifier; /* vport multicast group subscriber. */ > @@ -287,6 +289,55 @@ close_nl_sock(struct nl_sock *sock) > #endif > } > > +static uint32_t > +dpif_handler_port_idx_max(const struct dpif_handler *handler, > + struct dpif_channel **out_chn) Looks like there is no caller passing out_chn, so why that is needed? > +{ > + uint32_t max = 0; > + size_t i; > + > + for (i = 0; i < handler->n_channels; ++i) { OVS does have some code using '++i', but the majority of it uses 'i++', can you please swap them? > + if (handler->channels[i].sock && > + handler->port_idx_map[i] > max) { > + max = handler->port_idx_map[i]; > + if (out_chn) { > + *out_chn = &handler->channels[i]; > + } > + } > + } > + > + return max; > +} > + > +static size_t > +dpif_handler_active_channels(const struct dpif_handler *handler) > +{ > + size_t i, ret = 0; Perhaps 'count' or 'num' would be a better name. > + > + for (i = 0; i < handler->n_channels; ++i) { > + if (handler->channels[i].sock) { > + ret++; > + } > + } > + > + return ret; > +} > + > +static struct dpif_channel * > +dpif_handler_has_port_idx(const struct dpif_handler *handler, uint32_t idx) > +{ > + size_t i; > + > + for (i = 0; i < handler->n_channels; ++i) { > + if (handler->channels[i].sock && > + handler->port_idx_map[i] == idx) { > + return &handler->channels[i]; > + } > + } > + > + return NULL; > +} > + > static struct dpif_netlink * > dpif_netlink_cast(const struct dpif *dpif) > { > @@ -452,90 +503,86 @@ static bool > vport_get_pid(struct dpif_netlink *dpif, uint32_t port_idx, > uint32_t *upcall_pid) > { > - /* Since the nl_sock can only be assigned in either all > - * or none "dpif" channels, the following check > - * would suffice. */ > - if (!dpif->channels[port_idx].sock) { > - return false; > - } > + size_t i; > + > ovs_assert(!WINDOWS || dpif->n_handlers <= 1); > > - *upcall_pid = nl_sock_pid(dpif->channels[port_idx].sock); > + /* The 'port_idx' should only be valid for a single handler. */ > + for (i = 0; i < dpif->n_handlers; ++i) { > + struct dpif_channel *channel = > + dpif_handler_has_port_idx(&dpif->handlers[i], port_idx); > + > + if (channel) { > + *upcall_pid = nl_sock_pid(channel->sock); > + return true; > + } > + } > > - return true; > + return false; > } > > static int > vport_add_channel(struct dpif_netlink *dpif, odp_port_t port_no, > struct nl_sock *sock) > { > - struct epoll_event event; > uint32_t port_idx = odp_to_u32(port_no); > + struct dpif_handler *handler; > + struct dpif_channel *channel; > + struct epoll_event event; > + int error = 0; > size_t i; > - int error; > > if (dpif->handlers == NULL) { > close_nl_sock(sock); > - return 0; > + return error; > } > > - /* We assume that the datapath densely chooses port numbers, which can > - * therefore be used as an index into 'channels' and 'epoll_events' of > - * 'dpif'. */ > - if (port_idx >= dpif->uc_array_size) { > - uint32_t new_size = port_idx + 1; > + handler = &dpif->handlers[0]; > > - if (new_size > MAX_PORTS) { > - VLOG_WARN_RL(&error_rl, "%s: datapath port %"PRIu32" too big", > - dpif_name(&dpif->dpif), port_no); > - return EFBIG; > - } Could you please explain why checking MAX_PORTS is not important anymore? If it isn't, then this should remove MAX_PORT declaration as that was the only use. > - > - dpif->channels = xrealloc(dpif->channels, > - new_size * sizeof *dpif->channels); > + for (i = 0; i < dpif->n_handlers; ++i) { > + if (dpif_handler_active_channels(&dpif->handlers[i]) < > + dpif_handler_active_channels(handler)) > + handler = &dpif->handlers[i]; > + } > > - for (i = dpif->uc_array_size; i < new_size; i++) { > - dpif->channels[i].sock = NULL; > + channel = NULL; > + for (i = 0; i < handler->n_channels; ++i) { > + if (!handler->channels[i].sock) { > + channel = &handler->channels[i]; > + handler->port_idx_map[i] = port_idx; > } > + } > > - for (i = 0; i < dpif->n_handlers; i++) { > - struct dpif_handler *handler = &dpif->handlers[i]; > - > - handler->epoll_events = xrealloc(handler->epoll_events, > - new_size * sizeof *handler->epoll_events); > - > - } > - dpif->uc_array_size = new_size; > + if (channel == NULL) { > + size_t new_size = handler->n_channels + 1; > + handler->channels = xrealloc(handler->channels, > + new_size * sizeof *handler->channels); > + handler->epoll_events = xrealloc(handler->epoll_events, > + new_size * > + sizeof *handler->epoll_events); > + handler->port_idx_map = xrealloc(handler->port_idx_map, > + new_size * > + sizeof *handler->port_idx_map); > + channel = &handler->channels[handler->n_channels]; > + handler->port_idx_map[handler->n_channels] = port_idx; > + i = handler->n_channels; > + handler->n_channels += 1; perhaps handler->n_channels = new_size to remove the magic number? > } > > + channel->sock = sock; > + channel->last_poll = LLONG_MIN; > + > memset(&event, 0, sizeof event); > event.events = EPOLLIN | EPOLLEXCLUSIVE; > - event.data.u32 = port_idx; > - > - for (i = 0; i < dpif->n_handlers; i++) { > - struct dpif_handler *handler = &dpif->handlers[i]; > + event.data.u32 = i; > > #ifndef _WIN32 > - if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock), > - &event) < 0) { > - error = errno; > - goto error; > - } > -#endif > - } > - dpif->channels[port_idx].sock = sock; > - dpif->channels[port_idx].last_poll = LLONG_MIN; > - > - return 0; > - > -error: > -#ifndef _WIN32 > - while (i--) { > - epoll_ctl(dpif->handlers[i].epoll_fd, EPOLL_CTL_DEL, > - nl_sock_fd(sock), NULL); > + if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock), > + &event) < 0) { > + error = errno; > + channel->sock = NULL; > } > #endif > - dpif->channels[port_idx].sock = NULL; > > return error; > } > @@ -544,69 +591,81 @@ static void > vport_del_channels(struct dpif_netlink *dpif, odp_port_t port_no) > { > uint32_t port_idx = odp_to_u32(port_no); > + struct dpif_handler *handler = NULL; > + struct dpif_channel *channel = NULL; > size_t i; > > - if (!dpif->handlers || port_idx >= dpif->uc_array_size > - || !dpif->channels[port_idx].sock) { > + if (!dpif->handlers) { > return; > } > > - for (i = 0; i < dpif->n_handlers; i++) { > - struct dpif_handler *handler = &dpif->handlers[i]; > + for (i = 0; i < dpif->n_handlers && channel == NULL; i++) { > + handler = &dpif->handlers[i]; > + channel = dpif_handler_has_port_idx(handler, port_idx); > + > + if (channel == NULL) { > + continue; > + } > + > #ifndef _WIN32 > epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL, > - nl_sock_fd(dpif->channels[port_idx].sock), NULL); > + nl_sock_fd(channel->sock), NULL); > #endif > handler->event_offset = handler->n_events = 0; > } > + > + if (channel) { > #ifndef _WIN32 > - nl_sock_destroy(dpif->channels[port_idx].sock); > + nl_sock_destroy(channel->sock); > #endif > - dpif->channels[port_idx].sock = NULL; > + channel->sock = NULL; Ok, so it doesn't resize the memory when ports are removed. I was wondering if it wouldn't scale better if we use hmaps and id_pools instead: --- a/lib/dpif-netlink.c +++ b/lib/dpif-netlink.c @@ -183,9 +183,7 @@ struct dpif_handler { int n_events; /* Num events returned by epoll_wait(). */ int event_offset; /* Offset into 'epoll_events'. */ - struct dpif_channel *channels; /* Array of channels for each port. */ - uint32_t *port_idx_map; /* Port idx to channel idx. */ - size_t n_channels; + struct hmap channels; /* Channels for each port. */ #ifdef _WIN32 /* Pool of sockets. */ @@ -205,7 +203,7 @@ struct dpif_netlink { struct fat_rwlock upcall_lock; struct dpif_handler *handlers; uint32_t n_handlers; /* Num of upcall handlers. */ - struct dpif_channel *channels; /* Array of channels for each port. */ + struct id_pool *port_ids; /* Pool of port ids. */ /* Change notification. */ struct nl_sock *port_notifier; /* vport multicast group subscriber. */ For instance, we can find the least loaded with just hmap_count() instead of iterating on all handlers and ports. Also when we add/remove a port, we could use HMAP_FOR_EACH_WITH_HASH() using port_id as hash to find the channel. What do you think? > + } > } > > static void > destroy_all_channels(struct dpif_netlink *dpif) > OVS_REQ_WRLOCK(dpif->upcall_lock) > { > - unsigned int i; > + size_t i, j; > > if (!dpif->handlers) { > return; > } > > - for (i = 0; i < dpif->uc_array_size; i++ ) { > - struct dpif_netlink_vport vport_request; > - uint32_t upcall_pids = 0; > + for (i = 0; i < dpif->n_handlers; ++i) { nitpick: Same comment as before to change to 'i++' > + struct dpif_handler *handler = &dpif->handlers[i]; > > - if (!dpif->channels[i].sock) { > - continue; > - } > + for (j = 0; j < handler->n_channels; ++j) { and here > + struct dpif_netlink_vport vport_request; > + uint32_t upcall_pids = 0; > > - /* Turn off upcalls. */ > - dpif_netlink_vport_init(&vport_request); > - vport_request.cmd = OVS_VPORT_CMD_SET; > - vport_request.dp_ifindex = dpif->dp_ifindex; > - vport_request.port_no = u32_to_odp(i); > - vport_request.n_upcall_pids = 1; > - vport_request.upcall_pids = &upcall_pids; > - dpif_netlink_vport_transact(&vport_request, NULL, NULL); > + if (!handler->channels[j].sock) { > + continue; > + } > > - vport_del_channels(dpif, u32_to_odp(i)); > - } > + /* Turn off upcalls. */ > + dpif_netlink_vport_init(&vport_request); > + vport_request.cmd = OVS_VPORT_CMD_SET; > + vport_request.dp_ifindex = dpif->dp_ifindex; > + vport_request.port_no = u32_to_odp(handler->port_idx_map[j]); > + vport_request.n_upcall_pids = 1; > + vport_request.upcall_pids = &upcall_pids; > + dpif_netlink_vport_transact(&vport_request, NULL, NULL); > > - for (i = 0; i < dpif->n_handlers; i++) { > - struct dpif_handler *handler = &dpif->handlers[i]; > + vport_del_channels(dpif, u32_to_odp(handler->port_idx_map[j])); > + } > > dpif_netlink_handler_uninit(handler); > + handler->n_channels = 0; > + > free(handler->epoll_events); > + free(handler->port_idx_map); > + free(handler->channels); > } > - free(dpif->channels); > + > free(dpif->handlers); > dpif->handlers = NULL; > - dpif->channels = NULL; > dpif->n_handlers = 0; > - dpif->uc_array_size = 0; > } > > static void > @@ -1084,23 +1143,31 @@ dpif_netlink_port_get_pid__(const struct dpif_netlink > *dpif, > odp_port_t port_no) > OVS_REQ_RDLOCK(dpif->upcall_lock) > { > + struct dpif_channel *min_channel = NULL, *found_channel = NULL; > uint32_t port_idx = odp_to_u32(port_no); > uint32_t pid = 0; > > - if (dpif->handlers && dpif->uc_array_size > 0) { > - /* The ODPP_NONE "reserved" port number uses the "ovs-system"'s > - * channel, since it is not heavily loaded. */ > - uint32_t idx = port_idx >= dpif->uc_array_size ? 0 : port_idx; > - > - /* Needs to check in case the socket pointer is changed in between > - * the holding of upcall_lock. A known case happens when the main > - * thread deletes the vport while the handler thread is handling > - * the upcall from that port. */ > - if (dpif->channels[idx].sock) { > - pid = nl_sock_pid(dpif->channels[idx].sock); > + if (dpif->handlers) { > + size_t i; > + for (i = 0; i < dpif->n_handlers; ++i) { > + if (dpif_handler_has_port_idx(&dpif->handlers[i], port_idx)) { > + found_channel = dpif_handler_has_port_idx(&dpif->handlers[i], > + port_idx); > + } > + > + if (dpif_handler_has_port_idx(&dpif->handlers[i], 0)) { > + min_channel = dpif_handler_has_port_idx(&dpif->handlers[i], > + 0); > + } > } The comment about the ODPP_NONE reserved port seems still relevant. Also, if it finds a channel, couldn't it just break the loop? > } > > + if (found_channel) { > + pid = nl_sock_pid(found_channel->sock); > + } else if (min_channel) { > + pid = nl_sock_pid(min_channel->sock); > + } > + > return pid; > } > > @@ -2376,7 +2443,7 @@ dpif_netlink_refresh_channels(struct dpif_netlink > *dpif, uint32_t n_handlers) > { > unsigned long int *keep_channels; > struct dpif_netlink_vport vport; > - size_t keep_channels_nbits; > + size_t keep_channels_nbits = 0; > struct nl_dump dump; > uint64_t reply_stub[NL_DUMP_BUFSIZE / 8]; > struct ofpbuf buf; > @@ -2414,9 +2481,11 @@ dpif_netlink_refresh_channels(struct dpif_netlink > *dpif, uint32_t n_handlers) > struct dpif_handler *handler = &dpif->handlers[i]; > > handler->event_offset = handler->n_events = 0; > + if (keep_channels_nbits < dpif_handler_port_idx_max(handler, NULL)) { > + keep_channels_nbits = dpif_handler_port_idx_max(handler, NULL); > + } > } > > - keep_channels_nbits = dpif->uc_array_size; > keep_channels = bitmap_allocate(keep_channels_nbits); > > ofpbuf_use_stub(&buf, reply_stub, sizeof reply_stub); > @@ -2426,8 +2495,7 @@ dpif_netlink_refresh_channels(struct dpif_netlink > *dpif, uint32_t n_handlers) > uint32_t upcall_pid; > int error; > > - if (port_no >= dpif->uc_array_size > - || !vport_get_pid(dpif, port_no, &upcall_pid)) { > + if (!vport_get_pid(dpif, port_no, &upcall_pid)) { > struct nl_sock *sock; > error = create_nl_sock(dpif, &sock); > > @@ -2721,14 +2789,14 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, > uint32_t handler_id, > } > > handler = &dpif->handlers[handler_id]; > - if (handler->event_offset >= handler->n_events) { > + if (handler->event_offset >= handler->n_events && handler->n_channels) { > int retval; > > handler->event_offset = handler->n_events = 0; > > do { > retval = epoll_wait(handler->epoll_fd, handler->epoll_events, > - dpif->uc_array_size, 0); > + handler->n_channels, 0); > } while (retval < 0 && errno == EINTR); > > if (retval < 0) { > @@ -2741,7 +2809,7 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t > handler_id, > > while (handler->event_offset < handler->n_events) { > int idx = handler->epoll_events[handler->event_offset].data.u32; > - struct dpif_channel *ch = &dpif->channels[idx]; > + struct dpif_channel *ch = &handler->channels[idx]; > > handler->event_offset++; > > @@ -2845,12 +2913,13 @@ dpif_netlink_recv_purge__(struct dpif_netlink *dpif) > if (dpif->handlers) { > size_t i; > > - if (!dpif->channels[0].sock) { > - return; > - } > - for (i = 0; i < dpif->uc_array_size; i++ ) { > + for (i = 0; i < dpif->n_handlers; i++) { > + struct dpif_handler *handler = &dpif->handlers[i]; > + size_t j; > > - nl_sock_drain(dpif->channels[i].sock); > + for (j = 0; j < handler->n_channels; ++j) { > + nl_sock_drain(handler->channels[j].sock); > + } > } > } > } > -- > 2.25.4 > > _______________________________________________ > dev mailing list > d...@openvswitch.org > https://mail.openvswitch.org/mailman/listinfo/ovs-dev -- fbl _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev