Flavio Leitner <f...@sysclose.org> writes:

> Mark,
>
> This patch is v4 and 2/3. Before it was only one patch, so I wonder
> if v4 is only one patch, or if this has multiple patches (2/3).
I see the following also went out:

  https://mail.openvswitch.org/pipermail/ovs-dev/2020-August/374258.html

and

  https://mail.openvswitch.org/pipermail/ovs-dev/2020-August/374260.html

Maybe they didn't show up in your email client?

> BTW, differently from the DPDK community, we start new threads for new
> patch versions.
>
> fbl
>
> On Fri, Aug 28, 2020 at 06:03:04PM +0100, Mark Gray wrote:
>> From: Aaron Conole <acon...@redhat.com>
>>
>> Currently, the channel handlers are polled globally.  On some
>> systems, this causes a thundering herd issue where multiple
>> handler threads become active, only to do no work and immediately
>> sleep.
>>
>> The approach here is to push the netlink socket channels to discrete
>> handler threads to process, rather than polling on every thread.
>> This will eliminate the need to wake multiple threads.
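
To make the mechanics easier to follow: each vport socket is now
registered with exactly one handler's epoll set, chosen as the handler
that currently owns the fewest channels.  A rough standalone sketch of
just that selection step (the simplified 'struct handler' below is a
hypothetical stand-in for struct dpif_handler, not code from the
patch):

    /* Hypothetical, simplified sketch: 'n_channels' stands in for what
     * the patch computes with dpif_handler_channels_count(). */
    #include <stddef.h>

    struct handler {
        size_t n_channels;   /* Channels this handler currently owns. */
    };

    /* Return the handler owning the fewest channels; a new channel's
     * socket is then added to (only) this handler's epoll set. */
    static struct handler *
    least_loaded_handler(struct handler handlers[], size_t n_handlers)
    {
        struct handler *best = &handlers[0];
        size_t i;

        for (i = 1; i < n_handlers; i++) {
            if (handlers[i].n_channels < best->n_channels) {
                best = &handlers[i];
            }
        }
        return best;
    }

Because each socket lives in a single epoll set, an upcall on a port
can wake at most one handler thread.
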
>>
>> To check:
>>
>>   ip netns add left
>>   ip netns add right
>>   ip link add center-left type veth peer name left0 netns left
>>   ip link add center-right type veth peer name right0 netns right
>>   ip link set center-left up
>>   ip link set center-right up
>>   ip -n left link set left0 up
>>   ip -n left addr add 172.31.110.10/24 dev left0
>>   ip -n right link set right0 up
>>   ip -n right addr add 172.31.110.11/24 dev right0
>>
>>   ovs-vsctl add-br br0
>>   ovs-vsctl add-port br0 center-right
>>   ovs-vsctl add-port br0 center-left
>>
>>   # in one terminal
>>   perf record -e sched:sched_wakeup,irq:softirq_entry -ag
>>
>>   # in a separate terminal
>>   ip netns exec left arping -I left0 -c 1 172.31.110.11
>>
>>   # in the perf terminal after exiting
>>   perf script
>>
>> Look for the number of 'handler' threads which were made active.
>>
>> Suggested-by: Ben Pfaff <b...@ovn.org>
>> Co-authored-by: Mark Gray <mark.d.g...@redhat.com>
>> Reported-by: David Ahern <dsah...@gmail.com>
>> Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2019-December/365857.html
>> Cc: Matteo Croce <technobo...@gmail.com>
>> Cc: Flavio Leitner <f...@sysclose.org>
>> Fixes: 69c51582f ("dpif-netlink: don't allocate per thread netlink sockets")
>> Signed-off-by: Aaron Conole <acon...@redhat.com>
>> Signed-off-by: Mark Gray <mark.d.g...@redhat.com>
>> ---
>> v2: Oops - forgot to commit my whitespace cleanups.
>> v3: One-port latency results, as per Matteo's comments:
>>       Stock:
>>         min/avg/max/mdev = 21.5/36.5/96.5/1.0 us
>>       With Patch:
>>         min/avg/max/mdev = 5.3/9.7/98.4/0.5 us
>> v4: Oops - first email did not send
>>
>>  lib/dpif-netlink.c | 311 ++++++++++++++++++++++++++-------------------
>>  1 file changed, 182 insertions(+), 129 deletions(-)
>>
>> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
>> index 7da4fb54d..d928d7aa1 100644
>> --- a/lib/dpif-netlink.c
>> +++ b/lib/dpif-netlink.c
>> @@ -37,6 +37,7 @@
>>  #include "dpif-provider.h"
>>  #include "fat-rwlock.h"
>>  #include "flow.h"
>> +#include "id-pool.h"
>>  #include "netdev-linux.h"
>>  #include "netdev-offload.h"
>>  #include "netdev-provider.h"
>> @@ -160,6 +161,8 @@ static void dpif_netlink_flow_to_dpif_flow(struct dpif_flow *,
>>
>>  /* One of the dpif channels between the kernel and userspace. */
>>  struct dpif_channel {
>> +    struct hmap_node hmap_node;
>> +    uint32_t port_idx;
>>      struct nl_sock *sock;       /* Netlink socket. */
>>      long long int last_poll;    /* Last time this channel was polled. */
>>  };
>> @@ -183,6 +186,8 @@ struct dpif_handler {
>>      int n_events;               /* Num events returned by epoll_wait(). */
>>      int event_offset;           /* Offset into 'epoll_events'. */
>>
>> +    struct hmap channels;       /* Map of channels for each port. */
>> +
>>  #ifdef _WIN32
>>      /* Pool of sockets. */
>>      struct dpif_windows_vport_sock *vport_sock_pool;
>> @@ -201,9 +206,8 @@ struct dpif_netlink {
>>      struct fat_rwlock upcall_lock;
>>      struct dpif_handler *handlers;
>>      uint32_t n_handlers;           /* Num of upcall handlers. */
>> -    struct dpif_channel *channels; /* Array of channels for each port. */
>> -    int uc_array_size;             /* Size of 'handler->channels' and */
>> -                                   /* 'handler->epoll_events'. */
>> +
>> +    struct id_pool *port_ids;      /* Set of added port ids. */
>>
>>      /* Change notification. */
>>      struct nl_sock *port_notifier; /* vport multicast group subscriber. */
>> @@ -287,6 +291,53 @@ close_nl_sock(struct nl_sock *sock)
>>  #endif
>>  }
>>
>> +static size_t
>> +dpif_handler_channels_count(const struct dpif_handler *handler)
>> +{
>> +    return hmap_count(&handler->channels);
>> +}
>> +
>> +static struct dpif_channel *
>> +dpif_handler_channel_get(struct dpif_handler *handler, uint32_t idx)
>> +{
>> +    struct dpif_channel *channel;
>> +
>> +    HMAP_FOR_EACH_WITH_HASH (channel, hmap_node, hash_int(idx, 0),
>> +                             &handler->channels) {
>> +        if (channel->port_idx == idx) {
>> +            return channel;
>> +        }
>> +    }
>> +
>> +    return NULL;
>> +}
>> +
>> +static void
>> +dpif_handler_channel_add(struct dpif_handler *handler, uint32_t idx,
>> +                         struct nl_sock *sock)
>> +{
>> +    struct dpif_channel *channel = xmalloc(sizeof *channel);
>> +
>> +    channel->port_idx = idx;
>> +    channel->sock = sock;
>> +    channel->last_poll = LLONG_MIN;
>> +
>> +    hmap_insert(&handler->channels, &channel->hmap_node, hash_int(idx, 0));
>> +}
>> +
>> +static void
>> +dpif_handler_channel_del(struct dpif_handler *handler, uint32_t port_idx)
>> +{
>> +    struct dpif_channel *channel = dpif_handler_channel_get(handler, port_idx);
>> +
>> +    hmap_remove(&(handler->channels), &(channel->hmap_node));
>> +
>> +    free(channel);
>> +}
>> +
>> +
>> +
>> +
>>  static struct dpif_netlink *
>>  dpif_netlink_cast(const struct dpif *dpif)
>>  {
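
The helpers above are the whole port_idx -> channel interface.  A tiny
self-contained illustration of the same hmap pattern, compilable only
against an OVS source tree (the include paths and 'toy_channel' are
assumptions of this sketch, not part of the patch):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #include "hash.h"             /* hash_int(). */
    #include "openvswitch/hmap.h" /* struct hmap + iteration macros. */
    #include "util.h"             /* xmalloc(). */

    struct toy_channel {
        struct hmap_node hmap_node; /* In 'channels', hashed on port_idx. */
        uint32_t port_idx;
    };

    static struct toy_channel *
    toy_channel_get(struct hmap *channels, uint32_t idx)
    {
        struct toy_channel *ch;

        HMAP_FOR_EACH_WITH_HASH (ch, hmap_node, hash_int(idx, 0), channels) {
            if (ch->port_idx == idx) {
                return ch;
            }
        }
        return NULL;
    }

    int
    main(void)
    {
        struct hmap channels = HMAP_INITIALIZER(&channels);
        struct toy_channel *ch = xmalloc(sizeof *ch);

        ch->port_idx = 42;
        hmap_insert(&channels, &ch->hmap_node, hash_int(ch->port_idx, 0));

        printf("42 -> %s\n", toy_channel_get(&channels, 42) ? "hit" : "miss");
        printf(" 7 -> %s\n", toy_channel_get(&channels, 7) ? "hit" : "miss");

        hmap_remove(&channels, &ch->hmap_node);
        free(ch);
        hmap_destroy(&channels);
        return 0;
    }
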
>> @@ -385,6 +436,11 @@ open_dpif(const struct dpif_netlink_dp *dp, struct dpif **dpifp)
>>
>>      dpif->dp_ifindex = dp->dp_ifindex;
>>      dpif->user_features = dp->user_features;
>> +    dpif->port_ids = id_pool_create(0, MAX_PORTS);
>> +    if (!dpif->port_ids) {
>> +        return ENOMEM;
>> +    }
>> +
>>      *dpifp = &dpif->dpif;
>>
>>      return 0;
>> @@ -452,161 +508,144 @@ static bool
>>  vport_get_pid(struct dpif_netlink *dpif, uint32_t port_idx,
>>                uint32_t *upcall_pid)
>>  {
>> -    /* Since the nl_sock can only be assigned in either all
>> -     * or none "dpif" channels, the following check
>> -     * would suffice. */
>> -    if (!dpif->channels[port_idx].sock) {
>> +    size_t i;
>> +    ovs_assert(!WINDOWS || dpif->n_handlers <= 1);
>> +
>> +    if (!id_pool_check(dpif->port_ids, port_idx)) {
>>          return false;
>>      }
>> -    ovs_assert(!WINDOWS || dpif->n_handlers <= 1);
>>
>> -    *upcall_pid = nl_sock_pid(dpif->channels[port_idx].sock);
>> +    /* The 'port_idx' should only be valid for a single handler. */
>> +    for (i = 0; i < dpif->n_handlers; i++) {
>> +        struct dpif_channel *channel =
>> +            dpif_handler_channel_get(&dpif->handlers[i], port_idx);
>> +
>> +        if (channel) {
>> +            *upcall_pid = nl_sock_pid(channel->sock);
>> +            return true;
>> +        }
>> +    }
>>
>> -    return true;
>> +    return false;
>>  }
>>
>>  static int
>>  vport_add_channel(struct dpif_netlink *dpif, odp_port_t port_no,
>>                    struct nl_sock *sock)
>>  {
>> -    struct epoll_event event;
>>      uint32_t port_idx = odp_to_u32(port_no);
>> +    struct dpif_handler *handler;
>> +    struct epoll_event event;
>>      size_t i;
>> -    int error;
>>
>> -    if (dpif->handlers == NULL) {
>> +    if (dpif->handlers == NULL ||
>> +        id_pool_check(dpif->port_ids, port_idx)) {
>>          close_nl_sock(sock);
>> -        return 0;
>> +        return EINVAL;
>>      }
>>
>> -    /* We assume that the datapath densely chooses port numbers, which can
>> -     * therefore be used as an index into 'channels' and 'epoll_events' of
>> -     * 'dpif'. */
>> -    if (port_idx >= dpif->uc_array_size) {
>> -        uint32_t new_size = port_idx + 1;
>> -
>> -        if (new_size > MAX_PORTS) {
>> +    if (port_idx >= MAX_PORTS) {
>>          VLOG_WARN_RL(&error_rl, "%s: datapath port %"PRIu32" too big",
>>                       dpif_name(&dpif->dpif), port_no);
>>          return EFBIG;
>> -        }
>> -
>> -        dpif->channels = xrealloc(dpif->channels,
>> -                                  new_size * sizeof *dpif->channels);
>> +    }
>>
>> -        for (i = dpif->uc_array_size; i < new_size; i++) {
>> -            dpif->channels[i].sock = NULL;
>> -        }
>> +    handler = &dpif->handlers[0];
>>
>> -        for (i = 0; i < dpif->n_handlers; i++) {
>> -            struct dpif_handler *handler = &dpif->handlers[i];
>> +    for (i = 0; i < dpif->n_handlers; i++) {
>> +        if (dpif_handler_channels_count(&dpif->handlers[i]) <
>> +            dpif_handler_channels_count(handler))
>> +            handler = &dpif->handlers[i];
>> +    }
>>
>> -            handler->epoll_events = xrealloc(handler->epoll_events,
>> -                new_size * sizeof *handler->epoll_events);
>> +    dpif_handler_channel_add(handler, port_idx, sock);
>>
>> -        }
>> -        dpif->uc_array_size = new_size;
>> -    }
>> +    handler->epoll_events = xrealloc(handler->epoll_events,
>> +                                     dpif_handler_channels_count(handler) *
>> +                                     sizeof *handler->epoll_events);
>>
>>      memset(&event, 0, sizeof event);
>>      event.events = EPOLLIN | EPOLLEXCLUSIVE;
>>      event.data.u32 = port_idx;
>>
>> -    for (i = 0; i < dpif->n_handlers; i++) {
>> -        struct dpif_handler *handler = &dpif->handlers[i];
>> -
>>  #ifndef _WIN32
>> -        if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock),
>> -                      &event) < 0) {
>> -            error = errno;
>> -            goto error;
>> -        }
>> -#endif
>> -    }
>> -    dpif->channels[port_idx].sock = sock;
>> -    dpif->channels[port_idx].last_poll = LLONG_MIN;
>> -
>> -    return 0;
>> -
>> -error:
>> -#ifndef _WIN32
>> -    while (i--) {
>> -        epoll_ctl(dpif->handlers[i].epoll_fd, EPOLL_CTL_DEL,
>> -                  nl_sock_fd(sock), NULL);
>> +    if (epoll_ctl(handler->epoll_fd, EPOLL_CTL_ADD, nl_sock_fd(sock),
>> +                  &event) < 0) {
>> +        dpif_handler_channel_del(handler, port_idx);
>> +        id_pool_free_id(dpif->port_ids, port_idx);
>> +        return errno;
>>      }
>>  #endif
>> -    dpif->channels[port_idx].sock = NULL;
>>
>> -    return error;
>> +    id_pool_add(dpif->port_ids, port_idx);
>> +    return 0;
>>  }
>>
>>  static void
>>  vport_del_channels(struct dpif_netlink *dpif, odp_port_t port_no)
>>  {
>>      uint32_t port_idx = odp_to_u32(port_no);
>> +    struct dpif_handler *handler = NULL;
>> +    struct dpif_channel *channel = NULL;
>>      size_t i;
>>
>> -    if (!dpif->handlers || port_idx >= dpif->uc_array_size
>> -        || !dpif->channels[port_idx].sock) {
>> +    if (!dpif->handlers) {
>>          return;
>>      }
>>
>>      for (i = 0; i < dpif->n_handlers; i++) {
>> -        struct dpif_handler *handler = &dpif->handlers[i];
>> +        handler = &dpif->handlers[i];
>> +        channel = dpif_handler_channel_get(handler, port_idx);
>> +
>> +        if (channel) {
>>  #ifndef _WIN32
>> -        epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL,
>> -                  nl_sock_fd(dpif->channels[port_idx].sock), NULL);
>> +            epoll_ctl(handler->epoll_fd, EPOLL_CTL_DEL,
>> +                      nl_sock_fd(channel->sock), NULL);
>> +            nl_sock_destroy(channel->sock);
>>  #endif
>> -        handler->event_offset = handler->n_events = 0;
>> +            dpif_handler_channel_del(handler, port_idx);
>> +            id_pool_free_id(dpif->port_ids, port_idx);
>> +            handler->event_offset = handler->n_events = 0;
>> +            break;
>> +        }
>>      }
>> -#ifndef _WIN32
>> -    nl_sock_destroy(dpif->channels[port_idx].sock);
>> -#endif
>> -    dpif->channels[port_idx].sock = NULL;
>>  }
>>
>>  static void
>>  destroy_all_channels(struct dpif_netlink *dpif)
>>      OVS_REQ_WRLOCK(dpif->upcall_lock)
>>  {
>> -    unsigned int i;
>> +    size_t i;
>>
>>      if (!dpif->handlers) {
>>          return;
>>      }
>>
>> -    for (i = 0; i < dpif->uc_array_size; i++ ) {
>> -        struct dpif_netlink_vport vport_request;
>> -        uint32_t upcall_pids = 0;
>> -
>> -        if (!dpif->channels[i].sock) {
>> -            continue;
>> -        }
>> -
>> -        /* Turn off upcalls. */
>> -        dpif_netlink_vport_init(&vport_request);
>> -        vport_request.cmd = OVS_VPORT_CMD_SET;
>> -        vport_request.dp_ifindex = dpif->dp_ifindex;
>> -        vport_request.port_no = u32_to_odp(i);
>> -        vport_request.n_upcall_pids = 1;
>> -        vport_request.upcall_pids = &upcall_pids;
>> -        dpif_netlink_vport_transact(&vport_request, NULL, NULL);
>> -
>> -        vport_del_channels(dpif, u32_to_odp(i));
>> -    }
>> -
>>      for (i = 0; i < dpif->n_handlers; i++) {
>>          struct dpif_handler *handler = &dpif->handlers[i];
>> +        struct dpif_channel *channel, *next;
>> +
>> +        HMAP_FOR_EACH_SAFE (channel, next, hmap_node, &handler->channels) {
>> +            struct dpif_netlink_vport vport_request;
>> +            uint32_t upcall_pids = 0;
>> +            /* Turn off upcalls. */
>> +            dpif_netlink_vport_init(&vport_request);
>> +            vport_request.cmd = OVS_VPORT_CMD_SET;
>> +            vport_request.dp_ifindex = dpif->dp_ifindex;
>> +            vport_request.port_no = u32_to_odp(channel->port_idx);
>> +            vport_request.n_upcall_pids = 1;
>> +            vport_request.upcall_pids = &upcall_pids;
>> +            dpif_netlink_vport_transact(&vport_request, NULL, NULL);
>> +
>> +            vport_del_channels(dpif, u32_to_odp(channel->port_idx));
>> +        }
>>
>>          dpif_netlink_handler_uninit(handler);
>>          free(handler->epoll_events);
>>      }
>> -    free(dpif->channels);
>> -    free(dpif->handlers);
>> +
>>      dpif->handlers = NULL;
>> -    dpif->channels = NULL;
>>      dpif->n_handlers = 0;
>> -    dpif->uc_array_size = 0;
>>  }
>>
>>  static void
>> @@ -618,9 +657,11 @@ dpif_netlink_close(struct dpif *dpif_)
>>
>>      fat_rwlock_wrlock(&dpif->upcall_lock);
>>      destroy_all_channels(dpif);
>> +    id_pool_destroy(dpif->port_ids);
>>      fat_rwlock_unlock(&dpif->upcall_lock);
>>
>>      fat_rwlock_destroy(&dpif->upcall_lock);
>> +
>>      free(dpif);
>>  }
>>
>> @@ -1084,23 +1125,40 @@ dpif_netlink_port_get_pid__(const struct dpif_netlink *dpif,
>>                              odp_port_t port_no)
>>      OVS_REQ_RDLOCK(dpif->upcall_lock)
>>  {
>> +    struct dpif_channel *min_channel = NULL, *found_channel = NULL;
>>      uint32_t port_idx = odp_to_u32(port_no);
>>      uint32_t pid = 0;
>> +    size_t i;
>> +
>> +    /* The ODPP_NONE "reserved" port number uses the "ovs-system"'s
>> +     * channel, since it is not heavily loaded. */
>> +    uint32_t idx = port_no == ODPP_NONE ? 0 : port_idx;
>> +
>> +    if (!id_pool_check(dpif->port_ids, port_idx)) {
>> +        return 0;
>> +    }
>> +
>> +    if (dpif->handlers) {
>> +        for (i = 0; i < dpif->n_handlers; i++) {
>> +            if (dpif_handler_channel_get(&dpif->handlers[i], idx)) {
>> +                found_channel = dpif_handler_channel_get(&dpif->handlers[i],
>> +                                                         idx);
>> +                break;
>> +            }
>>
>> -    if (dpif->handlers && dpif->uc_array_size > 0) {
>> -        /* The ODPP_NONE "reserved" port number uses the "ovs-system"'s
>> -         * channel, since it is not heavily loaded. */
>> -        uint32_t idx = port_idx >= dpif->uc_array_size ? 0 : port_idx;
>> -
>> -        /* Needs to check in case the socket pointer is changed in between
>> -         * the holding of upcall_lock. A known case happens when the main
>> -         * thread deletes the vport while the handler thread is handling
>> -         * the upcall from that port. */
>> -        if (dpif->channels[idx].sock) {
>> -            pid = nl_sock_pid(dpif->channels[idx].sock);
>> +            if (dpif_handler_channel_get(&dpif->handlers[i], 0)) {
>> +                min_channel = dpif_handler_channel_get(&dpif->handlers[i],
>> +                                                       0);
>> +            }
>>          }
>>      }
>>
>> +    if (found_channel) {
>> +        pid = nl_sock_pid(found_channel->sock);
>> +    } else if (min_channel) {
>> +        pid = nl_sock_pid(min_channel->sock);
>> +    }
>> +
>>      return pid;
>>  }
>>
>> @@ -2342,12 +2400,14 @@ dpif_netlink_operate(struct dpif *dpif_, struct dpif_op **ops, size_t n_ops,
>>  static void
>>  dpif_netlink_handler_uninit(struct dpif_handler *handler)
>>  {
>> +    hmap_destroy(&handler->channels);
>>      vport_delete_sock_pool(handler);
>>  }
>>
>>  static int
>>  dpif_netlink_handler_init(struct dpif_handler *handler)
>>  {
>> +    hmap_init(&handler->channels);
>>      return vport_create_sock_pool(handler);
>>  }
>>  #else
>>
>> @@ -2355,6 +2415,7 @@ dpif_netlink_handler_init(struct dpif_handler *handler)
>>  static int
>>  dpif_netlink_handler_init(struct dpif_handler *handler)
>>  {
>> +    hmap_init(&handler->channels);
>>      handler->epoll_fd = epoll_create(10);
>>      return handler->epoll_fd < 0 ? errno : 0;
>>  }
>>
>> @@ -2362,6 +2423,7 @@ dpif_netlink_handler_init(struct dpif_handler *handler)
>>  static void
>>  dpif_netlink_handler_uninit(struct dpif_handler *handler)
>>  {
>> +    hmap_destroy(&handler->channels);
>>      close(handler->epoll_fd);
>>  }
>>  #endif
>> @@ -2374,9 +2436,7 @@ static int
>>  dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
>>      OVS_REQ_WRLOCK(dpif->upcall_lock)
>>  {
>> -    unsigned long int *keep_channels;
>>      struct dpif_netlink_vport vport;
>> -    size_t keep_channels_nbits;
>>      struct nl_dump dump;
>>      uint64_t reply_stub[NL_DUMP_BUFSIZE / 8];
>>      struct ofpbuf buf;
>> @@ -2416,9 +2476,6 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
>>          handler->event_offset = handler->n_events = 0;
>>      }
>>
>> -    keep_channels_nbits = dpif->uc_array_size;
>> -    keep_channels = bitmap_allocate(keep_channels_nbits);
>> -
>>      ofpbuf_use_stub(&buf, reply_stub, sizeof reply_stub);
>>      dpif_netlink_port_dump_start__(dpif, &dump);
>>      while (!dpif_netlink_port_dump_next__(dpif, &dump, &vport, &buf)) {
>> @@ -2426,8 +2483,7 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
>>          uint32_t upcall_pid;
>>          int error;
>>
>> -        if (port_no >= dpif->uc_array_size
>> -            || !vport_get_pid(dpif, port_no, &upcall_pid)) {
>> +        if (!vport_get_pid(dpif, port_no, &upcall_pid)) {
>>              struct nl_sock *sock;
>>              error = create_nl_sock(dpif, &sock);
>>
>> @@ -2475,9 +2531,6 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
>>              }
>>          }
>>
>> -        if (port_no < keep_channels_nbits) {
>> -            bitmap_set1(keep_channels, port_no);
>> -        }
>>          continue;
>>
>>      error:
>> @@ -2486,14 +2539,6 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
>>      nl_dump_done(&dump);
>>      ofpbuf_uninit(&buf);
>>
>> -    /* Discard any saved channels that we didn't reuse. */
>> -    for (i = 0; i < keep_channels_nbits; i++) {
>> -        if (!bitmap_is_set(keep_channels, i)) {
>> -            vport_del_channels(dpif, u32_to_odp(i));
>> -        }
>> -    }
>> -    free(keep_channels);
>> -
>>      return retval;
>>  }
>>
>> @@ -2714,6 +2759,8 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
>>      OVS_REQ_RDLOCK(dpif->upcall_lock)
>>  {
>>      struct dpif_handler *handler;
>> +    size_t n_channels;
>> +
>>      int read_tries = 0;
>>
>>      if (!dpif->handlers || handler_id >= dpif->n_handlers) {
>> @@ -2721,14 +2768,15 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
>>      }
>>
>>      handler = &dpif->handlers[handler_id];
>> -    if (handler->event_offset >= handler->n_events) {
>> +    n_channels = dpif_handler_channels_count(handler);
>> +    if (handler->event_offset >= handler->n_events && n_channels) {
>>          int retval;
>>
>>          handler->event_offset = handler->n_events = 0;
>>
>>          do {
>>              retval = epoll_wait(handler->epoll_fd, handler->epoll_events,
>> -                                dpif->uc_array_size, 0);
>> +                                n_channels, 0);
>>          } while (retval < 0 && errno == EINTR);
>>
>>          if (retval < 0) {
>> @@ -2741,7 +2789,10 @@ dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
>>
>>      while (handler->event_offset < handler->n_events) {
>>          int idx = handler->epoll_events[handler->event_offset].data.u32;
>> -        struct dpif_channel *ch = &dpif->channels[idx];
>> +        struct dpif_channel *ch = dpif_handler_channel_get(handler, idx);
>> +        if (!ch) {
>> +            return EAGAIN;
>> +        }
>>
>>          handler->event_offset++;
>>
>> @@ -2845,12 +2896,14 @@ dpif_netlink_recv_purge__(struct dpif_netlink *dpif)
>>      if (dpif->handlers) {
>>          size_t i;
>>
>> -        if (!dpif->channels[0].sock) {
>> -            return;
>> -        }
>> -        for (i = 0; i < dpif->uc_array_size; i++ ) {
>> +        for (i = 0; i < dpif->n_handlers; i++) {
>> +            struct dpif_handler *handler = &dpif->handlers[i];
>> +            struct dpif_channel *channel;
>> +
>> +            HMAP_FOR_EACH (channel, hmap_node, &handler->channels) {
>> +                nl_sock_drain(channel->sock);
>> +            }
>>
>> -            nl_sock_drain(dpif->channels[i].sock);
>>          }
>>      }
>>  }
>> --
>> 2.26.2
>>
_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev