On 08/06/2021 21:07, Flavio Leitner wrote:
> 
> Hi Mark,
> 
> This looks good to me.
> 
> Since the new scheme doesn't allow users to change the number
> of handlers, we must update ovs-vswitchd.conf.db(5) as well.

I updated this documentation.

> 
> Some comments below.
> 
> On Fri, Apr 30, 2021 at 11:31:29AM -0400, Mark Gray wrote:
>> The Open vSwitch kernel module uses the upcall mechanism to send
>> packets from kernel space to user space when it misses in the kernel
>> space flow table. The upcall sends packets via a Netlink socket.
>> Currently, a Netlink socket is created for every vport. In this way,
>> there is a 1:1 mapping between a vport and a Netlink socket.
>> When a packet is received by a vport, if it needs to be sent to
>> user space, it is sent via the corresponding Netlink socket.
>>
>> This mechanism, with various iterations of the corresponding user
>> space code, has seen some limitations and issues:
>>
>> * On systems with a large number of vports, there is correspondingly
>> a large number of Netlink sockets which can limit scaling.
>> (https://bugzilla.redhat.com/show_bug.cgi?id=1526306)
>> * Packet reordering on upcalls.
>> (https://bugzilla.redhat.com/show_bug.cgi?id=1844576)
>> * A thundering herd issue.
>> (https://bugzilla.redhat.com/show_bug.cgi?id=1834444)
>>
>> This patch introduces an alternative, feature-negotiated, upcall
>> mode using a per-cpu dispatch rather than a per-vport dispatch.
>>
>> In this mode, the Netlink socket to be used for the upcall is
>> selected based on the CPU of the thread that is executing the upcall.
>> In this way, it resolves the issues above as:
>>
>> a) The number of Netlink sockets scales with the number of CPUs
>> rather than the number of vports.
>> b) Ordering per-flow is maintained as packets are distributed to
>> CPUs based on mechanisms such as RSS and flows are distributed
>> to a single user space thread.
>> c) Packets from a flow can only wake up one user space thread.
>>
>> Reported-at: https://bugzilla.redhat.com/1844576
>> Signed-off-by: Mark Gray <mark.d.g...@redhat.com>
>> ---
>>  .../linux/compat/include/linux/openvswitch.h  |   7 +
>>  lib/dpif-netdev.c                             |   1 +
>>  lib/dpif-netlink.c                            | 405 +++++++++++++++---
>>  lib/dpif-provider.h                           |  10 +
>>  lib/dpif.c                                    |  17 +
>>  lib/dpif.h                                    |   1 +
>>  ofproto/ofproto-dpif-upcall.c                 |  51 ++-
>>  ofproto/ofproto.c                             |  12 -
>>  8 files changed, 430 insertions(+), 74 deletions(-)
>>
>> diff --git a/datapath/linux/compat/include/linux/openvswitch.h b/datapath/linux/compat/include/linux/openvswitch.h
>> index 875de20250ce..f29265df055e 100644
>> --- a/datapath/linux/compat/include/linux/openvswitch.h
>> +++ b/datapath/linux/compat/include/linux/openvswitch.h
>> @@ -89,6 +89,8 @@ enum ovs_datapath_cmd {
>>   * set on the datapath port (for OVS_ACTION_ATTR_MISS).  Only valid on
>>   * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should
>>   * not be sent.
>> + * OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when
>> + * OVS_DP_F_DISPATCH_UPCALL_PER_CPU feature is set.
>>   * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the
>>   * datapath.  Always present in notifications.
>>   * @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for the
>> @@ -105,6 +107,8 @@ enum ovs_datapath_attr {
>>      OVS_DP_ATTR_MEGAFLOW_STATS,     /* struct ovs_dp_megaflow_stats */
>>      OVS_DP_ATTR_USER_FEATURES,      /* OVS_DP_F_*  */
>>      OVS_DP_ATTR_PAD,
>> +    OVS_DP_ATTR_PAD2,
>> +    OVS_DP_ATTR_PER_CPU_PIDS,       /* Netlink PIDS to receive upcalls */
>>      __OVS_DP_ATTR_MAX
>>  };
>>  
>> @@ -146,6 +150,9 @@ struct ovs_vport_stats {
>>  /* Allow tc offload recirc sharing */
>>  #define OVS_DP_F_TC_RECIRC_SHARING  (1 << 2)
>>  
>> +/* Allow per-cpu dispatch of upcalls */
>> +#define OVS_DP_F_DISPATCH_UPCALL_PER_CPU (1 << 3)
>> +
>>  /* Fixed logical ports. */
>>  #define OVSP_LOCAL      ((__u32)0)
>>  
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>> index 251788b04965..24e6911dd4ff 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -8488,6 +8488,7 @@ const struct dpif_class dpif_netdev_class = {
>>      dpif_netdev_operate,
>>      NULL,                       /* recv_set */
>>      NULL,                       /* handlers_set */
>> +    NULL,                       /* handlers_get */
> 
> That is number_handlers_required.
> 

Oops, thanks for catching this.
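
I'll fix the placeholder in the next version so it names the new slot, i.e.:

    NULL,                       /* number_handlers_required */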

>>      dpif_netdev_set_config,
>>      dpif_netdev_queue_to_priority,
>>      NULL,                       /* recv */
>> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
>> index 2ded5fdd01b3..349897e70632 100644
>> --- a/lib/dpif-netlink.c
>> +++ b/lib/dpif-netlink.c
>> @@ -80,6 +80,9 @@ enum { MAX_PORTS = USHRT_MAX };
>>  #define FLOW_DUMP_MAX_BATCH 50
>>  #define OPERATE_MAX_OPS 50
>>  
>> +#define DISPATCH_MODE_PER_CPU(dpif) ((dpif)->user_features & \
>> +                                     OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
>> +
> 
> Perhaps a function like this:
> static inline bool
> dpif_netlink_upcall_per_cpu(struct ... dpif) {
>     return !!((dpif)->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU);
> }
> 
>>  #ifndef EPOLLEXCLUSIVE
>>  #define EPOLLEXCLUSIVE (1u << 28)
>>  #endif
>> @@ -98,6 +101,8 @@ struct dpif_netlink_dp {
>>      const struct ovs_dp_stats *stats;  /* OVS_DP_ATTR_STATS. */
>>      const struct ovs_dp_megaflow_stats *megaflow_stats;
>>                                         /* OVS_DP_ATTR_MEGAFLOW_STATS.*/
>> +    const uint32_t *upcall_pids;       /* OVS_DP_ATTR_PER_CPU_PIDS */
>> +    uint32_t n_upcall_pids;
>>  };
>>  
>>  static void dpif_netlink_dp_init(struct dpif_netlink_dp *);
>> @@ -178,11 +183,16 @@ struct dpif_windows_vport_sock {
>>  #endif
>>  
>>  struct dpif_handler {
>> +    /* per-vport dispatch mode */
>>      struct epoll_event *epoll_events;
>>      int epoll_fd;                 /* epoll fd that includes channel socks. */
>>      int n_events;                 /* Num events returned by epoll_wait(). */
>>      int event_offset;             /* Offset into 'epoll_events'. */
>>  
>> +    /* per-cpu dispatch mode */
>> +    struct nl_sock *sock;         /* Each handler thread holds one netlink
>> +                                     socket. */
>> +
>>  #ifdef _WIN32
>>      /* Pool of sockets. */
>>      struct dpif_windows_vport_sock *vport_sock_pool;
>> @@ -201,6 +211,8 @@ struct dpif_netlink {
>>      struct fat_rwlock upcall_lock;
>>      struct dpif_handler *handlers;
>>      uint32_t n_handlers;           /* Num of upcall handlers. */
>> +
>> +    /* Per-vport dispatch mode */
>>      struct dpif_channel *channels; /* Array of channels for each port. */
>>      int uc_array_size;             /* Size of 'handler->channels' and */
>>                                     /* 'handler->epoll_events'. */
>> @@ -241,8 +253,12 @@ static int open_dpif(const struct dpif_netlink_dp *, struct dpif **);
>>  static uint32_t dpif_netlink_port_get_pid(const struct dpif *,
>>                                            odp_port_t port_no);
>>  static void dpif_netlink_handler_uninit(struct dpif_handler *handler);
>> -static int dpif_netlink_refresh_channels(struct dpif_netlink *,
>> -                                         uint32_t n_handlers);
>> +static int dpif_netlink_refresh_handlers_vport_dispatch(struct dpif_netlink *,
>> +                                                        uint32_t n_handlers);
>> +static void destroy_all_channels(struct dpif_netlink *);
>> +static int dpif_netlink_refresh_handlers_cpu_dispatch(struct dpif_netlink *);
>> +static void destroy_all_handlers(struct dpif_netlink *);
>> +
>>  static void dpif_netlink_vport_to_ofpbuf(const struct dpif_netlink_vport *,
>>                                           struct ofpbuf *);
>>  static int dpif_netlink_vport_from_ofpbuf(struct dpif_netlink_vport *,
>> @@ -357,11 +373,34 @@ dpif_netlink_open(const struct dpif_class *class OVS_UNUSED, const char *name,
>>          dp_request.cmd = OVS_DP_CMD_SET;
>>      }
>>  
>> +    /* The Open vSwitch kernel module has two modes for dispatching upcalls:
>> +     * per-vport and per-cpu.
>> +     *
>> +     * When dispatching upcalls per-vport, the kernel will
>> +     * send the upcall via a Netlink socket that has been selected based on the
>> +     * vport that received the packet that is causing the upcall.
>> +     *
>> +     * When dispatching upcall per-cpu, the kernel will send the upcall via
>> +     * a Netlink socket that has been selected based on the cpu that received
>> +     * the packet that is causing the upcall.
>> +     *
>> +     * First we test to see if the kernel module supports per-cpu dispatching
>> +     * (the preferred method). If it does not support per-cpu dispatching, we
>> +     * fall back to the per-vport dispatch mode.
>> +     */
>>      dp_request.user_features |= OVS_DP_F_UNALIGNED;
>> -    dp_request.user_features |= OVS_DP_F_VPORT_PIDS;
>> +    dp_request.user_features |= OVS_DP_F_DISPATCH_UPCALL_PER_CPU;
>>      error = dpif_netlink_dp_transact(&dp_request, &dp, &buf);
>>      if (error) {
>> -        return error;
>> +        dp_request.user_features &= ~OVS_DP_F_DISPATCH_UPCALL_PER_CPU;
>> +        dp_request.user_features |= OVS_DP_F_VPORT_PIDS;
>> +        error = dpif_netlink_dp_transact(&dp_request, &dp, &buf);
>> +        if (error) {
>> +            return error;
>> +        }
>> +        VLOG_INFO("Dispatch mode(per-vport)");
>> +    } else {
>> +        VLOG_INFO("Dispatch mode:(per-cpu)");
> 
> 
> It would be nice if we could expose the mode using a command.

I added an `ovs-appctl dpif-netlink/dispatch-mode` command:

# ovs-appctl dpif-netlink/dispatch-mode
ovs-system: per-cpu dispatch mode
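
For reference, the command is wired up roughly as below. This is a simplified
sketch, not the actual v2 code: the callback name and passing a single dpif via
'aux' are mine for illustration, and it assumes "unixctl.h" and
"openvswitch/dynamic-string.h" are in scope in dpif-netlink.c.

/* Sketch: report the dispatch mode of the dpif passed via 'aux'. */
static void
dispatch_mode_show(struct unixctl_conn *conn, int argc OVS_UNUSED,
                   const char *argv[] OVS_UNUSED, void *aux)
{
    struct dpif_netlink *dpif = aux;
    struct ds reply = DS_EMPTY_INITIALIZER;

    ds_put_format(&reply, "%s: %s dispatch mode",
                  dpif_base_name(&dpif->dpif),
                  DISPATCH_MODE_PER_CPU(dpif) ? "per-cpu" : "per-vport");
    unixctl_command_reply(conn, ds_cstr(&reply));
    ds_destroy(&reply);
}

/* Registered once, e.g. when the datapath is opened: */
unixctl_command_register("dpif-netlink/dispatch-mode", "", 0, 0,
                         dispatch_mode_show, dpif);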

> 
> 
>>      }
>>  
>>      error = open_dpif(&dp, dpifp);
>> @@ -609,6 +648,24 @@ destroy_all_channels(struct dpif_netlink *dpif)
>>      dpif->uc_array_size = 0;
>>  }
>>  
>> +static void
>> +destroy_all_handlers(struct dpif_netlink *dpif)
>> +    OVS_REQ_WRLOCK(dpif->upcall_lock)
>> +{
>> +    int i = 0;
>> +
>> +    if (!dpif->handlers) {
>> +        return;
>> +    }
>> +    for (i = 0; i < dpif->n_handlers; i++) {
>> +        struct dpif_handler *handler = &dpif->handlers[i];
>> +        close_nl_sock(handler->sock);
>> +    }
>> +    free(dpif->handlers);
>> +    dpif->handlers = NULL;
>> +    dpif->n_handlers = 0;
>> +}
>> +
>>  static void
>>  dpif_netlink_close(struct dpif *dpif_)
>>  {
>> @@ -617,7 +674,11 @@ dpif_netlink_close(struct dpif *dpif_)
>>      nl_sock_destroy(dpif->port_notifier);
>>  
>>      fat_rwlock_wrlock(&dpif->upcall_lock);
>> -    destroy_all_channels(dpif);
>> +    if (DISPATCH_MODE_PER_CPU(dpif)) {
>> +        destroy_all_handlers(dpif);
>> +    } else {
>> +        destroy_all_channels(dpif);
>> +    }
>>      fat_rwlock_unlock(&dpif->upcall_lock);
>>  
>>      fat_rwlock_destroy(&dpif->upcall_lock);
>> @@ -641,11 +702,14 @@ dpif_netlink_run(struct dpif *dpif_)
>>  {
>>      struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
>>  
>> -    if (dpif->refresh_channels) {
>> -        dpif->refresh_channels = false;
>> -        fat_rwlock_wrlock(&dpif->upcall_lock);
>> -        dpif_netlink_refresh_channels(dpif, dpif->n_handlers);
>> -        fat_rwlock_unlock(&dpif->upcall_lock);
>> +    if (!DISPATCH_MODE_PER_CPU(dpif)) {
>> +        if (dpif->refresh_channels) {
>> +            dpif->refresh_channels = false;
>> +            fat_rwlock_wrlock(&dpif->upcall_lock);
>> +            dpif_netlink_refresh_handlers_vport_dispatch(dpif,
>> +                                                         dpif->n_handlers);
>> +            fat_rwlock_unlock(&dpif->upcall_lock);
>> +        }
>>      }
>>      return false;
>>  }
>> @@ -681,6 +745,41 @@ dpif_netlink_get_stats(const struct dpif *dpif_, struct dpif_dp_stats *stats)
>>      return error;
>>  }
>>  
>> +static int
>> +dpif_netlink_set_handler_pids(struct dpif *dpif_, const uint32_t *upcall_pids,
>> +                              uint32_t n_upcall_pids)
>> +{
>> +    struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
>> +    struct dpif_netlink_dp request, reply;
>> +    struct ofpbuf *bufp;
>> +    int error;
>> +    int n_cores;
>> +
>> +    n_cores = count_cpu_cores();
>> +    ovs_assert(n_cores == n_upcall_pids);
>> +    VLOG_DBG("Dispatch mode(per-cpu): Number of CPUs is %d", n_cores);
>> +
>> +    dpif_netlink_dp_init(&request);
>> +    request.cmd = OVS_DP_CMD_SET;
>> +    request.name = dpif_->base_name;
>> +    request.dp_ifindex = dpif->dp_ifindex;
>> +    request.user_features = dpif->user_features |
>> +                            OVS_DP_F_DISPATCH_UPCALL_PER_CPU;
>> +
>> +    request.upcall_pids = upcall_pids;
>> +    request.n_upcall_pids = n_cores;
>> +
>> +    error = dpif_netlink_dp_transact(&request, &reply, &bufp);
>> +    if (!error) {
>> +        dpif->user_features = reply.user_features;
>> +        ofpbuf_delete(bufp);
>> +         if (!(dpif->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)) {
>> +            return -EOPNOTSUPP;
>> +        }
>> +    }
>> +    return error;
>> +}
>> +
>>  static int
>>  dpif_netlink_set_features(struct dpif *dpif_, uint32_t new_features)
>>  {
>> @@ -741,7 +840,7 @@ get_vport_type(const struct dpif_netlink_vport *vport)
>>          return "erspan";
>>  
>>      case OVS_VPORT_TYPE_IP6ERSPAN:
>> -        return "ip6erspan"; 
>> +        return "ip6erspan";
>>  
>>      case OVS_VPORT_TYPE_IP6GRE:
>>          return "ip6gre";
>> @@ -807,10 +906,16 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, const char *name,
>>      uint32_t upcall_pids = 0;
>>      int error = 0;
>>  
>> -    if (dpif->handlers) {
>> -        error = create_nl_sock(dpif, &sock);
>> -        if (error) {
>> -            return error;
>> +    /* per-cpu dispatch mode does not require a socket per vport */
>> +    if (!DISPATCH_MODE_PER_CPU(dpif)) {
>> +        if (dpif->handlers) {
>> +            error = create_nl_sock(dpif, &sock);
>> +            if (error) {
>> +                return error;
>> +            }
>> +        }
>> +        if (sock) {
>> +            upcall_pids = nl_sock_pid(sock);
>>          }
>>      }
>>  
>> @@ -821,9 +926,6 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, const char *name,
>>      request.name = name;
>>  
>>      request.port_no = *port_nop;
>> -    if (sock) {
>> -        upcall_pids = nl_sock_pid(sock);
>> -    }
>>      request.n_upcall_pids = 1;
>>      request.upcall_pids = &upcall_pids;
>>  
>> @@ -845,19 +947,21 @@ dpif_netlink_port_add__(struct dpif_netlink *dpif, const char *name,
>>          goto exit;
>>      }
>>  
>> -    error = vport_add_channel(dpif, *port_nop, sock);
>> -    if (error) {
>> -        VLOG_INFO("%s: could not add channel for port %s",
>> -                    dpif_name(&dpif->dpif), name);
>> -
>> -        /* Delete the port. */
>> -        dpif_netlink_vport_init(&request);
>> -        request.cmd = OVS_VPORT_CMD_DEL;
>> -        request.dp_ifindex = dpif->dp_ifindex;
>> -        request.port_no = *port_nop;
>> -        dpif_netlink_vport_transact(&request, NULL, NULL);
>> -        close_nl_sock(sock);
>> -        goto exit;
>> +    if (!DISPATCH_MODE_PER_CPU(dpif)) {
>> +        error = vport_add_channel(dpif, *port_nop, sock);
>> +        if (error) {
>> +            VLOG_INFO("%s: could not add channel for port %s",
>> +                        dpif_name(&dpif->dpif), name);
>> +
>> +            /* Delete the port. */
>> +            dpif_netlink_vport_init(&request);
>> +            request.cmd = OVS_VPORT_CMD_DEL;
>> +            request.dp_ifindex = dpif->dp_ifindex;
>> +            request.port_no = *port_nop;
>> +            dpif_netlink_vport_transact(&request, NULL, NULL);
>> +            close_nl_sock(sock);
>> +            goto exit;
>> +        }
>>      }
>>  
>>  exit:
>> @@ -1115,6 +1219,11 @@ dpif_netlink_port_get_pid(const struct dpif *dpif_, odp_port_t port_no)
>>      const struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
>>      uint32_t ret;
>>  
>> +    /* In per-cpu dispatch mode, vports do not have an associated PID */
>> +    if (DISPATCH_MODE_PER_CPU(dpif)) {
>> +        return 0;
>> +    }
> 
> I mentioned in the kernel patch's review that this is used with the
> userspace() action and 0 means it is 'unset'.

I resolved this and tested it using `ovs-ofctl monitor br0 65534 -P nxt_packet_in`.
> 
> 
>> +
>>      fat_rwlock_rdlock(&dpif->upcall_lock);
>>      ret = dpif_netlink_port_get_pid__(dpif, port_no);
>>      fat_rwlock_unlock(&dpif->upcall_lock);
>> @@ -2326,12 +2435,51 @@ dpif_netlink_handler_uninit(struct dpif_handler *handler)
>>  }
>>  #endif
>>  
>> +static int
>> +dpif_netlink_refresh_handlers_cpu_dispatch(struct dpif_netlink *dpif)
>> +    OVS_REQ_WRLOCK(dpif->upcall_lock)
>> +{
>> +    int handler_id;
>> +    int error = 0;
>> +    uint32_t n_handlers;
>> +    uint32_t *upcall_pids;
>> +
>> +    n_handlers = count_cpu_cores();
>> +    if (dpif->n_handlers != n_handlers) {
>> +        VLOG_DBG("Dispatch mode(per-cpu): initializing %d handlers",
>> +                   n_handlers);
>> +        destroy_all_handlers(dpif);
>> +        upcall_pids = xzalloc(n_handlers * sizeof *upcall_pids);
>> +        dpif->handlers = xzalloc(n_handlers * sizeof *dpif->handlers);
>> +        for (handler_id = 0; handler_id < n_handlers; handler_id++) {
>> +            struct dpif_handler *handler = &dpif->handlers[handler_id];
>> +            error = create_nl_sock(dpif, &handler->sock);
>> +            if (error) {
>> +                VLOG_ERR("Dispatch mode(per-cpu): Cannot create socket for"
>> +                         "handler %d", handler_id);
>> +                continue;
>> +            }
>> +            upcall_pids[handler_id] = nl_sock_pid(handler->sock);
>> +            VLOG_DBG("Dispatch mode(per-cpu): "
>> +                      "handler %d has Netlink PID of %u",
>> +                      handler_id, upcall_pids[handler_id]);
>> +        }
>> +
>> +        dpif->n_handlers = n_handlers;
>> +        error = dpif_netlink_set_handler_pids(&dpif->dpif, upcall_pids,
>> +                                              n_handlers);
>> +        free(upcall_pids);
>> +    }
>> +    return error;
>> +}
>> +
>>  /* Synchronizes 'channels' in 'dpif->handlers'  with the set of vports
>>   * currently in 'dpif' in the kernel, by adding a new set of channels for
>>   * any kernel vport that lacks one and deleting any channels that have no
>>   * backing kernel vports. */
>>  static int
>> -dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
>> +dpif_netlink_refresh_handlers_vport_dispatch(struct dpif_netlink *dpif,
>> +                                             uint32_t n_handlers)
>>      OVS_REQ_WRLOCK(dpif->upcall_lock)
>>  {
>>      unsigned long int *keep_channels;
>> @@ -2458,7 +2606,7 @@ dpif_netlink_refresh_channels(struct dpif_netlink *dpif, uint32_t n_handlers)
>>  }
>>  
>>  static int
>> -dpif_netlink_recv_set__(struct dpif_netlink *dpif, bool enable)
>> +dpif_netlink_recv_set_vport_dispatch(struct dpif_netlink *dpif, bool enable)
>>      OVS_REQ_WRLOCK(dpif->upcall_lock)
>>  {
>>      if ((dpif->handlers != NULL) == enable) {
>> @@ -2467,7 +2615,21 @@ dpif_netlink_recv_set__(struct dpif_netlink *dpif, bool enable)
>>          destroy_all_channels(dpif);
>>          return 0;
>>      } else {
>> -        return dpif_netlink_refresh_channels(dpif, 1);
>> +        return dpif_netlink_refresh_handlers_vport_dispatch(dpif, 1);
>> +    }
>> +}
>> +
>> +static int
>> +dpif_netlink_recv_set_cpu_dispatch(struct dpif_netlink *dpif, bool enable)
>> +    OVS_REQ_WRLOCK(dpif->upcall_lock)
>> +{
>> +    if ((dpif->handlers != NULL) == enable) {
>> +        return 0;
>> +    } else if (!enable) {
>> +        destroy_all_handlers(dpif);
>> +        return 0;
>> +    } else {
>> +        return dpif_netlink_refresh_handlers_cpu_dispatch(dpif);
>>      }
>>  }
>>  
>> @@ -2478,7 +2640,11 @@ dpif_netlink_recv_set(struct dpif *dpif_, bool enable)
>>      int error;
>>  
>>      fat_rwlock_wrlock(&dpif->upcall_lock);
>> -    error = dpif_netlink_recv_set__(dpif, enable);
>> +    if (DISPATCH_MODE_PER_CPU(dpif)) {
>> +        error = dpif_netlink_recv_set_cpu_dispatch(dpif, enable);
>> +    } else {
>> +        error = dpif_netlink_recv_set_vport_dispatch(dpif, enable);
>> +    }
>>      fat_rwlock_unlock(&dpif->upcall_lock);
>>  
>>      return error;
>> @@ -2500,13 +2666,31 @@ dpif_netlink_handlers_set(struct dpif *dpif_, uint32_t n_handlers)
>>  
>>      fat_rwlock_wrlock(&dpif->upcall_lock);
>>      if (dpif->handlers) {
>> -        error = dpif_netlink_refresh_channels(dpif, n_handlers);
>> +        if (DISPATCH_MODE_PER_CPU(dpif)) {
>> +            error = dpif_netlink_refresh_handlers_cpu_dispatch(dpif);
>> +        } else {
>> +            error = dpif_netlink_refresh_handlers_vport_dispatch(dpif,
>> +                                                                 n_handlers);
>> +        }
>>      }
>>      fat_rwlock_unlock(&dpif->upcall_lock);
>>  
>>      return error;
>>  }
>>  
>> +static bool
>> +dpif_netlink_number_handlers_required(struct dpif *dpif_, uint32_t *n_handlers)
>> +{
>> +    struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
>> +
>> +    if (DISPATCH_MODE_PER_CPU(dpif)) {
>> +        *n_handlers = count_cpu_cores();
>> +        return true;
>> +    }
>> +
>> +    return false;
>> +}
>> +
>>  static int
>>  dpif_netlink_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
>>                               uint32_t queue_id, uint32_t *priority)
>> @@ -2669,8 +2853,59 @@ dpif_netlink_recv_windows(struct dpif_netlink *dpif, uint32_t handler_id,
>>  }
>>  #else
>>  static int
>> -dpif_netlink_recv__(struct dpif_netlink *dpif, uint32_t handler_id,
>> -                    struct dpif_upcall *upcall, struct ofpbuf *buf)
>> +dpif_netlink_recv_cpu_dispatch(struct dpif_netlink *dpif, uint32_t handler_id,
>> +                               struct dpif_upcall *upcall, struct ofpbuf *buf)
>> +    OVS_REQ_RDLOCK(dpif->upcall_lock)
>> +{
>> +    struct dpif_handler *handler;
>> +    int read_tries = 0;
>> +
>> +    if (!dpif->handlers || handler_id >= dpif->n_handlers) {
>> +        return EAGAIN;
>> +    }
>> +
>> +    handler = &dpif->handlers[handler_id];
>> +
>> +    for (;;) {
>> +        int dp_ifindex;
>> +        int error;
>> +
>> +        if (++read_tries > 50) {
>> +            return EAGAIN;
>> +        }
>> +        error = nl_sock_recv(handler->sock, buf, NULL, false);
>> +        if (error == ENOBUFS) {
>> +            /* ENOBUFS typically means that we've received so many
>> +                * packets that the buffer overflowed.  Try again
>> +                * immediately because there's almost certainly a packet
>> +                * waiting for us. */
>> +            report_loss(dpif, NULL, 0, handler_id);
>> +            continue;
>> +        }
>> +
>> +        if (error) {
>> +            if (error == EAGAIN) {
>> +                break;
>> +            }
>> +            return error;
>> +        }
>> +
>> +        error = parse_odp_packet(buf, upcall, &dp_ifindex);
>> +        if (!error && dp_ifindex == dpif->dp_ifindex) {
>> +            return 0;
>> +        } else if (error) {
>> +            return error;
>> +        }
>> +    }
>> +
>> +    return EAGAIN;
>> +}
>> +
>> +static int
>> +dpif_netlink_recv_vport_dispatch(struct dpif_netlink *dpif,
>> +                                 uint32_t handler_id,
>> +                                 struct dpif_upcall *upcall,
>> +                                 struct ofpbuf *buf)
>>      OVS_REQ_RDLOCK(dpif->upcall_lock)
>>  {
>>      struct dpif_handler *handler;
>> @@ -2755,18 +2990,24 @@ dpif_netlink_recv(struct dpif *dpif_, uint32_t handler_id,
>>  #ifdef _WIN32
>>      error = dpif_netlink_recv_windows(dpif, handler_id, upcall, buf);
>>  #else
>> -    error = dpif_netlink_recv__(dpif, handler_id, upcall, buf);
>> +    if (DISPATCH_MODE_PER_CPU(dpif)) {
>> +        error = dpif_netlink_recv_cpu_dispatch(dpif, handler_id, upcall, buf);
>> +    } else {
>> +        error = dpif_netlink_recv_vport_dispatch(dpif,
>> +                                                 handler_id, upcall, buf);
>> +    }
>>  #endif
>>      fat_rwlock_unlock(&dpif->upcall_lock);
>>  
>>      return error;
>>  }
>>  
>> +#ifdef _WIN32
>>  static void
>> -dpif_netlink_recv_wait__(struct dpif_netlink *dpif, uint32_t handler_id)
>> +dpif_netlink_recv_wait_windows(struct dpif_netlink *dpif, uint32_t handler_id)
>>      OVS_REQ_RDLOCK(dpif->upcall_lock)
>>  {
>> -#ifdef _WIN32
>> +
>>      uint32_t i;
>>      struct dpif_windows_vport_sock *sock_pool =
>>          dpif->handlers[handler_id].vport_sock_pool;
>> @@ -2779,13 +3020,31 @@ dpif_netlink_recv_wait__(struct dpif_netlink *dpif, uint32_t handler_id)
>>      for (i = 0; i < VPORT_SOCK_POOL_SIZE; i++) {
>>          nl_sock_wait(sock_pool[i].nl_sock, POLLIN);
>>      }
>> -#else
>> +}
>> +#endif
>> +
>> +static void
>> +dpif_netlink_recv_wait_vport_dispatch(struct dpif_netlink *dpif,
>> +                                      uint32_t handler_id)
>> +    OVS_REQ_RDLOCK(dpif->upcall_lock)
>> +{
>>      if (dpif->handlers && handler_id < dpif->n_handlers) {
>>          struct dpif_handler *handler = &dpif->handlers[handler_id];
>>  
>>          poll_fd_wait(handler->epoll_fd, POLLIN);
>>      }
>> -#endif
>> +}
>> +
>> +static void
>> +dpif_netlink_recv_wait_cpu_dispatch(struct dpif_netlink *dpif,
>> +                                    uint32_t handler_id)
>> +    OVS_REQ_RDLOCK(dpif->upcall_lock)
>> +{
>> +    if (dpif->handlers && handler_id < dpif->n_handlers) {
>> +        struct dpif_handler *handler = &dpif->handlers[handler_id];
>> +
>> +        poll_fd_wait(nl_sock_fd(handler->sock), POLLIN);
>> +    }
>>  }
>>  
>>  static void
>> @@ -2794,12 +3053,20 @@ dpif_netlink_recv_wait(struct dpif *dpif_, uint32_t handler_id)
>>      struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
>>  
>>      fat_rwlock_rdlock(&dpif->upcall_lock);
>> -    dpif_netlink_recv_wait__(dpif, handler_id);
>> +#ifdef _WIN32
>> +    dpif_netlink_recv_wait_windows(dpif, handler_id);
>> +#else
>> +    if (DISPATCH_MODE_PER_CPU(dpif)) {
>> +        dpif_netlink_recv_wait_cpu_dispatch(dpif, handler_id);
>> +    } else {
>> +        dpif_netlink_recv_wait_vport_dispatch(dpif, handler_id);
>> +    }
>> +#endif
>>      fat_rwlock_unlock(&dpif->upcall_lock);
>>  }
>>  
>>  static void
>> -dpif_netlink_recv_purge__(struct dpif_netlink *dpif)
>> +dpif_netlink_recv_purge_vport_dispatch(struct dpif_netlink *dpif)
>>      OVS_REQ_WRLOCK(dpif->upcall_lock)
>>  {
>>      if (dpif->handlers) {
>> @@ -2815,13 +3082,31 @@ dpif_netlink_recv_purge__(struct dpif_netlink *dpif)
>>      }
>>  }
>>  
>> +static void
>> +dpif_netlink_recv_purge_cpu_dispatch(struct dpif_netlink *dpif)
>> +    OVS_REQ_WRLOCK(dpif->upcall_lock)
>> +{
>> +    int handler_id;
>> +
>> +    if (dpif->handlers) {
>> +        for (handler_id = 0; handler_id < dpif->n_handlers; handler_id++) {
>> +            struct dpif_handler *handler = &dpif->handlers[handler_id];
>> +            nl_sock_drain(handler->sock);
>> +        }
>> +    }
>> +}
>> +
>>  static void
>>  dpif_netlink_recv_purge(struct dpif *dpif_)
>>  {
>>      struct dpif_netlink *dpif = dpif_netlink_cast(dpif_);
>>  
>>      fat_rwlock_wrlock(&dpif->upcall_lock);
>> -    dpif_netlink_recv_purge__(dpif);
>> +    if (DISPATCH_MODE_PER_CPU(dpif)) {
>> +        dpif_netlink_recv_purge_cpu_dispatch(dpif);
>> +    } else {
>> +        dpif_netlink_recv_purge_vport_dispatch(dpif);
>> +    }
>>      fat_rwlock_unlock(&dpif->upcall_lock);
>>  }
>>  
>> @@ -3974,6 +4259,7 @@ const struct dpif_class dpif_netlink_class = {
>>      dpif_netlink_operate,
>>      dpif_netlink_recv_set,
>>      dpif_netlink_handlers_set,
>> +    dpif_netlink_number_handlers_required,
>>      NULL,                       /* set_config */
>>      dpif_netlink_queue_to_priority,
>>      dpif_netlink_recv,
>> @@ -4337,6 +4623,11 @@ dpif_netlink_dp_to_ofpbuf(const struct dpif_netlink_dp *dp, struct ofpbuf *buf)
>>          nl_msg_put_u32(buf, OVS_DP_ATTR_USER_FEATURES, dp->user_features);
>>      }
>>  
>> +    if (dp->upcall_pids) {
>> +        nl_msg_put_unspec(buf, OVS_DP_ATTR_PER_CPU_PIDS, dp->upcall_pids,
>> +                          sizeof *dp->upcall_pids * dp->n_upcall_pids);
>> +    }
>> +
>>      /* Skip OVS_DP_ATTR_STATS since we never have a reason to serialize it. */
>>  }
>>  
>> @@ -4642,7 +4933,6 @@ dpif_netlink_flow_get_stats(const struct dpif_netlink_flow *flow,
>>      stats->used = flow->used ? get_32aligned_u64(flow->used) : 0;
>>      stats->tcp_flags = flow->tcp_flags ? *flow->tcp_flags : 0;
>>  }
>> -
>>  /* Logs information about a packet that was recently lost in 'ch' (in
>>   * 'dpif_'). */
>>  static void
>> @@ -4656,13 +4946,18 @@ report_loss(struct dpif_netlink *dpif, struct dpif_channel *ch, uint32_t ch_idx,
>>          return;
>>      }
>>  
>> -    ds_init(&s);
>> -    if (ch->last_poll != LLONG_MIN) {
>> -        ds_put_format(&s, " (last polled %lld ms ago)",
>> -                      time_msec() - ch->last_poll);
>> -    }
>> +    if (DISPATCH_MODE_PER_CPU(dpif)) {
>> +        VLOG_WARN("%s: lost packet on handler %u",
>> +                  dpif_name(&dpif->dpif), handler_id);
>> +    } else {
>> +        ds_init(&s);
>> +        if (ch->last_poll != LLONG_MIN) {
>> +            ds_put_format(&s, " (last polled %lld ms ago)",
>> +                        time_msec() - ch->last_poll);
>> +        }
>>  
>> -    VLOG_WARN("%s: lost packet on port channel %u of handler %u%s",
>> -              dpif_name(&dpif->dpif), ch_idx, handler_id, ds_cstr(&s));
>> -    ds_destroy(&s);
>> +        VLOG_WARN("%s: lost packet on port channel %u of handler %u%s",
>> +                dpif_name(&dpif->dpif), ch_idx, handler_id, ds_cstr(&s));
>> +        ds_destroy(&s);
>> +    }
>>  }
>> diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h
>> index b817fceac698..32711f19d1f6 100644
>> --- a/lib/dpif-provider.h
>> +++ b/lib/dpif-provider.h
>> @@ -357,6 +357,16 @@ struct dpif_class {
>>       * */
>>      int (*handlers_set)(struct dpif *dpif, uint32_t n_handlers);
>>  
>> +    /* Queries 'dpif' to see if a certain number of handlers are required by
>> +     * the implementation.
>> +     *
>> +     * If a certain number of handlers are required, returns 'true' and sets
>> +     * 'n_handlers' to that number of handler threads.
>> +     *
>> +     * If not, returns 'false'.
>> +     */
>> +    bool (*number_handlers_required)(struct dpif *dpif, uint32_t *n_handlers);
>> +
> 
> It also needs to update the struct dpif_class handlers_set() documentation.
> 
> 

Good point. I updated it.
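
FWIW, the note I added to handlers_set() is along these lines (paraphrased;
the exact wording in v2 may differ):

    /* Configures the number of upcall handler threads.  In per-vport
     * dispatch mode 'n_handlers' is honored as before; in per-cpu dispatch
     * mode the datapath dictates one handler per CPU core, so 'n_handlers'
     * is ignored and the required count can be queried with
     * number_handlers_required(). */
    int (*handlers_set)(struct dpif *dpif, uint32_t n_handlers);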

>>      /* Pass custom configuration options to the datapath.  The implementation
>>       * might postpone applying the changes until run() is called. */
>>      int (*set_config)(struct dpif *dpif, const struct smap *other_config);
>> diff --git a/lib/dpif.c b/lib/dpif.c
>> index 26e8bfb7db98..511383514d5b 100644
>> --- a/lib/dpif.c
>> +++ b/lib/dpif.c
>> @@ -1489,6 +1489,23 @@ dpif_handlers_set(struct dpif *dpif, uint32_t n_handlers)
>>      return error;
>>  }
>>  
>> +/* Checks if a certain number of handlers are required.
>> + *
>> + * If a certain number of handlers are required, returns 'true' and sets
>> + * 'n_handlers' to that number of handler threads.
>> + *
>> + * If not, returns 'false'
>> + */
>> +bool
>> +dpif_number_handlers_required(struct dpif *dpif, uint32_t *n_handlers) {
>> +    bool ret = false;
>> +
>> +    if (dpif->dpif_class->number_handlers_required) {
>> +        ret = dpif->dpif_class->number_handlers_required(dpif, n_handlers);
>> +    }
>> +    return ret;
>> +}
>> +
>>  void
>>  dpif_register_dp_purge_cb(struct dpif *dpif, dp_purge_callback *cb, void *aux)
>>  {
>> diff --git a/lib/dpif.h b/lib/dpif.h
>> index f9728e67393b..7c322d20e6c7 100644
>> --- a/lib/dpif.h
>> +++ b/lib/dpif.h
>> @@ -873,6 +873,7 @@ void dpif_register_upcall_cb(struct dpif *, upcall_callback *, void *aux);
>>  
>>  int dpif_recv_set(struct dpif *, bool enable);
>>  int dpif_handlers_set(struct dpif *, uint32_t n_handlers);
>> +bool dpif_number_handlers_required(struct dpif *, uint32_t *n_handlers);
>>  int dpif_set_config(struct dpif *, const struct smap *cfg);
>>  int dpif_port_set_config(struct dpif *, odp_port_t, const struct smap *cfg);
>>  int dpif_recv(struct dpif *, uint32_t handler_id, struct dpif_upcall *,
>> diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
>> index 88406fea1391..45c22d20c183 100644
>> --- a/ofproto/ofproto-dpif-upcall.c
>> +++ b/ofproto/ofproto-dpif-upcall.c
>> @@ -636,24 +636,61 @@ udpif_set_threads(struct udpif *udpif, uint32_t n_handlers_,
>>                    uint32_t n_revalidators_)
>>  {
>>      ovs_assert(udpif);
>> -    ovs_assert(n_handlers_ && n_revalidators_);
>> +    uint32_t n_handlers_requested;
>> +    uint32_t n_revalidators_requested;
>> +    bool forced = false;
>> +
>> +    if (dpif_number_handlers_required(udpif->dpif, &n_handlers_requested)) {
>> +        forced = true;
>> +        if (!n_revalidators_) {
>> +            n_revalidators_requested = n_handlers_requested / 4 + 1;
>> +        } else {
>> +            n_revalidators_requested = n_revalidators_;
>> +        }
>> +    } else {
>> +        int threads = MAX(count_cpu_cores(), 2);
>> +
>> +        n_revalidators_requested = MAX(n_revalidators_, 0);
>> +        n_handlers_requested = MAX(n_handlers_, 0);
>>  
>> -    if (udpif->n_handlers != n_handlers_
>> -        || udpif->n_revalidators != n_revalidators_) {
>> +        if (!n_revalidators_requested) {
>> +            n_revalidators_requested = n_handlers_requested
>> +            ? MAX(threads - (int) n_handlers_requested, 1)
>> +            : threads / 4 + 1;
>> +        }
>> +
>> +        if (!n_handlers_requested) {
>> +            n_handlers_requested = MAX(threads -
>> +                                       (int) n_revalidators_requested, 1);
>> +        }
>> +    }
>> +
>> +    if (udpif->n_handlers != n_handlers_requested
>> +        || udpif->n_revalidators != n_revalidators_requested) {
>> +        if (forced) {
>> +            VLOG_INFO("Overriding n-handler-threads to %u, setting "
>> +                      "n-revalidator-threads to %u", n_handlers_requested,
>> +                      n_revalidators_requested);
>> +            } else {
>> +            VLOG_INFO("Setting n-handler-threads to %u, setting "
>> +                      "n-revalidator-threads to %u", n_handlers_requested,
>> +                      n_revalidators_requested);
>> +        }
>>          udpif_stop_threads(udpif, true);
>>      }
>>  
>>      if (!udpif->handlers && !udpif->revalidators) {
>> +        VLOG_INFO("Starting %u threads", n_handlers_requested +
>> +                                         n_revalidators_requested);
>>          int error;
>> -
>> -        error = dpif_handlers_set(udpif->dpif, n_handlers_);
>> +        error = dpif_handlers_set(udpif->dpif, n_handlers_requested);
>>          if (error) {
>>              VLOG_ERR("failed to configure handlers in dpif %s: %s",
>>                       dpif_name(udpif->dpif), ovs_strerror(error));
>>              return;
>>          }
>> -
>> -        udpif_start_threads(udpif, n_handlers_, n_revalidators_);
>> +        udpif_start_threads(udpif, n_handlers_requested,
>> +                            n_revalidators_requested);
>>      }
>>  }
>>  
>> diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c
>> index 69e5834e766c..1501eca9a989 100644
>> --- a/ofproto/ofproto.c
>> +++ b/ofproto/ofproto.c
>> @@ -792,20 +792,8 @@ ofproto_type_set_config(const char *datapath_type, const struct smap *cfg)
>>  void
>>  ofproto_set_threads(int n_handlers_, int n_revalidators_)
>>  {
>> -    int threads = MAX(count_cpu_cores(), 2);
>> -
>>      n_revalidators = MAX(n_revalidators_, 0);
>>      n_handlers = MAX(n_handlers_, 0);
>> -
>> -    if (!n_revalidators) {
>> -        n_revalidators = n_handlers
>> -            ? MAX(threads - (int) n_handlers, 1)
>> -            : threads / 4 + 1;
>> -    }
>> -
>> -    if (!n_handlers) {
>> -        n_handlers = MAX(threads - (int) n_revalidators, 1);
>> -    }
>>  }
>>  
>>  void
>> -- 
>> 2.27.0
>>
> 

