Re: [ovs-dev] [RFC net-next] openvswitch: Introduce per-cpu upcall dispatch

2021-06-30  Mark Gray
On 19/05/2021 22:47, Pravin Shelar wrote:

Thanks for the review and sorry for the delay. I will be more responsive
to any further changes from this point on.

> On Fri, Apr 30, 2021 at 8:33 AM Mark Gray  wrote:
>>
>> The Open vSwitch kernel module uses the upcall mechanism to send
>> packets from kernel space to user space when it misses in the kernel
>> space flow table. The upcall sends packets via a Netlink socket.
>> Currently, a Netlink socket is created for every vport. In this way,
>> there is a 1:1 mapping between a vport and a Netlink socket.
>> When a packet is received by a vport, if it needs to be sent to
>> user space, it is sent via the corresponding Netlink socket.
>>
>> This mechanism, with various iterations of the corresponding user
>> space code, has seen some limitations and issues:
>>
>> * On systems with a large number of vports, there is a correspondingly
>> large number of Netlink sockets which can limit scaling.
>> (https://bugzilla.redhat.com/show_bug.cgi?id=1526306)
>> * Packet reordering on upcalls.
>> (https://bugzilla.redhat.com/show_bug.cgi?id=1844576)
>> * A thundering herd issue.
>> (https://bugzilla.redhat.com/show_bug.cgi?id=183)
>>
>> This patch introduces an alternative, feature-negotiated, upcall
>> mode using a per-cpu dispatch rather than a per-vport dispatch.
>>
>> In this mode, the Netlink socket to be used for the upcall is
>> selected based on the CPU of the thread that is executing the upcall.
>> In this way, it resolves the issues above as:
>>
>> a) The number of Netlink sockets scales with the number of CPUs
>> rather than the number of vports.
>> b) Ordering per-flow is maintained as packets are distributed to
>> CPUs based on mechanisms such as RSS and flows are distributed
>> to a single user space thread.
>> c) Packets from a flow can only wake up one user space thread.
>>
>> The corresponding user space code can be found at:
>> https://mail.openvswitch.org/pipermail/ovs-dev/2021-April/382618.html
>>
>> Bugzilla: https://bugzilla.redhat.com/1844576
>> Signed-off-by: Mark Gray 
>> ---
>>  include/uapi/linux/openvswitch.h |  8 
>>  net/openvswitch/datapath.c   | 70 +++-
>>  net/openvswitch/datapath.h   | 18 
>>  net/openvswitch/flow_netlink.c   |  4 --
>>  4 files changed, 94 insertions(+), 6 deletions(-)
>>
>> diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
>> index 8d16744edc31..6571b57b2268 100644
>> --- a/include/uapi/linux/openvswitch.h
>> +++ b/include/uapi/linux/openvswitch.h
>> @@ -70,6 +70,8 @@ enum ovs_datapath_cmd {
>>   * set on the datapath port (for OVS_ACTION_ATTR_MISS).  Only valid on
>>   * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should
>>   * not be sent.
>> + * OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when
>> + * OVS_DP_F_DISPATCH_UPCALL_PER_CPU feature is set.
>>   * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the
>>   * datapath.  Always present in notifications.
>>   * @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for the
>> @@ -87,6 +89,9 @@ enum ovs_datapath_attr {
>> OVS_DP_ATTR_USER_FEATURES,  /* OVS_DP_F_*  */
>> OVS_DP_ATTR_PAD,
>> OVS_DP_ATTR_MASKS_CACHE_SIZE,
>> +   OVS_DP_ATTR_PER_CPU_PIDS,   /* Netlink PIDS to receive upcalls in per-cpu
>> +* dispatch mode
>> +*/
>> __OVS_DP_ATTR_MAX
>>  };
>>
>> @@ -127,6 +132,9 @@ struct ovs_vport_stats {
>>  /* Allow tc offload recirc sharing */
>>  #define OVS_DP_F_TC_RECIRC_SHARING (1 << 2)
>>
>> +/* Allow per-cpu dispatch of upcalls */
>> +#define OVS_DP_F_DISPATCH_UPCALL_PER_CPU   (1 << 3)
>> +
>>  /* Fixed logical ports. */
>>  #define OVSP_LOCAL  ((__u32)0)
>>
>> diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
>> index 9d6ef6cb9b26..98d54f41fdaa 100644
>> --- a/net/openvswitch/datapath.c
>> +++ b/net/openvswitch/datapath.c
>> @@ -121,6 +121,8 @@ int lockdep_ovsl_is_held(void)
>>  #endif
>>
>>  static struct vport *new_vport(const struct vport_parms *);
>> +static u32 ovs_dp_get_upcall_portid(const struct datapath *, uint32_t);
>> +static int ovs_dp_set_upcall_portids(struct datapath *, const struct nlattr *);
>>  static int queue_gso_packets(struct datapath *dp, struct sk_buff *,
>>  const struct sw_flow_key *,
>>  const struct dp_upcall_info *,
>> @@ -238,7 +240,12 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
>>
>> memset(&upcall, 0, sizeof(upcall));
>> upcall.cmd = OVS_PACKET_CMD_MISS;
>> -   upcall.portid = ovs_vport_find_upcall_portid(p, skb);
>> +
>> +   if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
>> +   upcall.portid = ovs_dp_get_upcall_portid(dp, smp_processor_id());
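
The quoted patch is cut off above, so the body of ovs_dp_get_upcall_portid() is
not visible in this excerpt. As a minimal sketch of the idea, assuming the
per-cpu PIDs live in an RCU-protected array hung off struct datapath (the
struct dp_nlsk_pids layout and the dp->upcall_portids field below are
illustrative assumptions, not necessarily the patch's exact code):

/* Sketch only: per-cpu portid lookup matching the prototype declared in
 * the diff above.  Struct layout and field names are assumed.
 */
struct dp_nlsk_pids {
	struct rcu_head rcu;
	u32 n_pids;	/* number of entries in pids[] */
	u32 pids[];	/* one Netlink portid per CPU */
};

static u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id)
{
	struct dp_nlsk_pids *pids;

	pids = rcu_dereference(dp->upcall_portids);	/* assumed field */
	if (pids && cpu_id < pids->n_pids)
		return pids->pids[cpu_id];

	/* No portid configured for this CPU: returning 0 means the upcall
	 * is not sent, matching the per-vport behaviour when no socket is
	 * configured.
	 */
	return 0;
}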

Re: [ovs-dev] [RFC net-next] openvswitch: Introduce per-cpu upcall dispatch

2021-06-30  Mark Gray
On 28/05/2021 20:49, Flavio Leitner wrote:
> 
> Hi Mark,
> 
> I think this patch is going in the right direction but there
> are some points that I think we should address. See below.
> 
> On Fri, Apr 30, 2021 at 11:33:25AM -0400, Mark Gray wrote:
>> The Open vSwitch kernel module uses the upcall mechanism to send
>> packets from kernel space to user space when it misses in the kernel
>> space flow table. The upcall sends packets via a Netlink socket.
>> Currently, a Netlink socket is created for every vport. In this way,
>> there is a 1:1 mapping between a vport and a Netlink socket.
>> When a packet is received by a vport, if it needs to be sent to
>> user space, it is sent via the corresponding Netlink socket.
>>
>> This mechanism, with various iterations of the corresponding user
>> space code, has seen some limitations and issues:
>>
>> * On systems with a large number of vports, there is a correspondingly
>> large number of Netlink sockets which can limit scaling.
>> (https://bugzilla.redhat.com/show_bug.cgi?id=1526306)
>> * Packet reordering on upcalls.
>> (https://bugzilla.redhat.com/show_bug.cgi?id=1844576)
>> * A thundering herd issue.
>> (https://bugzilla.redhat.com/show_bug.cgi?id=183)
>>
>> This patch introduces an alternative, feature-negotiated, upcall
>> mode using a per-cpu dispatch rather than a per-vport dispatch.
>>
>> In this mode, the Netlink socket to be used for the upcall is
>> selected based on the CPU of the thread that is executing the upcall.
>> In this way, it resolves the issues above as:
>>
>> a) The number of Netlink sockets scales with the number of CPUs
>> rather than the number of vports.
>> b) Ordering per-flow is maintained as packets are distributed to
>> CPUs based on mechanisms such as RSS and flows are distributed
>> to a single user space thread.
>> c) Packets from a flow can only wake up one user space thread.
>>
>> The corresponding user space code can be found at:
>> https://mail.openvswitch.org/pipermail/ovs-dev/2021-April/382618.html
> 
> Thanks for writing a nice commit description.
> 
>>
>> Bugzilla: https://bugzilla.redhat.com/1844576
>> Signed-off-by: Mark Gray 
>> ---
>>  include/uapi/linux/openvswitch.h |  8 
>>  net/openvswitch/datapath.c   | 70 +++-
>>  net/openvswitch/datapath.h   | 18 
>>  net/openvswitch/flow_netlink.c   |  4 --
>>  4 files changed, 94 insertions(+), 6 deletions(-)
>>
>> diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
>> index 8d16744edc31..6571b57b2268 100644
>> --- a/include/uapi/linux/openvswitch.h
>> +++ b/include/uapi/linux/openvswitch.h
>> @@ -70,6 +70,8 @@ enum ovs_datapath_cmd {
>>   * set on the datapath port (for OVS_ACTION_ATTR_MISS).  Only valid on
>>   * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should
>>   * not be sent.
>> + * OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when
>> + * OVS_DP_F_DISPATCH_UPCALL_PER_CPU feature is set.
>>   * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the
>>   * datapath.  Always present in notifications.
>>   * @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for the
>> @@ -87,6 +89,9 @@ enum ovs_datapath_attr {
>>  OVS_DP_ATTR_USER_FEATURES,  /* OVS_DP_F_*  */
>>  OVS_DP_ATTR_PAD,
>>  OVS_DP_ATTR_MASKS_CACHE_SIZE,
>> +OVS_DP_ATTR_PER_CPU_PIDS,   /* Netlink PIDS to receive upcalls in per-cpu
>> + * dispatch mode
>> + */
>>  __OVS_DP_ATTR_MAX
>>  };
>>  
>> @@ -127,6 +132,9 @@ struct ovs_vport_stats {
>>  /* Allow tc offload recirc sharing */
>>  #define OVS_DP_F_TC_RECIRC_SHARING  (1 << 2)
>>  
>> +/* Allow per-cpu dispatch of upcalls */
>> +#define OVS_DP_F_DISPATCH_UPCALL_PER_CPU(1 << 3)
>> +
>>  /* Fixed logical ports. */
>>  #define OVSP_LOCAL  ((__u32)0)
>>  
>> diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
>> index 9d6ef6cb9b26..98d54f41fdaa 100644
>> --- a/net/openvswitch/datapath.c
>> +++ b/net/openvswitch/datapath.c
>> @@ -121,6 +121,8 @@ int lockdep_ovsl_is_held(void)
>>  #endif
>>  
>>  static struct vport *new_vport(const struct vport_parms *);
>> +static u32 ovs_dp_get_upcall_portid(const struct datapath *, uint32_t);
>> +static int ovs_dp_set_upcall_portids(struct datapath *, const struct nlattr *);
>>  static int queue_gso_packets(struct datapath *dp, struct sk_buff *,
>>   const struct sw_flow_key *,
>>   const struct dp_upcall_info *,
>> @@ -238,7 +240,12 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
>>  
>>  memset(&upcall, 0, sizeof(upcall));
>>  upcall.cmd = OVS_PACKET_CMD_MISS;
>> -upcall.portid = ovs_vport_find_upcall_portid(p, skb);
>> +
>> +if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
>> +upcall.portid = ovs_dp_get_upcall_portid(dp, smp_processor_id());
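
The userspace half of the series is only linked above. As a rough illustration
of the model (not the actual ovs-vswitchd code from that series), one generic
Netlink socket is opened per CPU and the kernel-assigned portid of each socket
is what ends up in the OVS_DP_ATTR_PER_CPU_PIDS array; building and sending the
OVS_DP_CMD_SET request itself is omitted here:

/* Illustration only: open one NETLINK_GENERIC socket per CPU and collect
 * the kernel-assigned portids.  The pids[] array filled in here is what a
 * real implementation would pack into OVS_DP_ATTR_PER_CPU_PIDS.
 */
#include <linux/netlink.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int open_per_cpu_upcall_socks(int n_cpus, int *socks, uint32_t *pids)
{
	for (int cpu = 0; cpu < n_cpus; cpu++) {
		struct sockaddr_nl addr;
		socklen_t len = sizeof(addr);
		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);

		if (fd < 0)
			return -1;

		memset(&addr, 0, sizeof(addr));
		addr.nl_family = AF_NETLINK;
		/* nl_pid == 0 asks the kernel to assign a unique portid. */
		if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
		    getsockname(fd, (struct sockaddr *)&addr, &len) < 0) {
			close(fd);
			return -1;
		}

		socks[cpu] = fd;
		pids[cpu] = addr.nl_pid;
	}
	return 0;
}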

Re: [ovs-dev] [RFC net-next] openvswitch: Introduce per-cpu upcall dispatch

2021-05-28  Flavio Leitner


Hi Mark,

I think this patch is going in the right direction but there
are some points that I think we should address. See below.

On Fri, Apr 30, 2021 at 11:33:25AM -0400, Mark Gray wrote:
> The Open vSwitch kernel module uses the upcall mechanism to send
> packets from kernel space to user space when it misses in the kernel
> space flow table. The upcall sends packets via a Netlink socket.
> Currently, a Netlink socket is created for every vport. In this way,
> there is a 1:1 mapping between a vport and a Netlink socket.
> When a packet is received by a vport, if it needs to be sent to
> user space, it is sent via the corresponding Netlink socket.
> 
> This mechanism, with various iterations of the corresponding user
> space code, has seen some limitations and issues:
> 
> * On systems with a large number of vports, there is a correspondingly
> large number of Netlink sockets which can limit scaling.
> (https://bugzilla.redhat.com/show_bug.cgi?id=1526306)
> * Packet reordering on upcalls.
> (https://bugzilla.redhat.com/show_bug.cgi?id=1844576)
> * A thundering herd issue.
> (https://bugzilla.redhat.com/show_bug.cgi?id=183)
> 
> This patch introduces an alternative, feature-negotiated, upcall
> mode using a per-cpu dispatch rather than a per-vport dispatch.
> 
> In this mode, the Netlink socket to be used for the upcall is
> selected based on the CPU of the thread that is executing the upcall.
> In this way, it resolves the issues above as:
> 
> a) The number of Netlink sockets scales with the number of CPUs
> rather than the number of vports.
> b) Ordering per-flow is maintained as packets are distributed to
> CPUs based on mechanisms such as RSS and flows are distributed
> to a single user space thread.
> c) Packets from a flow can only wake up one user space thread.
> 
> The corresponding user space code can be found at:
> https://mail.openvswitch.org/pipermail/ovs-dev/2021-April/382618.html

Thanks for writing a nice commit description.

> 
> Bugzilla: https://bugzilla.redhat.com/1844576
> Signed-off-by: Mark Gray 
> ---
>  include/uapi/linux/openvswitch.h |  8 
>  net/openvswitch/datapath.c   | 70 +++-
>  net/openvswitch/datapath.h   | 18 
>  net/openvswitch/flow_netlink.c   |  4 --
>  4 files changed, 94 insertions(+), 6 deletions(-)
> 
> diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
> index 8d16744edc31..6571b57b2268 100644
> --- a/include/uapi/linux/openvswitch.h
> +++ b/include/uapi/linux/openvswitch.h
> @@ -70,6 +70,8 @@ enum ovs_datapath_cmd {
>   * set on the datapath port (for OVS_ACTION_ATTR_MISS).  Only valid on
>   * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should
>   * not be sent.
> + * OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when
> + * OVS_DP_F_DISPATCH_UPCALL_PER_CPU feature is set.
>   * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the
>   * datapath.  Always present in notifications.
>   * @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for the
> @@ -87,6 +89,9 @@ enum ovs_datapath_attr {
>   OVS_DP_ATTR_USER_FEATURES,  /* OVS_DP_F_*  */
>   OVS_DP_ATTR_PAD,
>   OVS_DP_ATTR_MASKS_CACHE_SIZE,
> + OVS_DP_ATTR_PER_CPU_PIDS,   /* Netlink PIDS to receive upcalls in per-cpu
> +  * dispatch mode
> +  */
>   __OVS_DP_ATTR_MAX
>  };
>  
> @@ -127,6 +132,9 @@ struct ovs_vport_stats {
>  /* Allow tc offload recirc sharing */
>  #define OVS_DP_F_TC_RECIRC_SHARING   (1 << 2)
>  
> +/* Allow per-cpu dispatch of upcalls */
> +#define OVS_DP_F_DISPATCH_UPCALL_PER_CPU (1 << 3)
> +
>  /* Fixed logical ports. */
>  #define OVSP_LOCAL  ((__u32)0)
>  
> diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
> index 9d6ef6cb9b26..98d54f41fdaa 100644
> --- a/net/openvswitch/datapath.c
> +++ b/net/openvswitch/datapath.c
> @@ -121,6 +121,8 @@ int lockdep_ovsl_is_held(void)
>  #endif
>  
>  static struct vport *new_vport(const struct vport_parms *);
> +static u32 ovs_dp_get_upcall_portid(const struct datapath *, uint32_t);
> +static int ovs_dp_set_upcall_portids(struct datapath *, const struct nlattr *);
>  static int queue_gso_packets(struct datapath *dp, struct sk_buff *,
>const struct sw_flow_key *,
>const struct dp_upcall_info *,
> @@ -238,7 +240,12 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
>  
>   memset(&upcall, 0, sizeof(upcall));
>   upcall.cmd = OVS_PACKET_CMD_MISS;
> - upcall.portid = ovs_vport_find_upcall_portid(p, skb);
> +
> + if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
> + upcall.portid = ovs_dp_get_upcall_portid(dp, smp_processor_id());
> + else
> + upcall.portid = ovs_vport_find_upcall_portid(p, skb);
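
The new mode is opt-in: userspace requests it by setting the
OVS_DP_F_DISPATCH_UPCALL_PER_CPU bit in OVS_DP_ATTR_USER_FEATURES and supplying
the portid array in OVS_DP_ATTR_PER_CPU_PIDS; otherwise the datapath keeps the
per-vport dispatch. A much-simplified sketch of how the datapath create/change
path could tie the two together (this is not a hunk from the patch; the
existing feature handling, capability checks and error paths are left out):

/* Simplified sketch of the feature negotiation on OVS_DP_CMD_NEW/SET.
 * ovs_dp_set_upcall_portids() is assumed to match the prototype declared
 * in the quoted diff.
 */
static int ovs_dp_change(struct datapath *dp, struct nlattr *a[])
{
	u32 user_features = 0;

	if (a[OVS_DP_ATTR_USER_FEATURES])
		user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]);

	dp->user_features = user_features;

	if ((dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU) &&
	    a[OVS_DP_ATTR_PER_CPU_PIDS]) {
		/* Store the per-cpu Netlink portids supplied by userspace. */
		int err = ovs_dp_set_upcall_portids(dp, a[OVS_DP_ATTR_PER_CPU_PIDS]);

		if (err)
			return err;
	}

	return 0;
}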

Re: [ovs-dev] [RFC net-next] openvswitch: Introduce per-cpu upcall dispatch

2021-05-19  Pravin Shelar
On Fri, Apr 30, 2021 at 8:33 AM Mark Gray  wrote:
>
> The Open vSwitch kernel module uses the upcall mechanism to send
> packets from kernel space to user space when it misses in the kernel
> space flow table. The upcall sends packets via a Netlink socket.
> Currently, a Netlink socket is created for every vport. In this way,
> there is a 1:1 mapping between a vport and a Netlink socket.
> When a packet is received by a vport, if it needs to be sent to
> user space, it is sent via the corresponding Netlink socket.
>
> This mechanism, with various iterations of the corresponding user
> space code, has seen some limitations and issues:
>
> * On systems with a large number of vports, there is a correspondingly
> large number of Netlink sockets which can limit scaling.
> (https://bugzilla.redhat.com/show_bug.cgi?id=1526306)
> * Packet reordering on upcalls.
> (https://bugzilla.redhat.com/show_bug.cgi?id=1844576)
> * A thundering herd issue.
> (https://bugzilla.redhat.com/show_bug.cgi?id=183)
>
> This patch introduces an alternative, feature-negotiated, upcall
> mode using a per-cpu dispatch rather than a per-vport dispatch.
>
> In this mode, the Netlink socket to be used for the upcall is
> selected based on the CPU of the thread that is executing the upcall.
> In this way, it resolves the issues above as:
>
> a) The number of Netlink sockets scales with the number of CPUs
> rather than the number of vports.
> b) Ordering per-flow is maintained as packets are distributed to
> CPUs based on mechanisms such as RSS and flows are distributed
> to a single user space thread.
> c) Packets from a flow can only wake up one user space thread.
>
> The corresponding user space code can be found at:
> https://mail.openvswitch.org/pipermail/ovs-dev/2021-April/382618.html
>
> Bugzilla: https://bugzilla.redhat.com/1844576
> Signed-off-by: Mark Gray 
> ---
>  include/uapi/linux/openvswitch.h |  8 
>  net/openvswitch/datapath.c   | 70 +++-
>  net/openvswitch/datapath.h   | 18 
>  net/openvswitch/flow_netlink.c   |  4 --
>  4 files changed, 94 insertions(+), 6 deletions(-)
>
> diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
> index 8d16744edc31..6571b57b2268 100644
> --- a/include/uapi/linux/openvswitch.h
> +++ b/include/uapi/linux/openvswitch.h
> @@ -70,6 +70,8 @@ enum ovs_datapath_cmd {
>   * set on the datapath port (for OVS_ACTION_ATTR_MISS).  Only valid on
>   * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should
>   * not be sent.
> + * OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when
> + * OVS_DP_F_DISPATCH_UPCALL_PER_CPU feature is set.
>   * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the
>   * datapath.  Always present in notifications.
>   * @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for the
> @@ -87,6 +89,9 @@ enum ovs_datapath_attr {
> OVS_DP_ATTR_USER_FEATURES,  /* OVS_DP_F_*  */
> OVS_DP_ATTR_PAD,
> OVS_DP_ATTR_MASKS_CACHE_SIZE,
> +   OVS_DP_ATTR_PER_CPU_PIDS,   /* Netlink PIDS to receive upcalls in per-cpu
> +* dispatch mode
> +*/
> __OVS_DP_ATTR_MAX
>  };
>
> @@ -127,6 +132,9 @@ struct ovs_vport_stats {
>  /* Allow tc offload recirc sharing */
>  #define OVS_DP_F_TC_RECIRC_SHARING (1 << 2)
>
> +/* Allow per-cpu dispatch of upcalls */
> +#define OVS_DP_F_DISPATCH_UPCALL_PER_CPU   (1 << 3)
> +
>  /* Fixed logical ports. */
>  #define OVSP_LOCAL  ((__u32)0)
>
> diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
> index 9d6ef6cb9b26..98d54f41fdaa 100644
> --- a/net/openvswitch/datapath.c
> +++ b/net/openvswitch/datapath.c
> @@ -121,6 +121,8 @@ int lockdep_ovsl_is_held(void)
>  #endif
>
>  static struct vport *new_vport(const struct vport_parms *);
> +static u32 ovs_dp_get_upcall_portid(const struct datapath *, uint32_t);
> +static int ovs_dp_set_upcall_portids(struct datapath *, const struct nlattr *);
>  static int queue_gso_packets(struct datapath *dp, struct sk_buff *,
>  const struct sw_flow_key *,
>  const struct dp_upcall_info *,
> @@ -238,7 +240,12 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key)
>
> memset(&upcall, 0, sizeof(upcall));
> upcall.cmd = OVS_PACKET_CMD_MISS;
> -   upcall.portid = ovs_vport_find_upcall_portid(p, skb);
> +
> +   if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
> +   upcall.portid = ovs_dp_get_upcall_portid(dp, smp_processor_id());
> +   else
> +   upcall.portid = ovs_vport_find_upcall_portid(p, skb);
> +
> upcall.mru = OVS_CB(skb)->mru;
> error = ovs_dp_upcall(dp, skb, key, &upcall, 0);
> if (unlikely(error))
>
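
The set side declared in the quoted diff, ovs_dp_set_upcall_portids(), also
falls outside this excerpt. A minimal sketch, reusing the assumed dp_nlsk_pids
layout and dp->upcall_portids field from the lookup sketch earlier in the
thread, and assuming the attribute payload is a flat array of u32 portids:

/* Sketch only: replace the per-cpu portid array under RCU.  Layout and
 * field names are assumptions carried over from the lookup sketch.
 */
static int ovs_dp_set_upcall_portids(struct datapath *dp,
				     const struct nlattr *ids)
{
	struct dp_nlsk_pids *old, *new;
	u32 n_pids = nla_len(ids) / sizeof(u32);

	if (!n_pids)
		return -EINVAL;

	new = kmalloc(struct_size(new, pids, n_pids), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	new->n_pids = n_pids;
	nla_memcpy(new->pids, ids, n_pids * sizeof(u32));

	old = ovsl_dereference(dp->upcall_portids);	/* assumed field */
	rcu_assign_pointer(dp->upcall_portids, new);
	if (old)
		kfree_rcu(old, rcu);

	return 0;
}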