[ovs-dev] [PATCH RFC ovn 4/4] ovn-controller: Skip non-local lflows in ovn-controller before parsing.

2021-06-30 Thread Han Zhou
With the help of logical_flow's in_out_port tag, we can skip parsing a
big portion of the logical flows in SB DB, which can largely improve
ovn-controller's performance whenever a full recompute is required.

With a scale test topology of 1000 chassis, 20 LSPs per chassis, 20k
lports in total spread across 200 logical switches, connected by a
logical router, the test result before & after this change:

Before:
- lflow-cache disabled:
- ovn-controller recompute: 2.7 sec
- lflow-cache enabled:
- ovn-controller recompute: 2.1 sec
- lflow cache memory: 622103 KB

After:
- lflow-cache disabled:
- ovn-controller recompute: 0.83 sec
- lflow-cache enabled:
- ovn-controller recompute: 0.71 sec
- lflow cache memory: 123641 KB

(note: DP group enabled for both)

So for this test scenario, when lflow cache is disabled, latency reduced
~70%; when lflow cache is enabled, latency reduced ~65% and lflow cache
memory reduced ~80%.

Signed-off-by: Han Zhou 
---
 controller/lflow.c  | 21 +
 controller/lflow.h  |  1 +
 controller/ovn-controller.c |  1 +
 3 files changed, 23 insertions(+)

diff --git a/controller/lflow.c b/controller/lflow.c
index b7699a309..ee05c559c 100644
--- a/controller/lflow.c
+++ b/controller/lflow.c
@@ -740,6 +740,27 @@ consider_logical_flow__(const struct sbrec_logical_flow 
*lflow,
 return true;
 }
 
+const char *io_port = smap_get(>tags, "in_out_port");
+if (io_port) {
+lflow_resource_add(l_ctx_out->lfrr, REF_TYPE_PORTBINDING, io_port,
+   >header_.uuid);
+const struct sbrec_port_binding *pb
+= lport_lookup_by_name(l_ctx_in->sbrec_port_binding_by_name,
+   io_port);
+if (!pb) {
+VLOG_DBG("lflow "UUID_FMT" matches inport/outport %s that's not "
+ "found, skip", UUID_ARGS(>header_.uuid), io_port);
+return true;
+}
+char buf[16];
+get_unique_lport_key(dp->tunnel_key, pb->tunnel_key, buf, sizeof buf);
+if (!sset_contains(l_ctx_in->local_lport_ids, buf)) {
+VLOG_DBG("lflow "UUID_FMT" matches inport/outport %s that's not "
+ "local, skip", UUID_ARGS(>header_.uuid), io_port);
+return true;
+}
+}
+
 /* Determine translation of logical table IDs to physical table IDs. */
 bool ingress = !strcmp(lflow->pipeline, "ingress");
 
diff --git a/controller/lflow.h b/controller/lflow.h
index 9d8882ae5..797d2d026 100644
--- a/controller/lflow.h
+++ b/controller/lflow.h
@@ -146,6 +146,7 @@ struct lflow_ctx_in {
 const struct shash *port_groups;
 const struct sset *active_tunnels;
 const struct sset *local_lport_ids;
+const struct sset *local_lports;
 };
 
 struct lflow_ctx_out {
diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
index b15ecbb5d..24da79628 100644
--- a/controller/ovn-controller.c
+++ b/controller/ovn-controller.c
@@ -2048,6 +2048,7 @@ init_lflow_ctx(struct engine_node *node,
 l_ctx_in->port_groups = port_groups;
 l_ctx_in->active_tunnels = _data->active_tunnels;
 l_ctx_in->local_lport_ids = _data->local_lport_ids;
+l_ctx_in->local_lports = _data->local_lports;
 
 l_ctx_out->flow_table = >flow_table;
 l_ctx_out->group_table = >group_table;
-- 
2.30.2

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH RFC ovn 3/4] ovn-northd: Populate in_out_port in logical_flow table's tags.

2021-06-30 Thread Han Zhou
Populate the in_out_port tag for logical switch pipeline flows wherever
possible.

Signed-off-by: Han Zhou 
---
 northd/ovn-northd.c | 272 +---
 1 file changed, 155 insertions(+), 117 deletions(-)

diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index 83746f4ab..f60dab7a9 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -4046,6 +4046,7 @@ struct ovn_lflow {
 uint16_t priority;
 char *match;
 char *actions;
+char *io_port;
 char *stage_hint;
 const char *where;
 };
@@ -4081,7 +4082,7 @@ ovn_lflow_equal(const struct ovn_lflow *a, const struct 
ovn_datapath *od,
 static void
 ovn_lflow_init(struct ovn_lflow *lflow, struct ovn_datapath *od,
enum ovn_stage stage, uint16_t priority,
-   char *match, char *actions, char *stage_hint,
+   char *match, char *actions, char *io_port, char *stage_hint,
const char *where)
 {
 hmapx_init(>od_group);
@@ -4090,6 +4091,7 @@ ovn_lflow_init(struct ovn_lflow *lflow, struct 
ovn_datapath *od,
 lflow->priority = priority;
 lflow->match = match;
 lflow->actions = actions;
+lflow->io_port = io_port;
 lflow->stage_hint = stage_hint;
 lflow->where = where;
 }
@@ -4107,7 +4109,7 @@ static struct hashrow_locks lflow_locks;
 static void
 do_ovn_lflow_add(struct hmap *lflow_map, struct ovn_datapath *od,
  uint32_t hash, enum ovn_stage stage, uint16_t priority,
- const char *match, const char *actions,
+ const char *match, const char *actions, const char *io_port,
  const struct ovsdb_idl_row *stage_hint,
  const char *where)
 {
@@ -4130,6 +4132,7 @@ do_ovn_lflow_add(struct hmap *lflow_map, struct 
ovn_datapath *od,
  * one datapath in a group, so it could be hashed correctly. */
 ovn_lflow_init(lflow, NULL, stage, priority,
xstrdup(match), xstrdup(actions),
+   io_port ? xstrdup(io_port) : NULL,
ovn_lflow_hint(stage_hint), where);
 hmapx_add(>od_group, od);
 hmap_insert_fast(lflow_map, >hmap_node, hash);
@@ -4139,7 +4142,7 @@ do_ovn_lflow_add(struct hmap *lflow_map, struct 
ovn_datapath *od,
 static void
 ovn_lflow_add_at(struct hmap *lflow_map, struct ovn_datapath *od,
  enum ovn_stage stage, uint16_t priority,
- const char *match, const char *actions,
+ const char *match, const char *actions, const char *io_port,
  const struct ovsdb_idl_row *stage_hint, const char *where)
 {
 ovs_assert(ovn_stage_to_datapath_type(stage) == ovn_datapath_get_type(od));
@@ -4154,11 +4157,11 @@ ovn_lflow_add_at(struct hmap *lflow_map, struct 
ovn_datapath *od,
 if (use_logical_dp_groups && use_parallel_build) {
 lock_hash_row(_locks, hash);
 do_ovn_lflow_add(lflow_map, od, hash, stage, priority, match,
- actions, stage_hint, where);
+ actions, io_port, stage_hint, where);
 unlock_hash_row(_locks, hash);
 } else {
 do_ovn_lflow_add(lflow_map, od, hash, stage, priority, match,
- actions, stage_hint, where);
+ actions, io_port, stage_hint, where);
 }
 }
 
@@ -4166,11 +4169,27 @@ ovn_lflow_add_at(struct hmap *lflow_map, struct 
ovn_datapath *od,
 #define ovn_lflow_add_with_hint(LFLOW_MAP, OD, STAGE, PRIORITY, MATCH, \
 ACTIONS, STAGE_HINT) \
 ovn_lflow_add_at(LFLOW_MAP, OD, STAGE, PRIORITY, MATCH, ACTIONS, \
- STAGE_HINT, OVS_SOURCE_LOCATOR)
+ NULL, STAGE_HINT, OVS_SOURCE_LOCATOR)
+
+/* This macro is similar to ovn_lflow_add_with_hint, except that it requires
+ * the IN_OUT_PORT argument, which tells the lport name that appears in the
+ * MATCH, which helps ovn-controller to bypass lflows parsing when the lport is
+ * not local to the chassis. The criteria of the lport to be added using this
+ * argument:
+ *
+ * - For ingress pipeline, the lport that is used to match "inport".
+ * - For egress pipeline, the lport that is used to match "outport".
+ *
+ * For now, only LS pipelines should use this macro.  */
+#define ovn_lflow_add_with_lport_and_hint(LFLOW_MAP, OD, STAGE, PRIORITY, \
+  MATCH, ACTIONS, IN_OUT_PORT, \
+  STAGE_HINT) \
+ovn_lflow_add_at(LFLOW_MAP, OD, STAGE, PRIORITY, MATCH, ACTIONS, \
+ IN_OUT_PORT, STAGE_HINT, OVS_SOURCE_LOCATOR)
 
 #define ovn_lflow_add(LFLOW_MAP, OD, STAGE, PRIORITY, MATCH, ACTIONS) \
 ovn_lflow_add_at(LFLOW_MAP, OD, STAGE, PRIORITY, MATCH, ACTIONS, \
- NULL, OVS_SOURCE_LOCATOR)
+ NULL, NULL, OVS_SOURCE_LOCATOR)
 
 static struct ovn_lflow *
 ovn_lflow_find(const struct hmap *lflows, const struct ovn_datapath *od,
@@ -4196,6 +4215,7 @@ 

[ovs-dev] [PATCH RFC ovn 2/4] ovn-sb: Add tags column to logical_flow table of the SB DB.

2021-06-30 Thread Han Zhou
The column will provide information to help improve efficiency of
ovn-controller lflow parsing.

Signed-off-by: Han Zhou 
---
 ovn-sb.ovsschema |  7 +--
 ovn-sb.xml   | 23 +++
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/ovn-sb.ovsschema b/ovn-sb.ovsschema
index bbf60781d..33fcc1a3d 100644
--- a/ovn-sb.ovsschema
+++ b/ovn-sb.ovsschema
@@ -1,7 +1,7 @@
 {
 "name": "OVN_Southbound",
-"version": "20.18.0",
-"cksum": "1816525029 26536",
+"version": "20.19.0",
+"cksum": "4105410918 26688",
 "tables": {
 "SB_Global": {
 "columns": {
@@ -109,6 +109,9 @@
   "maxInteger": 65535}}},
 "match": {"type": "string"},
 "actions": {"type": "string"},
+"tags": {
+"type": {"key": "string", "value": "string",
+ "min": 0, "max": "unlimited"}},
 "external_ids": {
 "type": {"key": "string", "value": "string",
  "min": 0, "max": "unlimited"}}},
diff --git a/ovn-sb.xml b/ovn-sb.xml
index 69de4551b..a39778ee0 100644
--- a/ovn-sb.xml
+++ b/ovn-sb.xml
@@ -2441,6 +2441,29 @@ tcp.flags = RST;
   
 
 
+
+  Key-value pairs that provide additional information to help
+  ovn-controller process the logical flow. Below are the tags used
+  by ovn-controller.
+
+  
+in_out_port
+
+  In the logical flow's "match" column, if a logical port P is
+  compared with "inport" and the logical flow is on a logical switch
+  ingress pipeline, or if P is compared with "outport" and the
+  logical flow is on a logical switch egress pipeline, and the
+  expression is combined with other expressions (if any) using the
+  operator &&, then the port P should be added as the value in
+  this tag. If there are multiple logical ports meeting this criteria,
+  one of them can be added. ovn-controller uses this information to
+  skip parsing flows that are not needed on the chassis. Failing to add
+  the tag will affect efficiency, while adding wrong value will affect
+  correctness.
+
+  
+
+
 
   Human-readable name for this flow's stage in the pipeline.
 
-- 
2.30.2

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH RFC ovn 0/4] Avoid parsing non-local lflows with the help of tags in SB.

2021-06-30 Thread Han Zhou
With the help of a new column in Logical_Flow table that stores ingress/egress
lport information, ovn-controller can avoid parsing a big portion of the
logical flows in SB DB, which can largely improve ovn-controller's performance
whenever a full recompute is required.

With a scale test topology of 1000 chassis, 20 LSPs per chassis, 20k
lports in total spread across 200 logical switches, connected by a
logical router, the test result before & after this change:

Before:
- lflow-cache disabled:
- ovn-controller recompute: 2.7 sec
- lflow-cache enabled:
- ovn-controller recompute: 2.1 sec
- lflow cache memory: 622103 KB

After:
- lflow-cache disabled:
- ovn-controller recompute: 0.83 sec
- lflow-cache enabled:
- ovn-controller recompute: 0.71 sec
- lflow cache memory: 123641 KB

(note: DP group enabled for both)

So for this test scenario, when lflow cache is disabled, latency reduced
~70%; when lflow cache is enabled, latency reduced ~65% and lflow cache
memory reduced ~80%.

TODO: DDlog change for ovn-northd.

Note that this series applies on top of a pending patch:
https://patchwork.ozlabs.org/project/ovn/patch/20210629192257.1699504-1-hz...@ovn.org/

Han Zhou (4):
  ovn-northd.at: Minor improvement for the dp group test case.
  ovn-sb: Add tags column to logical_flow table of the SB DB.
  ovn-northd: Populate in_out_port in logical_flow table's tags.
  ovn-controller: Skip non-local lflows in ovn-controller before
parsing.

 controller/lflow.c  |  21 +++
 controller/lflow.h  |   1 +
 controller/ovn-controller.c |   1 +
 northd/ovn-northd.c | 272 
 ovn-sb.ovsschema|   7 +-
 ovn-sb.xml  |  23 +++
 tests/ovn-northd.at |   2 +-
 7 files changed, 207 insertions(+), 120 deletions(-)

-- 
2.30.2

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH RFC ovn 1/4] ovn-northd.at: Minor improvement for the dp group test case.

2021-06-30 Thread Han Zhou
When counting lsp-specific flows, use format "table" for ovn-sbctl
output to make sure each record is counted at most once.

Signed-off-by: Han Zhou 
---
 tests/ovn-northd.at | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
index 2c811f094..c00f5bc3e 100644
--- a/tests/ovn-northd.at
+++ b/tests/ovn-northd.at
@@ -2487,7 +2487,7 @@ check_row_count Logical_DP_Group 0
 
 dnl Number of logical flows that depends on logical switch or multicast group.
 dnl These will not be combined.
-n_flows_specific=$(ovn-sbctl --bare find Logical_Flow | grep -cE 'swp')
+n_flows_specific=$(ovn-sbctl -f table find Logical_Flow | grep -cE 'swp')
 echo "Number of specific flows: "${n_flows_specific}
 
 dnl Both logical switches configured identically, so there should be same
-- 
2.30.2

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH 1/2] dpdk: Remove default values for socket-mem and limit.

2021-06-30 Thread Michael Santana
On Wed, Jun 30, 2021 at 4:48 PM Rosemarie O'Riorden  wrote:
>
> From: Rosemarie O'Riorden 
>
> This change removes the default values for EAL args socket-mem and
> socket-limit. As DPDK supports dynamic memory allocation, there is no
> need to allocate a certain amount of memory on start-up, nor limit the
> amount of memory available, if not requested.
>
> Currently, socket-mem has a default value of 1024 when it is not
> configured by the user, and socket-limit takes on the value of socket-mem,
> 1024, by default. With this change, socket-mem is not configured by default,
> meaning that socket-limit is not either. Neither, either or both options can 
> be set.
>
> Reported at: https://bugzilla.redhat.com/show_bug.cgi?id=1949850
> Signed-off-by: Rosemarie O'Riorden 
> ---
>  Documentation/intro/install/dpdk.rst | 3 +--
>  NEWS | 2 ++
>  lib/dpdk.c   | 2 +-
>  vswitchd/vswitch.xml | 4 ++--
>  4 files changed, 6 insertions(+), 5 deletions(-)
>
> diff --git a/Documentation/intro/install/dpdk.rst 
> b/Documentation/intro/install/dpdk.rst
> index 612f2fdbc..92d1c8119 100644
> --- a/Documentation/intro/install/dpdk.rst
> +++ b/Documentation/intro/install/dpdk.rst
> @@ -290,8 +290,7 @@ listed below. Defaults will be provided for all values 
> not explicitly set.
>
>  ``dpdk-socket-mem``
>Comma separated list of memory to pre-allocate from hugepages on specific
> -  sockets. If not specified, 1024 MB will be set for each numa node by
> -  default.
> +  sockets. If not specified, this option will not be set by default.
>
>  ``dpdk-hugepage-dir``
>Directory where hugetlbfs is mounted
> diff --git a/NEWS b/NEWS
> index f02f07cdf..6245b28d2 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -22,6 +22,8 @@ Post-v2.15.0
> Available only if DPDK experimantal APIs enabled during the build.
>   * Add hardware offload support for VXLAN flows (experimental).
> Available only if DPDK experimantal APIs enabled during the build.
> + * EAL option --socket-mem is no longer configured by default upon
> +   start-up.
> - ovsdb-tool:
>   * New option '--election-timer' to the 'create-cluster' command to set 
> the
> leader election timer during cluster creation.
> diff --git a/lib/dpdk.c b/lib/dpdk.c
> index 2eaaa569c..1c128fca3 100644
> --- a/lib/dpdk.c
> +++ b/lib/dpdk.c
> @@ -167,7 +167,7 @@ construct_dpdk_mutex_options(const struct smap 
> *ovs_other_config,
>  {"memory type",
>   {"dpdk-alloc-mem", "dpdk-socket-mem", NULL,},
>   {"-m", "--socket-mem",NULL,},
> - default_dpdk_socket_mem, 1
> +  NULL, 0
Just nit-picking: changing from default_dpdk_socket_mem to NULL is
sort of redundant, given that you already set default_option to 0. There is
no reason to change it. But if you are going to remove it, you might as well
remove its declaration and initialization as well.
>  },
>  };
>
> diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
> index 4597a215d..52fd52ce6 100644
> --- a/vswitchd/vswitch.xml
> +++ b/vswitchd/vswitch.xml
> @@ -362,8 +362,8 @@
>preallocate 2048MB and socket 3 (no value given) to preallocate 
> 0MB.
>  
>  
> -  If dpdk-socket-mem and dpdk-alloc-mem are not specified, 
> dpdk-socket-mem
> -  will be used and the default value is 1024 for each numa node. If
> +  If dpdk-socket-mem and dpdk-alloc-mem are not specified, neither
> +  will be used and there is no default value for each numa node. If
>dpdk-socket-mem and dpdk-alloc-mem are specified at same time,
>dpdk-socket-mem will be used as default. Changing this value
>requires restarting the daemon.
> --
> 2.31.1
>

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH 2/2] dpdk: Stop configuring socket-limit with the value of socket-mem.

2021-06-30 Thread Michael Santana
On Wed, Jun 30, 2021 at 4:48 PM Rosemarie O'Riorden  wrote:
>
> From: Rosemarie O'Riorden 
>
> This change removes the automatic memory limit on start-up of OVS with
> DPDK. As DPDK supports dynamic memory allocation, there is no
> need to limit the amount of memory available, if not requested.
>
> Currently, if socket-limit is not configured, it is set to the value of
> socket-mem. With this change, the user can decide to set it or have no
> memory limit.
>
> Reported at: https://bugzilla.redhat.com/show_bug.cgi?id=1949850
> Signed-off-by: Rosemarie O'Riorden 
> ---
>  NEWS | 2 ++
>  lib/dpdk.c   | 4 
>  vswitchd/vswitch.xml | 9 -
>  3 files changed, 6 insertions(+), 9 deletions(-)
>
> diff --git a/NEWS b/NEWS
> index 6245b28d2..3d9cac918 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -24,6 +24,8 @@ Post-v2.15.0
> Available only if DPDK experimantal APIs enabled during the build.
>   * EAL option --socket-mem is no longer configured by default upon
> start-up.
> + * EAL option --socket-limit no longer takes on the value of --socket-mem
> +   by default.
> - ovsdb-tool:
>   * New option '--election-timer' to the 'create-cluster' command to set 
> the
> leader election timer during cluster creation.
> diff --git a/lib/dpdk.c b/lib/dpdk.c
> index 1c128fca3..9e217f825 100644
> --- a/lib/dpdk.c
> +++ b/lib/dpdk.c
> @@ -438,10 +438,6 @@ dpdk_init__(const struct smap *ovs_other_config)
>  break;
>  }
>  }
> -if (i < args.n - 1) {
> -svec_add(, "--socket-limit");
> -svec_add(, args.names[i + 1]);
> -}
Please remove the remainder of the outer if-block as well. Its only
purpose is to support these svec_add() calls. Given that we are
removing the add calls, the entire if-block is no longer needed.
>  }
>
>  if (args_contains(, "-c") || args_contains(, "-l")) {
> diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
> index 52fd52ce6..10d8e3d0b 100644
> --- a/vswitchd/vswitch.xml
> +++ b/vswitchd/vswitch.xml
> @@ -381,14 +381,13 @@
>0 will disable the limit for a particular socket.
>  
>  
> -  If not specified, OVS will configure limits equal to the amount of
> -  preallocated memory specified by  +  If not specified, OVS will not configure limits by default.
> +  Limits can be configured with key="dpdk-socket-mem"/> or --socket-mem in
>. If none of the above
> -  options specified or --legacy-mem provided in
> +  options are specified or --legacy-mem is provided in
>, limits will not be
> -  applied.
> -  Changing this value requires restarting the daemon.
> +  applied. Changing this value requires restarting the daemon.
>  
>
>
> --
> 2.31.1
>

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH ovn v9 3/4] northd: Add options to automatically add routes for NATs and LBs.

2021-06-30 Thread 0-day Robot
Bleep bloop.  Greetings Mark Michelson, I am a robot and I have tried out your 
patch.
Thanks for your contribution.

I encountered some error that I wasn't expecting.  See the details below.


checkpatch:
WARNING: Line is 84 characters long (recommended limit is 79)
#162 FILE: northd/ovn-northd.c:3637:
bool is_routable = smap_get_bool(>nlb->options, "add_route", 
false);

WARNING: Line has trailing whitespace
#297 FILE: ovn-nb.xml:2078:
  load balancer IP addresses to the appropriate MAC address. Setting 

WARNING: Line lacks whitespace around operator
#425 FILE: utilities/ovn-nbctl.c:370:
  [--add-route]\n\

Lines checked: 508, Warnings: 3, Errors: 0


Please check this out.  If you feel there has been an error, please email 
acon...@redhat.com

Thanks,
0-day Robot
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH ovn v9 2/4] northd: Add IP routing and ARP resolution flows for NAT/LB addresses.

2021-06-30 Thread 0-day Robot
Bleep bloop.  Greetings Mark Michelson, I am a robot and I have tried out your 
patch.
Thanks for your contribution.

I encountered some error that I wasn't expecting.  See the details below.


checkpatch:
ERROR: Inappropriate bracing around statement
#92 FILE: northd/ovn-northd.c:1493:
if (!extract_addresses(nats[i], [i], )){

WARNING: Line is 90 characters long (recommended limit is 79)
#215 FILE: northd/ovn-northd.c:10203:
ds_put_format(match, "outport == %s && "REG_NEXT_HOP_IPV4" == {", 
peer->json_key);

Lines checked: 564, Warnings: 1, Errors: 1


Please check this out.  If you feel there has been an error, please email 
acon...@redhat.com

Thanks,
0-day Robot
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH ovn v9 4/4] northd: Flood ARPs to routers for "unreachable" addresses.

2021-06-30 Thread Mark Michelson
Previously, ARP TPAs were filtered down only to "reachable" addresses.
Reachable addresses are all router interface addresses, as well as NAT
external addresses and load balancer VIPs that are within the subnet
handled by a router's port.

However, it is possible that in some configurations, CMSes purposely
configure NAT or load balancer addresses on a router that are outside
the router's subnets, and they expect the router to respond to ARPs for
those addresses.

This commit adds a higher priority flow to logical switches that makes
it so ARPs targeted at "unreachable" addresses are flooded to all ports.
This way, the ARPs can reach the router appropriately and receive a
response.

Reported at: https://bugzilla.redhat.com/show_bug.cgi?id=1929901

Signed-off-by: Mark Michelson 
---
 northd/ovn-northd.8.xml |   8 ++
 northd/ovn-northd.c | 162 +++-
 northd/ovn_northd.dl|  91 ++
 tests/ovn-northd.at |  99 
 tests/system-ovn.at | 102 +
 5 files changed, 395 insertions(+), 67 deletions(-)

diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index beaf5a183..5aedd6619 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -1587,6 +1587,14 @@ output;
 logical ports.
   
 
+  
+Priority-90 flows for each IP address/VIP/NAT address configured
+outside its owning router port's subnet. These flows match ARP
+requests and ND packets for the specific IP addresses.  Matched packets
+are forwarded to the MC_FLOOD multicast group which
+contains all connected logical ports.
+  
+
   
 Priority-75 flows for each port connected to a logical router
 matching self originated ARP request/ND packets.  These packets
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index f6fad281b..d0b325748 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -6555,38 +6555,41 @@ build_lswitch_rport_arp_req_self_orig_flow(struct 
ovn_port *op,
 ds_destroy();
 }
 
-/*
- * Ingress table 19: Flows that forward ARP/ND requests only to the routers
- * that own the addresses. Other ARP/ND packets are still flooded in the
- * switching domain as regular broadcast.
- */
 static void
-build_lswitch_rport_arp_req_flow_for_ip(struct ds *ip_match,
-int addr_family,
-struct ovn_port *patch_op,
-struct ovn_datapath *od,
-uint32_t priority,
-struct hmap *lflows,
-const struct ovsdb_idl_row *stage_hint)
+arp_nd_ns_match(struct ds *ips, int addr_family, struct ds *match)
 {
-struct ds match   = DS_EMPTY_INITIALIZER;
-struct ds actions = DS_EMPTY_INITIALIZER;
-
 /* Packets received from VXLAN tunnels have already been through the
  * router pipeline so we should skip them. Normally this is done by the
  * multicast_group implementation (VXLAN packets skip table 32 which
  * delivers to patch ports) but we're bypassing multicast_groups.
  */
-ds_put_cstr(, FLAGBIT_NOT_VXLAN " && ");
+ds_put_cstr(match, FLAGBIT_NOT_VXLAN " && ");
 
 if (addr_family == AF_INET) {
-ds_put_cstr(, "arp.op == 1 && arp.tpa == { ");
+ds_put_cstr(match, "arp.op == 1 && arp.tpa == {");
 } else {
-ds_put_cstr(, "nd_ns && nd.target == { ");
+ds_put_cstr(match, "nd_ns && nd.target == {");
 }
 
-ds_put_cstr(, ds_cstr_ro(ip_match));
-ds_put_cstr(, "}");
+ds_put_cstr(match, ds_cstr_ro(ips));
+ds_put_cstr(match, "}");
+}
+
+/*
+ * Ingress table 19: Flows that forward ARP/ND requests only to the routers
+ * that own the addresses. Other ARP/ND packets are still flooded in the
+ * switching domain as regular broadcast.
+ */
+static void
+build_lswitch_rport_arp_req_flow_for_reachable_ip(struct ds *ips,
+int addr_family, struct ovn_port *patch_op, struct ovn_datapath *od,
+uint32_t priority, struct hmap *lflows,
+const struct ovsdb_idl_row *stage_hint)
+{
+struct ds match   = DS_EMPTY_INITIALIZER;
+struct ds actions = DS_EMPTY_INITIALIZER;
+
+arp_nd_ns_match(ips, addr_family, );
 
 /* Send a the packet to the router pipeline.  If the switch has non-router
  * ports then flood it there as well.
@@ -6609,6 +6612,30 @@ build_lswitch_rport_arp_req_flow_for_ip(struct ds 
*ip_match,
 ds_destroy();
 }
 
+/*
+ * Ingress table 19: Flows that forward ARP/ND requests for "unreachable" IPs
+ * (NAT or load balancer IPs configured on a router that are outside the
+ * router's configured subnets).
+ * These ARP/ND packets are flooded in the switching domain as regular
+ * broadcast.
+ */
+static void
+build_lswitch_rport_arp_req_flow_for_unreachable_ip(struct ds *ips,
+int addr_family, struct 

[ovs-dev] [PATCH ovn v9 2/4] northd: Add IP routing and ARP resolution flows for NAT/LB addresses.

2021-06-30 Thread Mark Michelson
Dealing with NAT and load balancer IPs has been a bit of a pain point.
It requires creating static routes if east-west traffic to those
addresses is desired. Further, it requires ARPs to be sent between the
logical routers in order to create MAC Bindings.

This commit seeks to make things easier. NAT and load balancer addresses
automatically have IP routing logical flows and ARP resolution logical
flows created for reachable routers. This eliminates the need to create
static routes, and it also eliminates the need for ARPs to be sent
between logical routers.

In this commit, the behavior is not optional. The next commit will
introduce configuration to make the behavior optional.

Signed-off-by: Mark Michelson 
---
 northd/ovn-northd.c  | 129 +-
 northd/ovn_northd.dl |  57 
 tests/ovn-northd.at  | 214 +++
 3 files changed, 395 insertions(+), 5 deletions(-)

diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index 694c3b2c4..58132bc5c 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -1378,6 +1378,21 @@ build_datapaths(struct northd_context *ctx, struct hmap 
*datapaths,
 }
 }
 
+/* Structure representing logical router port
+ * routable addresses. This includes DNAT and Load Balancer
+ * addresses. This structure will only be filled in if the
+ * router port is a gateway router port. Otherwise, all pointers
+ * will be NULL and n_addrs will be 0.
+ */
+struct ovn_port_routable_addresses {
+/* Array of address strings suitable for writing to a database table */
+char **addresses;
+/* The addresses field parsed into component parts */
+struct lport_addresses *laddrs;
+/* Number of items in each of the above arrays */
+size_t n_addrs;
+};
+
 /* A logical switch port or logical router port.
  *
  * In steady state, an ovn_port points to a northbound Logical_Switch_Port
@@ -1421,6 +1436,8 @@ struct ovn_port {
 
 struct lport_addresses lrp_networks;
 
+struct ovn_port_routable_addresses routables;
+
 /* Logical port multicast data. */
 struct mcast_port_info mcast_info;
 
@@ -1447,6 +1464,44 @@ struct ovn_port {
 struct ovs_list list;   /* In list of similar records. */
 };
 
+static void
+destroy_routable_addresses(struct ovn_port_routable_addresses *ra)
+{
+for (size_t i = 0; i < ra->n_addrs; i++) {
+free(ra->addresses[i]);
+destroy_lport_addresses(>laddrs[i]);
+}
+free(ra->addresses);
+free(ra->laddrs);
+}
+
+static char **get_nat_addresses(const struct ovn_port *op, size_t *n);
+
+static void
+assign_routable_addresses(struct ovn_port *op)
+{
+size_t n;
+char **nats = get_nat_addresses(op, );
+
+if (!nats) {
+return;
+}
+
+struct lport_addresses *laddrs = xcalloc(n, sizeof(*laddrs));
+for (size_t i = 0; i < n; i++) {
+int ofs;
+if (!extract_addresses(nats[i], [i], )){
+continue;
+}
+}
+
+/* Everything seems to have worked out */
+op->routables.addresses = nats;
+op->routables.laddrs = laddrs;
+op->routables.n_addrs = n;
+}
+
+
 static void
 ovn_port_set_nb(struct ovn_port *op,
 const struct nbrec_logical_switch_port *nbsp,
@@ -1496,6 +1551,8 @@ ovn_port_destroy(struct hmap *ports, struct ovn_port 
*port)
 }
 free(port->ps_addrs);
 
+destroy_routable_addresses(>routables);
+
 destroy_lport_addresses(>lrp_networks);
 free(port->json_key);
 free(port->key);
@@ -2403,6 +2460,8 @@ join_logical_ports(struct northd_context *ctx,
  * use during flow creation. */
 od->l3dgw_port = op;
 od->l3redirect_port = crp;
+
+assign_routable_addresses(op);
 }
 }
 }
@@ -2486,7 +2545,7 @@ get_nat_addresses(const struct ovn_port *op, size_t *n)
 {
 size_t n_nats = 0;
 struct eth_addr mac;
-if (!op->nbrp || !op->od || !op->od->nbr
+if (!op || !op->nbrp || !op->od || !op->od->nbr
 || (!op->od->nbr->n_nat && !op->od->nbr->n_load_balancer)
 || !eth_addr_from_string(op->nbrp->mac, )) {
 *n = n_nats;
@@ -3067,7 +3126,6 @@ ovn_port_update_sbrec(struct northd_context *ctx,
 } else {
 sbrec_port_binding_set_options(op->sb, NULL);
 }
-
 const char *nat_addresses = smap_get(>nbsp->options,
"nat-addresses");
 size_t n_nats = 0;
@@ -3123,6 +3181,7 @@ ovn_port_update_sbrec(struct northd_context *ctx,
 if (add_router_port_garp) {
 struct ds garp_info = DS_EMPTY_INITIALIZER;
 ds_put_format(_info, "%s", op->peer->lrp_networks.ea_s);
+
 for (size_t i = 0; i < op->peer->lrp_networks.n_ipv4_addrs;
  i++) {
 ds_put_format(_info, " %s",
@@ -3139,7 +3198,6 @@ 

[ovs-dev] [PATCH ovn v9 3/4] northd: Add options to automatically add routes for NATs and LBs.

2021-06-30 Thread Mark Michelson
Load_Balancer and NAT entries have a new option, "add_route", that can be
set to automatically add routes to those addresses to neighbor routers,
therefore eliminating the need to create static routes.

Signed-off-by: Mark Michelson 
---
 northd/ovn-northd.8.xml |  7 -
 northd/ovn-northd.c | 57 +
 northd/ovn_northd.dl| 23 -
 ovn-nb.xml  | 33 +++-
 tests/ovn-nbctl.at  |  3 +++
 tests/ovn-northd.at | 40 ++---
 utilities/ovn-nbctl.c   | 25 +-
 7 files changed, 158 insertions(+), 30 deletions(-)

diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index b5c961e89..beaf5a183 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -3539,7 +3539,12 @@ outport = P
column
   of  table for of type
   dnat_and_snat, otherwise the Ethernet address of the
-  distributed logical router port.
+  distributed logical router port. Note that if the
+   is not
+  within a subnet on the owning logical router, then OVN will only
+  create ARP resolution flows if the 
+  is set to true. Otherwise, no ARP resolution flows
+  will be added.
 
 
 
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index 58132bc5c..f6fad281b 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -662,8 +662,14 @@ struct ovn_datapath {
 struct lport_addresses dnat_force_snat_addrs;
 struct lport_addresses lb_force_snat_addrs;
 bool lb_force_snat_router_ip;
+/* The "routable" ssets are subsets of the load balancer
+ * IPs for which IP routes and ARP resolution flows are automatically
+ * added
+ */
 struct sset lb_ips_v4;
+struct sset lb_ips_v4_routable;
 struct sset lb_ips_v6;
+struct sset lb_ips_v6_routable;
 
 struct ovn_port **localnet_ports;
 size_t n_localnet_ports;
@@ -834,7 +840,9 @@ static void
 init_lb_ips(struct ovn_datapath *od)
 {
 sset_init(&od->lb_ips_v4);
+sset_init(&od->lb_ips_v4_routable);
 sset_init(&od->lb_ips_v6);
+sset_init(&od->lb_ips_v6_routable);
 }
 
 static void
@@ -845,7 +853,9 @@ destroy_lb_ips(struct ovn_datapath *od)
 }
 
 sset_destroy(&od->lb_ips_v4);
+sset_destroy(&od->lb_ips_v4_routable);
 sset_destroy(&od->lb_ips_v6);
+sset_destroy(&od->lb_ips_v6_routable);
 }
 
 /* A group of logical router datapaths which are connected - either
@@ -1475,13 +1485,14 @@ destroy_routable_addresses(struct 
ovn_port_routable_addresses *ra)
 free(ra->laddrs);
 }
 
-static char **get_nat_addresses(const struct ovn_port *op, size_t *n);
+static char **get_nat_addresses(const struct ovn_port *op, size_t *n,
+bool routable_only);
 
 static void
 assign_routable_addresses(struct ovn_port *op)
 {
 size_t n;
-char **nats = get_nat_addresses(op, &n);
+char **nats = get_nat_addresses(op, &n, true);
 
 if (!nats) {
 return;
@@ -2541,7 +2552,7 @@ join_logical_ports(struct northd_context *ctx,
  * The caller must free each of the n returned strings with free(),
  * and must free the returned array when it is no longer needed. */
 static char **
-get_nat_addresses(const struct ovn_port *op, size_t *n)
+get_nat_addresses(const struct ovn_port *op, size_t *n, bool routable_only)
 {
 size_t n_nats = 0;
 struct eth_addr mac;
@@ -2564,6 +2575,12 @@ get_nat_addresses(const struct ovn_port *op, size_t *n)
 const struct nbrec_nat *nat = op->od->nbr->nat[i];
 ovs_be32 ip, mask;
 
+if (routable_only &&
+(!strcmp(nat->type, "snat") ||
+ !smap_get_bool(&nat->options, "add_route", false))) {
+continue;
+}
+
 char *error = ip_parse_masked(nat->external_ip, &ip, &mask);
 if (error || mask != OVS_BE32_MAX) {
 free(error);
@@ -2615,13 +2632,24 @@ get_nat_addresses(const struct ovn_port *op, size_t *n)
 }
 
 const char *ip_address;
-SSET_FOR_EACH (ip_address, >od->lb_ips_v4) {
-ds_put_format(_addresses, " %s", ip_address);
-central_ip_address = true;
-}
-SSET_FOR_EACH (ip_address, >od->lb_ips_v6) {
-ds_put_format(_addresses, " %s", ip_address);
-central_ip_address = true;
+if (routable_only) {
+SSET_FOR_EACH (ip_address, >od->lb_ips_v4_routable) {
+ds_put_format(_addresses, " %s", ip_address);
+central_ip_address = true;
+}
+SSET_FOR_EACH (ip_address, >od->lb_ips_v6_routable) {
+ds_put_format(_addresses, " %s", ip_address);
+central_ip_address = true;
+}
+} else {
+SSET_FOR_EACH (ip_address, >od->lb_ips_v4) {
+ds_put_format(_addresses, " %s", ip_address);
+central_ip_address = true;
+}
+SSET_FOR_EACH (ip_address, >od->lb_ips_v6) {
+ds_put_format(_addresses, " %s", ip_address);
+

[ovs-dev] [PATCH ovn v9 1/4] northd: Factor peer retrieval into its own function.

2021-06-30 Thread Mark Michelson
The same pattern is repeated several times throughout ovn-northd.c, so
this puts it in its own function. This will be used even more in an
upcoming commit.

Signed-off-by: Mark Michelson 
---
 northd/ovn-northd.c | 70 -
 1 file changed, 24 insertions(+), 46 deletions(-)

diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index 83746f4ab..694c3b2c4 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -1571,6 +1571,21 @@ lrport_is_enabled(const struct nbrec_logical_router_port 
*lrport)
 return !lrport->enabled || *lrport->enabled;
 }
 
+static struct ovn_port *
+ovn_port_get_peer(struct hmap *ports, struct ovn_port *op)
+{
+if (!op->nbsp || !lsp_is_router(op->nbsp) || op->derived) {
+return NULL;
+}
+
+const char *peer_name = smap_get(&op->nbsp->options, "router-port");
+if (!peer_name) {
+return NULL;
+}
+
+return ovn_port_find(ports, peer_name);
+}
+
 static void
 ipam_insert_ip_for_datapath(struct ovn_datapath *od, uint32_t ip)
 {
@@ -2398,12 +2413,7 @@ join_logical_ports(struct northd_context *ctx,
 struct ovn_port *op;
 HMAP_FOR_EACH (op, key_node, ports) {
 if (op->nbsp && lsp_is_router(op->nbsp) && !op->derived) {
-const char *peer_name = smap_get(&op->nbsp->options, 
"router-port");
-if (!peer_name) {
-continue;
-}
-
-struct ovn_port *peer = ovn_port_find(ports, peer_name);
+struct ovn_port *peer = ovn_port_get_peer(ports, op);
 if (!peer || !peer->nbrp) {
 continue;
 }
@@ -10206,14 +10216,8 @@ build_arp_resolve_flows_for_lrouter_port(
 /* Get the Logical_Router_Port that the
  * Logical_Switch_Port is connected to, as
  * 'peer'. */
-const char *peer_name = smap_get(
-&op->od->router_ports[k]->nbsp->options,
-"router-port");
-if (!peer_name) {
-continue;
-}
-
-struct ovn_port *peer = ovn_port_find(ports, peer_name);
+struct ovn_port *peer = ovn_port_get_peer(
+ports, op->od->router_ports[k]);
 if (!peer || !peer->nbrp) {
 continue;
 }
@@ -10243,14 +10247,8 @@ build_arp_resolve_flows_for_lrouter_port(
 /* Get the Logical_Router_Port that the
  * Logical_Switch_Port is connected to, as
  * 'peer'. */
-const char *peer_name = smap_get(
-&op->od->router_ports[k]->nbsp->options,
-"router-port");
-if (!peer_name) {
-continue;
-}
-
-struct ovn_port *peer = ovn_port_find(ports, peer_name);
+struct ovn_port *peer = ovn_port_get_peer(
+ports, op->od->router_ports[k]);
 if (!peer || !peer->nbrp) {
 continue;
 }
@@ -10298,14 +10296,8 @@ build_arp_resolve_flows_for_lrouter_port(
 !op->sb->chassis) {
 /* The virtual port is not claimed yet. */
 for (size_t i = 0; i < op->od->n_router_ports; i++) {
-const char *peer_name = smap_get(
-&op->od->router_ports[i]->nbsp->options,
-"router-port");
-if (!peer_name) {
-continue;
-}
-
-struct ovn_port *peer = ovn_port_find(ports, peer_name);
+struct ovn_port *peer = ovn_port_get_peer(
+ports, op->od->router_ports[i]);
 if (!peer || !peer->nbrp) {
 continue;
 }
@@ -10340,15 +10332,8 @@ build_arp_resolve_flows_for_lrouter_port(
 /* Get the Logical_Router_Port that the
 * Logical_Switch_Port is connected to, as
 * 'peer'. */
-const char *peer_name = smap_get(
-&vp->od->router_ports[j]->nbsp->options,
-"router-port");
-if (!peer_name) {
-continue;
-}
-
 struct ovn_port *peer =
-ovn_port_find(ports, peer_name);
+ovn_port_get_peer(ports, vp->od->router_ports[j]);
 if (!peer || !peer->nbrp) {
 continue;
 }
@@ -10385,14 +10370,7 @@ build_arp_resolve_flows_for_lrouter_port(
  * we need to add logical flows such that it can resolve
  * ARP entries for all the other router ports connected to
  * the switch in question. */
-
-const char *peer_name 

[ovs-dev] [PATCH ovn v9 0/4] ARP and Floating IP Fixes

2021-06-30 Thread Mark Michelson
This patch series aims to fix issues seen in OpenStack deployments when
floating IPs were assigned to routers, and those floating IPs were not
part of any subnet configured on that router.

This series has gone through several transformations, but this
incarnation is a four patch series:

Patch 1 is a small cleanup in ovn-northd.c to factor out peer retrieval
into its own function.

Patch 2 alters northd to install logical flows to make it so that
routers can reach NAT and load balancer addresses on their neighbors
without the need to configure static routes or MAC bindings.

Patch 3 recognizes that patch 2 may not always be desired, so it makes
the behavior opt-in.

Finally, patch 4 addresses the situation for when the pre-allocated
logical flows cannot be used. For this situation, we will flood the ARP
request if the TPA is for a configured IP address that is outside the
connected routers' subnets.
---
v8 -> v9
* Rebased
* Added missing documentation to ovn-northd.8.xml
* Fixed memory leaks in northd.c
---

Mark Michelson (4):
  northd: Factor peer retrieval into its own function.
  northd: Add IP routing and ARP resolution flows for NAT/LB addresses.
  northd: Add options to automatically add routes for NATs and LBs.
  northd: Flood ARPs to routers for "unreachable" addresses.

 northd/ovn-northd.8.xml |  15 +-
 northd/ovn-northd.c | 414 +---
 northd/ovn_northd.dl| 169 +---
 ovn-nb.xml  |  33 +++-
 tests/ovn-nbctl.at  |   3 +
 tests/ovn-northd.at | 345 +
 tests/system-ovn.at | 102 ++
 utilities/ovn-nbctl.c   |  25 ++-
 8 files changed, 965 insertions(+), 141 deletions(-)

-- 
2.31.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH ovn v2] ovn-controller: Fix port group I-P when they contain non-vif ports.

2021-06-30 Thread Han Zhou
On Wed, Jun 30, 2021 at 7:01 AM Dumitru Ceara  wrote:
>
> It's valid that port_groups contain non-vif ports, they can actually
> contain any type of logical_switch_port.
>
> Also, there's no need to allocate a new sset containing the local ports'
> names every time the I-P engine processes a change.  We were already
> maintaining a set of "local_lport_ids".  These correspond to port
> bindings that are relevant locally (including non-vif ports).  Extend
> it to include the locally relevant lport names too and rename the
> structure and its helper functions to related_lport*.
>
> Reported-at: https://github.com/ovn-org/ovn/pull/61#issuecomment-865094163
> Reported-by: Antonio Ojea 
> Fixes: 0cfeba6b55e3 ("ovn-controller: Fix port group conjunction flow
explosion problem.")
> Signed-off-by: Dumitru Ceara 
> ---
> v2:
> - Addressed Numan's and Han's comments:
>   - add struct related_lports
>   - add test case.
> ---
>  controller/binding.c| 79 ++---
>  controller/binding.h| 31 ---
>  controller/lflow.c  |  2 +-
>  controller/lflow.h  |  2 +-
>  controller/ovn-controller.c | 48 +-
>  tests/ovn.at| 44 +
>  6 files changed, 120 insertions(+), 86 deletions(-)
>
> diff --git a/controller/binding.c b/controller/binding.c
> index 7fde0fdbb..594babc98 100644
> --- a/controller/binding.c
> +++ b/controller/binding.c
> @@ -531,38 +531,41 @@ remove_local_lports(const char *iface_id, struct
binding_ctx_out *b_ctx)
>  }
>  }
>
> -/* Add a port binding ID (of the form "dp-key"_"port-key") to the set of
local
> - * lport IDs. Also track if the set has changed.
> +/* Add a port binding to the set of locally relevant lports.
> + * Also track if the set has changed.
>   */
>  static void
> -update_local_lport_ids(const struct sbrec_port_binding *pb,
> -   struct binding_ctx_out *b_ctx)
> +update_related_lport(const struct sbrec_port_binding *pb,
> + struct binding_ctx_out *b_ctx)
>  {
>  char buf[16];
>  get_unique_lport_key(pb->datapath->tunnel_key, pb->tunnel_key,
>   buf, sizeof(buf));
> -if (sset_add(b_ctx->local_lport_ids, buf) != NULL) {
> -b_ctx->local_lport_ids_changed = true;
> +if (sset_add(&b_ctx->related_lports->lport_ids, buf) != NULL) {
> +b_ctx->related_lports_changed = true;
>
>  if (b_ctx->tracked_dp_bindings) {
>  /* Add the 'pb' to the tracked_datapaths. */
>  tracked_binding_datapath_lport_add(pb,
b_ctx->tracked_dp_bindings);
>  }
>  }
> +sset_add(&b_ctx->related_lports->lport_names, pb->logical_port);
>  }
>
> -/* Remove a port binding id from the set of local lport IDs. Also track
if
> - * the set has changed.
> +/* Remove a port binding id from the set of locally relevant lports.
> + * Also track if the set has changed.
>   */
>  static void
> -remove_local_lport_ids(const struct sbrec_port_binding *pb,
> -   struct binding_ctx_out *b_ctx)
> +remove_related_lport(const struct sbrec_port_binding *pb,
> + struct binding_ctx_out *b_ctx)
>  {
>  char buf[16];
>  get_unique_lport_key(pb->datapath->tunnel_key, pb->tunnel_key,
>   buf, sizeof(buf));
> -if (sset_find_and_delete(b_ctx->local_lport_ids, buf)) {
> -b_ctx->local_lport_ids_changed = true;
> +sset_find_and_delete(&b_ctx->related_lports->lport_names,
> + pb->logical_port);
> +if (sset_find_and_delete(&b_ctx->related_lports->lport_ids, buf)) {
> +b_ctx->related_lports_changed = true;
>
>  if (b_ctx->tracked_dp_bindings) {
>  /* Add the 'pb' to the tracked_datapaths. */
> @@ -678,6 +681,20 @@ static struct binding_lport
*binding_lport_check_and_cleanup(
>
>  static char *get_lport_type_str(enum en_lport_type lport_type);
>
> +void
> +related_lports_init(struct related_lports *rp)
> +{
> +sset_init(&rp->lport_names);
> +sset_init(&rp->lport_ids);
> +}
> +
> +void
> +related_lports_destroy(struct related_lports *rp)
> +{
> +sset_destroy(&rp->lport_names);
> +sset_destroy(&rp->lport_ids);
> +}
> +
>  void
>  local_binding_data_init(struct local_binding_data *lbinding_data)
>  {
> @@ -1172,7 +1189,7 @@ release_binding_lport(const struct sbrec_chassis
*chassis_rec,
>struct binding_ctx_out *b_ctx_out)
>  {
>  if (is_binding_lport_this_chassis(b_lport, chassis_rec)) {
> -remove_local_lport_ids(b_lport->pb, b_ctx_out);
> +remove_related_lport(b_lport->pb, b_ctx_out);
>  if (!release_lport(b_lport->pb, sb_readonly,
> b_ctx_out->tracked_dp_bindings,
> b_ctx_out->if_mgr)) {
> @@ -1214,7 +1231,7 @@ consider_vif_lport_(const struct sbrec_port_binding
*pb,
> pb->datapath, false,
> 

Re: [ovs-dev] [PATCH ovn 1/3] system-ovn.at: fix typo

2021-06-30 Thread Numan Siddique
On Tue, Jun 22, 2021 at 1:17 AM Han Zhou  wrote:
>
> Acked-by: Han Zhou 

Thanks,  I applied this patch.

Numan

>
> On Sat, Jun 19, 2021 at 2:51 AM Mark Gray  wrote:
>
> > Signed-off-by: Mark Gray 
> > ---
> >  tests/system-ovn.at | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/tests/system-ovn.at b/tests/system-ovn.at
> > index 1b8bb3803def..552fdae52665 100644
> > --- a/tests/system-ovn.at
> > +++ b/tests/system-ovn.at
> > @@ -23,7 +23,7 @@ start_daemon ovn-controller
> >
> >  # Logical network:
> >  # Two LRs - R1 and R2 that are connected to each other via LS "join"
> > -# in 20.0.0.0/24 network. R1 has switchess foo (192.168.1.0/24) and
> > +# in 20.0.0.0/24 network. R1 has switches foo (192.168.1.0/24) and
> >  # bar (192.168.2.0/24) connected to it. R2 has alice (172.16.1.0/24)
> > connected
> >  # to it.  R2 is a gateway router on which we add NAT rules.
> >  #
> > --
> > 2.27.0
> >
> > ___
> > dev mailing list
> > d...@openvswitch.org
> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> >
> ___
> dev mailing list
> d...@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 2/2] dpdk: Stop configuring socket-limit with the value of socket-mem.

2021-06-30 Thread Rosemarie O'Riorden
From: Rosemarie O'Riorden 

This change removes the automatic memory limit on start-up of OVS with
DPDK. As DPDK supports dynamic memory allocation, there is no
need to limit the amount of memory available, if not requested.

Currently, if socket-limit is not configured, it is set to the value of
socket-mem. With this change, the user can decide to set it or have no
memory limit.

Reported at: https://bugzilla.redhat.com/show_bug.cgi?id=1949850
Signed-off-by: Rosemarie O'Riorden 
---
 NEWS | 2 ++
 lib/dpdk.c   | 4 
 vswitchd/vswitch.xml | 9 -
 3 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/NEWS b/NEWS
index 6245b28d2..3d9cac918 100644
--- a/NEWS
+++ b/NEWS
@@ -24,6 +24,8 @@ Post-v2.15.0
Available only if DPDK experimantal APIs enabled during the build.
  * EAL option --socket-mem is no longer configured by default upon
start-up.
+ * EAL option --socket-limit no longer takes on the value of --socket-mem
+   by default.
- ovsdb-tool:
  * New option '--election-timer' to the 'create-cluster' command to set the
leader election timer during cluster creation.
diff --git a/lib/dpdk.c b/lib/dpdk.c
index 1c128fca3..9e217f825 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -438,10 +438,6 @@ dpdk_init__(const struct smap *ovs_other_config)
 break;
 }
 }
-if (i < args.n - 1) {
-svec_add(&args, "--socket-limit");
-svec_add(&args, args.names[i + 1]);
-}
 }
 
 if (args_contains(&args, "-c") || args_contains(&args, "-l")) {
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 52fd52ce6..10d8e3d0b 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -381,14 +381,13 @@
   0 will disable the limit for a particular socket.
 
 
-  If not specified, OVS will configure limits equal to the amount of
-  preallocated memory specified by  or --socket-mem in
   . If none of the above
-  options specified or --legacy-mem provided in
+  options are specified or --legacy-mem is provided in
   , limits will not be
-  applied.
-  Changing this value requires restarting the daemon.
+  applied. Changing this value requires restarting the daemon.
 
   
 
-- 
2.31.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 1/2] dpdk: Remove default values for socket-mem and limit.

2021-06-30 Thread Rosemarie O'Riorden
From: Rosemarie O'Riorden 

This change removes the default values for EAL args socket-mem and
socket-limit. As DPDK supports dynamic memory allocation, there is no
need to allocate a certain amount of memory on start-up, nor limit the
amount of memory available, if not requested.

Currently, socket-mem has a default value of 1024 when it is not
configured by the user, and socket-limit takes on the value of socket-mem,
1024, by default. With this change, socket-mem is not configured by default,
meaning that socket-limit is not either. Neither, either or both options can be 
set.

Reported at: https://bugzilla.redhat.com/show_bug.cgi?id=1949850
Signed-off-by: Rosemarie O'Riorden 
---
 Documentation/intro/install/dpdk.rst | 3 +--
 NEWS | 2 ++
 lib/dpdk.c   | 2 +-
 vswitchd/vswitch.xml | 4 ++--
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/Documentation/intro/install/dpdk.rst 
b/Documentation/intro/install/dpdk.rst
index 612f2fdbc..92d1c8119 100644
--- a/Documentation/intro/install/dpdk.rst
+++ b/Documentation/intro/install/dpdk.rst
@@ -290,8 +290,7 @@ listed below. Defaults will be provided for all values not 
explicitly set.
 
 ``dpdk-socket-mem``
   Comma separated list of memory to pre-allocate from hugepages on specific
-  sockets. If not specified, 1024 MB will be set for each numa node by
-  default.
+  sockets. If not specified, this option will not be set by default.
 
 ``dpdk-hugepage-dir``
   Directory where hugetlbfs is mounted
diff --git a/NEWS b/NEWS
index f02f07cdf..6245b28d2 100644
--- a/NEWS
+++ b/NEWS
@@ -22,6 +22,8 @@ Post-v2.15.0
Available only if DPDK experimantal APIs enabled during the build.
  * Add hardware offload support for VXLAN flows (experimental).
Available only if DPDK experimantal APIs enabled during the build.
+ * EAL option --socket-mem is no longer configured by default upon
+   start-up.
- ovsdb-tool:
  * New option '--election-timer' to the 'create-cluster' command to set the
leader election timer during cluster creation.
diff --git a/lib/dpdk.c b/lib/dpdk.c
index 2eaaa569c..1c128fca3 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -167,7 +167,7 @@ construct_dpdk_mutex_options(const struct smap 
*ovs_other_config,
 {"memory type",
  {"dpdk-alloc-mem", "dpdk-socket-mem", NULL,},
  {"-m", "--socket-mem",NULL,},
- default_dpdk_socket_mem, 1
+  NULL, 0
 },
 };
 
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 4597a215d..52fd52ce6 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -362,8 +362,8 @@
   preallocate 2048MB and socket 3 (no value given) to preallocate 0MB.
 
 
-  If dpdk-socket-mem and dpdk-alloc-mem are not specified, 
dpdk-socket-mem
-  will be used and the default value is 1024 for each numa node. If
+  If dpdk-socket-mem and dpdk-alloc-mem are not specified, neither
+  will be used and there is no default value for each numa node. If
   dpdk-socket-mem and dpdk-alloc-mem are specified at same time,
   dpdk-socket-mem will be used as default. Changing this value
   requires restarting the daemon.
-- 
2.31.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 0/2] Stop configuring '--socket-mem'/'--socket-limit' by default for DPDK if not requested.

2021-06-30 Thread Rosemarie O'Riorden
From: Rosemarie O'Riorden 

Currently, there is a default value of 1024 for socket-mem if not
configured. socket-limit automatically takes on the value of socket-mem
unless otherwise specified. With these changes, memory allocation will
be dynamically managed by DPDK, meaning that by default,  no memory will
be pre-allocated on startup, and there will be no limit to how much
memory can be used. Either or both of these values can be set by the
user.

The EAL arguments will look like this:

- dpdk-socket-mem=(unset), dpdk-socket-limit=(unset)
  current: "--socket-mem=1024,1024 --socket-limit=1024,1024"
  patch 1: ""
  patch 2: ""

- dpdk-socket-mem=MEM, dpdk-socket-limit=(unset)
  current: "--socket-mem=MEM --socket-limit=MEM"
  patch 1: "--socket-mem=MEM --socket-limit=MEM"
  patch 2: "--socket-mem=MEM"

- dpdk-socket-mem=(unset), dpdk-socket-limit=LIMIT
  current: "--socket-mem=1024,1024 --socket-limit=LIMIT"
  patch 1: "--socket-limit=LIMIT"
  patch 2: "--socket-limit=LIMIT"

- dpdk-socket-mem=MEM, dpdk-socket-limit=LIMIT
  current: "--socket-mem=MEM --socket-limit=LIMIT"
  patch 1: "--socket-mem=MEM --socket-limit=LIMIT"
  patch 2: "--socket-mem=MEM --socket-limit=LIMIT"

Rosemarie O'Riorden (2):
  dpdk: Remove default values for socket-mem and limit.
  dpdk: Stop configuring socket-limit with the value of socket-mem.

 Documentation/intro/install/dpdk.rst |  3 +--
 NEWS |  4 
 lib/dpdk.c   |  6 +-
 vswitchd/vswitch.xml | 13 ++---
 4 files changed, 12 insertions(+), 14 deletions(-)

-- 
2.31.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [v4 11/12] dpif-netdev/mfex: add more AVX512 traffic profiles

2021-06-30 Thread Flavio Leitner
Hi,

On Thu, Jun 17, 2021 at 09:57:53PM +0530, Kumar Amber wrote:
> From: Harry van Haaren 
> 
> This commit adds 3 new traffic profile implementations to the
> existing avx512 miniflow extract infrastructure. The profiles added are:
> - Ether()/IP()/TCP()
> - Ether()/Dot1Q()/IP()/UDP()
> - Ether()/Dot1Q()/IP()/TCP()
> 
> The design of the avx512 code here is for scalability to add more
> traffic profiles, as well as enabling CPU ISA. Note that an implementation
> is primarily adding static const data, which the compiler then specializes
> away when the profile specific function is declared below.
> 
> As a result, the code is relatively maintainable, and scalable for new
> traffic profiles as well as new ISA, and does not lower performance
> compared with manually written code for each profile/ISA.
> 
> Note that confidence in the correctness of each implementation is
> achieved through autovalidation, unit tests with known packets, and
> fuzz tested packets.
> 
> Signed-off-by: Harry van Haaren 
> 
> ---
> 
> Hi Readers,
> 
> If you have a traffic profile you'd like to see accelerated using
> avx512 code, please send me an email and we can collaborate on adding
> support for it!
> 
> Regards, -Harry
> ---
>  lib/dpif-netdev-extract-avx512.c  | 155 ++
>  lib/dpif-netdev-private-extract.c |  31 ++
>  lib/dpif-netdev-private-extract.h |   4 +
>  3 files changed, 190 insertions(+)
> 
> diff --git a/lib/dpif-netdev-extract-avx512.c 
> b/lib/dpif-netdev-extract-avx512.c
> index 1145ac8a9..0e0f6e295 100644
> --- a/lib/dpif-netdev-extract-avx512.c
> +++ b/lib/dpif-netdev-extract-avx512.c
> @@ -117,6 +117,13 @@ _mm512_maskz_permutexvar_epi8_wrap(__mmask64 kmask, 
> __m512i idx, __m512i a)
>  
>  #define PATTERN_ETHERTYPE_MASK PATTERN_ETHERTYPE_GEN(0xFF, 0xFF)
>  #define PATTERN_ETHERTYPE_IPV4 PATTERN_ETHERTYPE_GEN(0x08, 0x00)
> +#define PATTERN_ETHERTYPE_DT1Q PATTERN_ETHERTYPE_GEN(0x81, 0x00)
> +
> +/* VLAN (Dot1Q) patterns and masks. */
> +#define PATTERN_DT1Q_MASK   \
> +  0x00, 0x00, 0xFF, 0xFF,
> +#define PATTERN_DT1Q_IPV4   \
> +  0x00, 0x00, 0x08, 0x00,
>  
>  /* Generator for checking IPv4 ver, ihl, and proto */
>  #define PATTERN_IPV4_GEN(VER_IHL, FLAG_OFF_B0, FLAG_OFF_B1, PROTO) \
> @@ -142,6 +149,29 @@ _mm512_maskz_permutexvar_epi8_wrap(__mmask64 kmask, 
> __m512i idx, __m512i a)
>34, 35, 36, 37, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, /* UDP */  
>  \
>NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, /* Unused. 
> */
>  
> +/* TCP shuffle: tcp_ctl bits require mask/processing, not included here. */
> +#define PATTERN_IPV4_TCP_SHUFFLE \
> +   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, NU, NU, /* Ether 
> */ \
> +  26, 27, 28, 29, 30, 31, 32, 33, NU, NU, NU, NU, 20, 15, 22, 23, /* IPv4 */ 
>  \
> +  NU, NU, NU, NU, NU, NU, NU, NU, 34, 35, 36, 37, NU, NU, NU, NU, /* TCP */  
>  \
> +  NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, /* Unused. 
> */
> +
> +#define PATTERN_DT1Q_IPV4_UDP_SHUFFLE
>  \
> +  /* Ether (2 blocks): Note that *VLAN* type is written here. */ 
>  \
> +  0,  1,  2,  3,  4,  5,  6,  7, 8,  9, 10, 11, 16, 17,  0,  0,  
>  \
> +  /* VLAN (1 block): Note that the *EtherHdr->Type* is written here. */  
>  \
> +  12, 13, 14, 15, 0, 0, 0, 0,
>  \
> +  30, 31, 32, 33, 34, 35, 36, 37, 0, 0, 0, 0, 24, 19, 26, 27, /* IPv4 */ 
>  \
> +  38, 39, 40, 41, NU, NU, NU, NU, /* UDP */
> +
> +#define PATTERN_DT1Q_IPV4_TCP_SHUFFLE
>  \
> +  /* Ether (2 blocks): Note that *VLAN* type is written here. */ 
>  \
> +  0,  1,  2,  3,  4,  5,  6,  7, 8,  9, 10, 11, 16, 17,  0,  0,  
>  \
> +  /* VLAN (1 block): Note that the *EtherHdr->Type* is written here. */  
>  \
> +  12, 13, 14, 15, 0, 0, 0, 0,
>  \
> +  30, 31, 32, 33, 34, 35, 36, 37, 0, 0, 0, 0, 24, 19, 26, 27, /* IPv4 */ 
>  \
> +  NU, NU, NU, NU, NU, NU, NU, NU, 38, 39, 40, 41, NU, NU, NU, NU, /* TCP */  
>  \
> +  NU, NU, NU, NU, NU, NU, NU, NU, /* Unused. */
>  
>  /* Generation of K-mask bitmask values, to zero out data in result. Note that
>   * these correspond 1:1 to the above "*_SHUFFLE" values, and bit used must be
> @@ -151,12 +181,22 @@ _mm512_maskz_permutexvar_epi8_wrap(__mmask64 kmask, 
> __m512i idx, __m512i a)
>   * Note the ULL suffix allows shifting by 32 or more without integer 
> overflow.
>   */
>  #define KMASK_ETHER 0x1FFFULL
> +#define KMASK_DT1Q  0x000FULL
>  #define KMASK_IPV4  0xF0FFULL
>  #define KMASK_UDP   0x000FULL
> +#define KMASK_TCP   0x0F00ULL
>  
>  #define PATTERN_IPV4_UDP_KMASK \
>  (KMASK_ETHER | (KMASK_IPV4 << 16) | (KMASK_UDP << 32))
>  
> +#define PATTERN_IPV4_TCP_KMASK \
> +   

Re: [ovs-dev] [PATCH net-next] openvswitch: Introduce per-cpu upcall dispatch

2021-06-30 Thread kernel test robot
Hi Mark,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Mark-Gray/openvswitch-Introduce-per-cpu-upcall-dispatch/20210630-175435
base:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 
b6df00789e2831fff7a2c65aa7164b2a4dcbe599
config: sh-randconfig-s032-20210630 (attached as .config)
compiler: sh4-linux-gcc (GCC) 9.3.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# apt-get install sparse
# sparse version: v0.6.3-341-g8af24329-dirty
# 
https://github.com/0day-ci/linux/commit/24234cf5ef3fdbb1cad04189ba6459292f84180e
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review 
Mark-Gray/openvswitch-Introduce-per-cpu-upcall-dispatch/20210630-175435
git checkout 24234cf5ef3fdbb1cad04189ba6459292f84180e
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross C=1 
CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=sh 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 


sparse warnings: (new ones prefixed by >>)
>> net/openvswitch/datapath.c:169:17: sparse: sparse: incorrect type in 
>> argument 1 (different address spaces) @@ expected void const * @@ 
>> got struct dp_nlsk_pids [noderef] __rcu *upcall_portids @@
   net/openvswitch/datapath.c:169:17: sparse: expected void const *
   net/openvswitch/datapath.c:169:17: sparse: got struct dp_nlsk_pids 
[noderef] __rcu *upcall_portids

vim +169 net/openvswitch/datapath.c

   160  
   161  static void destroy_dp_rcu(struct rcu_head *rcu)
   162  {
   163  struct datapath *dp = container_of(rcu, struct datapath, rcu);
   164  
   165  ovs_flow_tbl_destroy(>table);
   166  free_percpu(dp->stats_percpu);
   167  kfree(dp->ports);
   168  ovs_meters_exit(dp);
 > 169  kfree(dp->upcall_portids);
   170  kfree(dp);
   171  }
   172  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH ovn v8 0/6] ARP and Floating IP Fixes

2021-06-30 Thread Numan Siddique
On Thu, Jun 3, 2021 at 2:50 PM Mark Michelson  wrote:
>
> This patch series aims to fix issues seen in OpenStack deployments when
> floating IPs were assigned to routers, and those floating IPs were not
> part of any subnet configured on that router.
>
> Originally, this was a two patch series but it has bloomed into a 5
> patch series. After v7, a new approach was suggested and it's now 6
> patches.
>
> Patch 1 fixes the scenario where a VM attempts to reach a floating IP on
> the directly connected router. This has been part of this patch series
> since v1.
>
> Patch 2 is an incidental fix that removes a redundant paragraph from
> documentation.
>
> Patch 3 is a small cleanup in ovn-northd.c to factor out peer retrieval
> into its own function.
>
> Patch 4 alters northd to install logical flows to make it so that
> routers can reach NAT and load balancer addresses on their neighbors
> without the need to configure static routes or MAC bindings.
>
> Patch 5 recognizes that patch 4 may not always be desired, so it makes
> the behavior opt-in.
>
> Finally, patch 6  addresses the situation for when the pre-allocated
> logical flows cannot be used. For this situation, we will flood the ARP
> request if the TPA is for a configured IP address that is outside the
> connected routers' subnets.
> ---
> v7 -> v8: First 2 patches are the same as they have been in previous
> versions. Patch 6 is nearly identical to patch 5 from previous versions.
> Patches 3, 4, and 5 represent a completely new approach to solving the
> issue from before. And that's also why previous versions are not
> documented here.
> ---
> Mark Michelson (6):
>   northd: Swap src and dst eth addresses in router egress loop.
>   ovn-sb: Remove redundant "nat-addresses" information from
> Port_Binding.
>   northd: Factor peer retrieval into its own function.
>   northd: Add IP routing and ARP resolution flows for NAT/LB addresses.
>   northd: Add options to automatically add routes for NATs and LBs.
>   northd: Flood ARPs to routers for "unreachable" addresses.

Hi Mark,

I applied the first 2 patches of this series to master and backported
the first patch to branch-21.06.

I think the first patch is required to address the issue reported by
Brendan Doyle.
The first patch doesn't apply cleanly to other branches.  Can you
please take a  look and try to backport
to other branches ? I think this fix is required.


I provided a few comments to patch 4 earlier.

Thanks
Numan

>
>  northd/ovn-northd.8.xml |   8 +
>  northd/ovn-northd.c | 380 +---
>  northd/ovn_northd.dl| 180 ---
>  ovn-nb.xml  |  29 ++-
>  ovn-sb.xml  |  10 --
>  tests/ovn-nbctl.at  |   3 +
>  tests/ovn-northd.at | 345 
>  tests/system-ovn.at | 215 ++-
>  utilities/ovn-nbctl.c   |  25 ++-
>  9 files changed, 1043 insertions(+), 152 deletions(-)
>
> --
> 2.31.1
>
> ___
> dev mailing list
> d...@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [v4 11/12] dpif-netdev/mfex: add more AVX512 traffic profiles

2021-06-30 Thread Van Haaren, Harry
> -Original Message-
> From: Amber, Kumar 
> Sent: Wednesday, June 30, 2021 4:10 PM
> To: Eelco Chaudron ; Van Haaren, Harry
> 
> Cc: d...@openvswitch.org; i.maxim...@ovn.org; Flavio Leitner 
> ;
> Stokes, Ian 
> Subject: RE: [ovs-dev] [v4 11/12] dpif-netdev/mfex: add more AVX512 traffic 
> profiles
> 
> Hi Eelco,
> 
> Pls find my comments inline.

Snip away all except the "callbacks" topic.

> > NIT: As we might continue to add variants, would a callback in the profile 
> > be
> > cleaner? Not sure what arguments to pass? Just a thought…
> 
> Nice thought we have patch for IPv6 coming up we can surely explore the idea 

Callbacks can be more difficult for the compiler to inline.
A direct function call is always taken, and compilers know this.

A function pointer requires a little more "compiler knowledge"
to successfully inline the actual function. (GCC can do it; it's been
tested before.)

I'm not seeing a better solution overall however, as that function pointer
will still need to be set based on each profile... so we still have a switch()
with all profiles inside it.

We could abstract each specific profile away to a profile-specific helper 
function,
however that would cause an explosion of functions, instead of some code inside
a switch statement.

For now, let's leave it as is; no need for churn at this point. If the list
really gets unmanageable, we can review again in the future when adding more
impls.

Thanks for feedback, -Harry
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [v4 10/12] dpif-netdev/mfex: Add AVX512 based optimized miniflow extract

2021-06-30 Thread Van Haaren, Harry
> -Original Message-
> From: Eelco Chaudron 
> Sent: Wednesday, June 30, 2021 3:35 PM
> To: Van Haaren, Harry 
> Cc: Amber, Kumar ; d...@openvswitch.org;
> i.maxim...@ovn.org; Flavio Leitner ; Stokes, Ian
> 
> Subject: Re: [ovs-dev] [v4 10/12] dpif-netdev/mfex: Add AVX512 based optimized
> miniflow extract
> 
> 
> 
> On 30 Jun 2021, at 15:30, Van Haaren, Harry wrote:
> 
> >> -Original Message-
> >> From: Eelco Chaudron 
> >> Sent: Wednesday, June 30, 2021 2:12 PM
> >> To: Amber, Kumar ; Van Haaren, Harry
> >> 
> >> Cc: d...@openvswitch.org; i.maxim...@ovn.org; Flavio Leitner
> ;
> >> Stokes, Ian 
> >> Subject: Re: [ovs-dev] [v4 10/12] dpif-netdev/mfex: Add AVX512 based 
> >> optimized
> >> miniflow extract
> >>
> >> This patch was an interesting patch to review and being reminded about
> endianness,
> >> and this site,
> >>
> https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz
> >> _permutexvar_epi8=4315, got me through it ;)
> >
> > Hah, yes the Intrinsics Guide is very useful for reading/investigating 
> > what/how
> instructions can do.
> > Its... almost always open in a browser in some tab here! :)
> >
> >
> >> Some comments below...
> >>
> >> //Eelco
> >
> > Thanks for review, I'll snip away large chunks of code to reduce verbosity.
> >
> > Regards, -Harry
> >
> >
> >> On 17 Jun 2021, at 18:27, Kumar Amber wrote:
> >>
> >>> From: Harry van Haaren 
> >
> > 
> >
> >>> +/* AVX512-BW level permutex2var_epi8 emulation. */
> >>> +static inline __m512i
> >>> +__attribute__((target("avx512bw")))
> >>
> >> Are these targets universal enough for all supported compilers, if not we 
> >> might
> need
> >> to move them to individual macros in compile.h.
> >
> > Yes, these are the standard gcc/clang etc compiler -m  switches.
> >
> > Search for "-mavx512bw" on e.g. this GCC page, lists them all;
> > https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html
> >
> > If a compiler does not understand them, we will have to #ifdef that 
> > compiler out,
> > as it just doesn't support the ISA.
> 
> Guess my concern is with the windows/Microsoft compiler, as I have no windows
> setup, I can not verify this.

Me neither. Flavio you mentioned a windows compiler issue on the DPIF patchset,
would you test compile here please?


> >>> +/* Static const instances of profiles. These are compile-time constants,
> >>> + * and are specialized into individual miniflow-extract functions.
> >>> + */
> >>> +static const struct mfex_profile mfex_profiles[PROFILE_COUNT] =
> >>> +{
> >>> +[PROFILE_ETH_IPV4_UDP] = {
> >>> +.probe_mask.u8_data = { PATTERN_ETHERTYPE_MASK
> PATTERN_IPV4_MASK
> >> },
> >>> +.probe_data.u8_data = { PATTERN_ETHERTYPE_IPV4
> PATTERN_IPV4_UDP},
> >>> +
> >>> +.store_shuf.u8_data = { PATTERN_IPV4_UDP_SHUFFLE },
> >>> +.store_kmsk = PATTERN_IPV4_UDP_KMASK,
> >>> +
> >>> +.mf_bits = { 0x18a0, 0x00040401},
> >>
> >> I did some manual translation from these bits, to parts of the flow 
> >> structure they
> >> represent, but it was not something fun to do. Maybe you still have your 
> >> notes
> and
> >> could add some to the code? It might help debugging?
> >
> > Agree that these bits are "arbitrary" to some degree, they're offsets into 
> > the
> miniflow
> > datastructure, with each bit representing 8-bytes of data.
> >
> > These are derived from the output of the autovalidator, which prints "good" 
> > and
> "test"
> > values.
> 
> Nice forgot about that part ;)

The autovalidator strikes again :)

> > 
> >
> >> As we are explicitly manual defining the mf_bits I think we also need to 
> >> update
> the
> >> comment in the “struct flow” definition to reflect that if the order 
> >> change these
> >> specific functions need updating also.
> >
> > There's an "ABI Macro" in that struct, we can throw one of those build-time 
> > asserts
> into here
> > too to be "extra sure", but this would be caught by running MFEX 
> > autovalidation
> unit tests.
> 
> Guess they will but not sure if the dpdk test is part of the standard tests. 
> Anyway, this
> is the comment I think should be updated:
> https://github.com/openvswitch/ovs/blob/e5b5008acdf08e90874f5b4da09ffe162fc
> 762aa/include/openvswitch/flow.h#L97

Will include a build-time check in the AVX512 MFEX to fail a build if struct 
flow is
updated in future. Autovalidator would again catch any mis-matches, but nice to
know it at build-time too.


> > 
> >
> >>> +/* Generic loop to process any mfex profile. This code is specialized 
> >>> into
> >>> + * multiple actual MFEX implementation functions. Its marked 
> >>> ALWAYS_INLINE
> >>> + * to ensure the compiler specializes each instance. The code is marked 
> >>> "hot"
> >>> + * to inform the compiler this is a hotspot in the program, encouraging
> >>> + * inlining of callee functions such as the permute calls.
> >>> + */
> >>> +static inline uint32_t ALWAYS_INLINE
> >>> +__attribute__ ((hot))
> >>
> >> Do we need to 

Re: [ovs-dev] [PATCH] datapath-windows: Specify external include paths

2021-06-30 Thread alinserdean
Hi Guru,

 

Thank you for letting me know. I backported the change to the stable branches, down to 2.13.

 

Thank you,

Alin.

 

From: Guru Shetty  
Sent: Saturday, June 26, 2021 1:14 AM
To: Alin-Gabriel Serdean 
Cc: Ilya Maximets ; ovs dev ; Frank 
Wagner 
Subject: Re: [ovs-dev] [PATCH] datapath-windows: Specify external include paths

 

Alin,

 Does it make sense to apply this patch to the stable branches too? I see that 2.15 
fails with a similar error.

 

Thanks,

Guru

 

On Thu, 17 Jun 2021 at 02:31, Alin-Gabriel Serdean <aserd...@ovn.org> wrote:

On Wed, 2021-06-16 at 18:06 +0300, Alin-Gabriel Serdean wrote:
> On Tue, 2021-06-15 at 18:06 +0200, Ilya Maximets wrote:
> > On 6/15/21 3:43 PM, Alin Gabriel Serdean wrote:
> > > VStudio 16.10 adds usermode includes before including the driver
> > > kit ones.
> > > 
> > > Bug tracked at:
> > > https://developercommunity.visualstudio.com/t/error-lnk2019-unresolved-external-symbol-stdio-com/1434674
> > > 
> > > Fixes appveyor build reported by forcing external includes.
> > 
> > Thanks, Alin.  I know nothing about the windows build process, but
> > I
> > see
> > that this patch fixes the issue with the current AppVeyor CI,
> > therefore:
> > 
> > Acked-by: Ilya Maximets mailto:i.maxim...@ovn.org> >
> 
> Thank you!
> 
> > Out of curiosity, is this change backward compatible?  I mean,
> > is it possible to build on older platform (older VS) with this
> > change?
> 
> It should be.
> Usually we do not need to force the order of include directories. For
> kernel projects it should default to the kernel includes.
> I test with the last two versions of VS (2019, 2017).
> 
> We should add a build matrix for different versions of VS images to
> appveyor / GHA so we could be sure.
> I'll try to update the appveyor side.
> 
> FWIW a new version of VS was launched yesterday (
> https://docs.microsoft.com/en-us/visualstudio/releases/2019/release-notes#visual-studio-2019-version-1610-releases
> ), I will try to compile
> without the patch to see if they hotfixed the issue.

It did not. Applying the patch.

___
dev mailing list
d...@openvswitch.org  
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 1/1] match: do not print "igmp" match keyword

2021-06-30 Thread Adrian Moreno
The match keyword "igmp" is not supported in ofp-parse, which means
that flow dumps cannot be restored. This patch prints the igmp match
in the accepted format (ip,nw_proto=2) and adds a test.

Signed-off-by: Adrian Moreno 
---
 lib/match.c| 2 --
 tests/ovs-ofctl.at | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/match.c b/lib/match.c
index ba716579d..4a0778c30 100644
--- a/lib/match.c
+++ b/lib/match.c
@@ -1556,8 +1556,6 @@ match_format(const struct match *match,
 skip_proto = true;
 if (f->nw_proto == IPPROTO_ICMP) {
 ds_put_format(s, "%sicmp%s,", colors.value, colors.end);
-} else if (f->nw_proto == IPPROTO_IGMP) {
-ds_put_format(s, "%sigmp%s,", colors.value, colors.end);
 } else if (f->nw_proto == IPPROTO_TCP) {
 ds_put_format(s, "%stcp%s,", colors.value, colors.end);
 } else if (f->nw_proto == IPPROTO_UDP) {
diff --git a/tests/ovs-ofctl.at b/tests/ovs-ofctl.at
index 5ddca67e7..fbc622959 100644
--- a/tests/ovs-ofctl.at
+++ b/tests/ovs-ofctl.at
@@ -192,6 +192,7 @@ actions=note:41.42.43,note:00.01.02.03.04.05.06.07,note
 ip,actions=set_field:10.4.3.77->ip_src,mod_nw_ecn:2
 sctp actions=drop
 sctp actions=drop
+ip,nw_proto=2 actions=drop
 in_port=0 actions=resubmit:0
 
actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678)
 
actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,ingress)
@@ -226,6 +227,7 @@ OFPT_FLOW_MOD: ADD 
actions=note:41.42.43.00.00.00,note:00.01.02.03.04.05.06.07.0
 OFPT_FLOW_MOD: ADD ip actions=mod_nw_src:10.4.3.77,load:0x2->NXM_NX_IP_ECN[]
 OFPT_FLOW_MOD: ADD sctp actions=drop
 OFPT_FLOW_MOD: ADD sctp actions=drop
+OFPT_FLOW_MOD: ADD ip,nw_proto=2 actions=drop
 OFPT_FLOW_MOD: ADD in_port=0 actions=resubmit:0
 OFPT_FLOW_MOD: ADD 
actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678)
 OFPT_FLOW_MOD: ADD 
actions=sample(probability=12345,collector_set_id=23456,obs_domain_id=34567,obs_point_id=45678,ingress)
-- 
2.31.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 0/1] flow dumps containing igmp match cannot be restored

2021-06-30 Thread Adrian Moreno
As discussed in [1], "igmp" is not a special OF header so
ofp_parse_protocol() does not accept it. However, it's still being
printed by match_format().

This makes flow dump restoration fail if it contains this match,
which affects for instance the ovs-save script used by "ovs-ctl restart"
to make ovs-vswitchd restart have less impact on the datapath.

I am aware this change might break users that already expect this
keyword but on the other hand, the fact that flows cannot be saved and
restored could be considered a bug.

An alternative to make "ovs-save" work would be to use "sed" to rewrite
the match, but that would still expose the failure to any user not using
that script.

Sending the patch to gather feedback on this topic.

[1] 
https://patchwork.ozlabs.org/project/openvswitch/patch/20201102232805.1960103-1-...@ovn.org/


Adrian Moreno (1):
  match: do not print "igmp" match keyword

 lib/match.c| 2 --
 tests/ovs-ofctl.at | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

-- 
2.31.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [v4 12/12] dpif/dpcls: limit count subtable search info logs

2021-06-30 Thread Amber, Kumar
Thanks Eelco, sure, go ahead :) The upgrade will surely raise the pps!

> -Original Message-
> From: Eelco Chaudron 
> Sent: Wednesday, June 30, 2021 8:27 PM
> To: Amber, Kumar ; Van Haaren, Harry
> 
> Cc: d...@openvswitch.org; i.maxim...@ovn.org; Stokes, Ian
> ; Flavio Leitner 
> Subject: Re: [ovs-dev] [v4 12/12] dpif/dpcls: limit count subtable search 
> info logs
> 
> No additional comments on this patch! This concludes my review of v4, looking
> forward to v5.
> 
> I will now do some additional tests on my non AVX512 machine. Guess I need to
> update my Intel NUC to an AVX512 supported one :)
> 
> //Eelco
> 
> 
> On 17 Jun 2021, at 18:27, Kumar Amber wrote:
> 
> > From: Harry van Haaren 
> >
> > This commit avoids many instances of "using subtable X for miniflow (x,y)"
> > in the ovs-vswitchd log when using the DPCLS Autovalidator. This
> > occurs when no specialized subtable is found, and the generic "_any"
> > version of the avx512 subtable search implementation was used. This
> > change logs the subtable usage once, avoiding duplicates.
> >
> > Signed-off-by: Harry van Haaren 
> > ---
> >  lib/dpif-netdev-lookup-avx512-gather.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/lib/dpif-netdev-lookup-avx512-gather.c
> > b/lib/dpif-netdev-lookup-avx512-gather.c
> > index 2e754c89f..deed527b0 100644
> > --- a/lib/dpif-netdev-lookup-avx512-gather.c
> > +++ b/lib/dpif-netdev-lookup-avx512-gather.c
> > @@ -411,7 +411,7 @@ dpcls_subtable_avx512_gather_probe(uint32_t
> u0_bits, uint32_t u1_bits)
> >   */
> >  if (!f && (u0_bits + u1_bits) < (NUM_U64_IN_ZMM_REG * 2)) {
> >  f = dpcls_avx512_gather_mf_any;
> > -VLOG_INFO("Using avx512_gather_mf_any for subtable (%d,%d)\n",
> > +VLOG_INFO_ONCE("Using avx512_gather_mf_any for subtable
> > + (%d,%d)\n",
> >u0_bits, u1_bits);
> >  }
> >
> > --
> > 2.25.1
> >
> > ___
> > dev mailing list
> > d...@openvswitch.org
> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [v4 11/12] dpif-netdev/mfex: add more AVX512 traffic profiles

2021-06-30 Thread Amber, Kumar
Hi Eelco,

Pls find my comments inline.



> >  #define KMASK_ETHER 0x1FFFULL
> > +#define KMASK_DT1Q  0x000FULL
> 
> This was messing me up, as this suggests this is a 16-byte mask, but this is 
> only 8,
> so maybe we should indicate it by removing the two leading zeros?
> 
>#define KMASK_DT1Q0x0FULL
> 

Fixed in v5.

> >  #define KMASK_IPV4  0xF0FFULL
> >  #define KMASK_UDP   0x000FULL
> > +#define KMASK_TCP   0x0F00ULL
> >
> > @@ -233,6 +326,28 @@ mfex_ipv4_set_l2_pad_size(struct dp_packet *pkt,
> > +}
> > +
> > +/* Process TCP flags using known LE endian-ness as this is AVX512
> > +code. */ #define TCP_FLAGS_BE32(tcp_ctl) ((OVS_FORCE ovs_be32)
> > +TCP_FLAGS_BE16(tcp_ctl))
> > +
> 
> Looks like the TCP_FLAGS_BE32() macro is not used in this code.
> 

Cleared in v5.

> > +static void
> > +mfex_handle_tcp_flags(const struct tcp_header *tcp, uint64_t *block)
> > +{
> > +uint16_t ctl = (OVS_FORCE uint16_t) TCP_FLAGS_BE16(tcp->tcp_ctl);
> > +uint64_t ctl_u64 = ctl;
> > +*block = ctl_u64 << 32;
> > +}
> > +
> >  /* Generic loop to process any mfex profile. This code is specialized into
> >   * multiple actual MFEX implementation functions. Its marked
> ALWAYS_INLINE
> >   * to ensure the compiler specializes each instance. The code is marked 
> > "hot"
> > @@ -321,6 +436,43 @@ mfex_avx512_process(struct dp_packet_batch
> *packets,
> >  ovs_assert(0); /* avoid compiler warning on missing ENUM */
> >  break;
> >
> 
> NIT: As we might continue to add variants, would a callback in the profile be
> cleaner? Not sure what arguments to pass? Just a thought…
> 
> 

Nice thought! We have a patch for IPv6 coming up, so we can surely explore the idea.
> >
> >
> > --
> > 2.25.1
> >
> > ___
> > dev mailing list
> > d...@openvswitch.org
> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [v4 12/12] dpif/dpcls: limit count subtable search info logs

2021-06-30 Thread Eelco Chaudron
No additional comments on this patch! This concludes my review of v4, looking 
forward to v5.

I will now do some additional tests on my non AVX512 machine. Guess I need to 
update my Intel NUC to an AVX512 supported one :)

//Eelco


On 17 Jun 2021, at 18:27, Kumar Amber wrote:

> From: Harry van Haaren 
>
> This commit avoids many instances of "using subtable X for miniflow (x,y)"
> in the ovs-vswitchd log when using the DPCLS Autovalidator. This occurs
> when no specialized subtable is found, and the generic "_any" version of
> the avx512 subtable search implementation was used. This change logs the
> subtable usage once, avoiding duplicates.
>
> Signed-off-by: Harry van Haaren 
> ---
>  lib/dpif-netdev-lookup-avx512-gather.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/lib/dpif-netdev-lookup-avx512-gather.c 
> b/lib/dpif-netdev-lookup-avx512-gather.c
> index 2e754c89f..deed527b0 100644
> --- a/lib/dpif-netdev-lookup-avx512-gather.c
> +++ b/lib/dpif-netdev-lookup-avx512-gather.c
> @@ -411,7 +411,7 @@ dpcls_subtable_avx512_gather_probe(uint32_t u0_bits, 
> uint32_t u1_bits)
>   */
>  if (!f && (u0_bits + u1_bits) < (NUM_U64_IN_ZMM_REG * 2)) {
>  f = dpcls_avx512_gather_mf_any;
> -VLOG_INFO("Using avx512_gather_mf_any for subtable (%d,%d)\n",
> +VLOG_INFO_ONCE("Using avx512_gather_mf_any for subtable (%d,%d)\n",
>u0_bits, u1_bits);
>  }
>
> -- 
> 2.25.1
>
> ___
> dev mailing list
> d...@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [v4 11/12] dpif-netdev/mfex: add more AVX512 traffic profiles

2021-06-30 Thread Eelco Chaudron


On 17 Jun 2021, at 18:27, Kumar Amber wrote:

> From: Harry van Haaren 
>
> This commit adds 3 new traffic profile implementations to the
> existing avx512 miniflow extract infrastructure. The profiles added are:
> - Ether()/IP()/TCP()
> - Ether()/Dot1Q()/IP()/UDP()
> - Ether()/Dot1Q()/IP()/TCP()
>
> The design of the avx512 code here is for scalability to add more
> traffic profiles, as well as enabling CPU ISA. Note that an implementation
> is primarily adding static const data, which the compiler then specializes
> away when the profile specific function is declared below.
>
> As a result, the code is relatively maintainable, and scalable for new
> traffic profiles as well as new ISA, and does not lower performance
> compared with manually written code for each profile/ISA.
>
> Note that confidence in the correctness of each implementation is
> achieved through autovalidation, unit tests with known packets, and
> fuzz tested packets.
>
> Signed-off-by: Harry van Haaren 
>
> ---
>
> Hi Readers,
>
> If you have a traffic profile you'd like to see accelerated using
> avx512 code, please send me an email and we can collaborate on adding
> support for it!
>
> Regards, -Harry
> ---
>  lib/dpif-netdev-extract-avx512.c  | 155 ++
>  lib/dpif-netdev-private-extract.c |  31 ++
>  lib/dpif-netdev-private-extract.h |   4 +
>  3 files changed, 190 insertions(+)
>
> diff --git a/lib/dpif-netdev-extract-avx512.c 
> b/lib/dpif-netdev-extract-avx512.c
> index 1145ac8a9..0e0f6e295 100644
> --- a/lib/dpif-netdev-extract-avx512.c
> +++ b/lib/dpif-netdev-extract-avx512.c
> @@ -117,6 +117,13 @@ _mm512_maskz_permutexvar_epi8_wrap(__mmask64 kmask, 
> __m512i idx, __m512i a)
>
>  #define PATTERN_ETHERTYPE_MASK PATTERN_ETHERTYPE_GEN(0xFF, 0xFF)
>  #define PATTERN_ETHERTYPE_IPV4 PATTERN_ETHERTYPE_GEN(0x08, 0x00)
> +#define PATTERN_ETHERTYPE_DT1Q PATTERN_ETHERTYPE_GEN(0x81, 0x00)
> +
> +/* VLAN (Dot1Q) patterns and masks. */
> +#define PATTERN_DT1Q_MASK   \
> +  0x00, 0x00, 0xFF, 0xFF,
> +#define PATTERN_DT1Q_IPV4   \
> +  0x00, 0x00, 0x08, 0x00,
>
>  /* Generator for checking IPv4 ver, ihl, and proto */
>  #define PATTERN_IPV4_GEN(VER_IHL, FLAG_OFF_B0, FLAG_OFF_B1, PROTO) \
> @@ -142,6 +149,29 @@ _mm512_maskz_permutexvar_epi8_wrap(__mmask64 kmask, 
> __m512i idx, __m512i a)
>34, 35, 36, 37, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, /* UDP */  
>  \
>NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, /* Unused. 
> */
>
> +/* TCP shuffle: tcp_ctl bits require mask/processing, not included here. */
> +#define PATTERN_IPV4_TCP_SHUFFLE \
> +   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, NU, NU, /* Ether 
> */ \
> +  26, 27, 28, 29, 30, 31, 32, 33, NU, NU, NU, NU, 20, 15, 22, 23, /* IPv4 */ 
>  \
> +  NU, NU, NU, NU, NU, NU, NU, NU, 34, 35, 36, 37, NU, NU, NU, NU, /* TCP */  
>  \
> +  NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, /* Unused. 
> */
> +
> +#define PATTERN_DT1Q_IPV4_UDP_SHUFFLE
>  \
> +  /* Ether (2 blocks): Note that *VLAN* type is written here. */ 
>  \
> +  0,  1,  2,  3,  4,  5,  6,  7, 8,  9, 10, 11, 16, 17,  0,  0,  
>  \
> +  /* VLAN (1 block): Note that the *EtherHdr->Type* is written here. */  
>  \
> +  12, 13, 14, 15, 0, 0, 0, 0,
>  \
> +  30, 31, 32, 33, 34, 35, 36, 37, 0, 0, 0, 0, 24, 19, 26, 27, /* IPv4 */ 
>  \
> +  38, 39, 40, 41, NU, NU, NU, NU, /* UDP */
> +
> +#define PATTERN_DT1Q_IPV4_TCP_SHUFFLE
>  \
> +  /* Ether (2 blocks): Note that *VLAN* type is written here. */ 
>  \
> +  0,  1,  2,  3,  4,  5,  6,  7, 8,  9, 10, 11, 16, 17,  0,  0,  
>  \
> +  /* VLAN (1 block): Note that the *EtherHdr->Type* is written here. */  
>  \
> +  12, 13, 14, 15, 0, 0, 0, 0,
>  \
> +  30, 31, 32, 33, 34, 35, 36, 37, 0, 0, 0, 0, 24, 19, 26, 27, /* IPv4 */ 
>  \
> +  NU, NU, NU, NU, NU, NU, NU, NU, 38, 39, 40, 41, NU, NU, NU, NU, /* TCP */  
>  \
> +  NU, NU, NU, NU, NU, NU, NU, NU, /* Unused. */
>
>  /* Generation of K-mask bitmask values, to zero out data in result. Note that
>   * these correspond 1:1 to the above "*_SHUFFLE" values, and bit used must be
> @@ -151,12 +181,22 @@ _mm512_maskz_permutexvar_epi8_wrap(__mmask64 kmask, 
> __m512i idx, __m512i a)
>   * Note the ULL suffix allows shifting by 32 or more without integer 
> overflow.
>   */
>  #define KMASK_ETHER 0x1FFFULL
> +#define KMASK_DT1Q  0x000FULL

This was messing me up, as this suggests this is a 16-byte mask, but this is 
only 8, so maybe we should indicate it by removing the two leading zeros?

   #define KMASK_DT1Q  0x0FULL

>  #define KMASK_IPV4  0xF0FFULL
>  #define KMASK_UDP   0x000FULL
> +#define KMASK_TCP   

Re: [ovs-dev] [v4 10/12] dpif-netdev/mfex: Add AVX512 based optimized miniflow extract

2021-06-30 Thread Eelco Chaudron


On 30 Jun 2021, at 15:30, Van Haaren, Harry wrote:

>> -Original Message-
>> From: Eelco Chaudron 
>> Sent: Wednesday, June 30, 2021 2:12 PM
>> To: Amber, Kumar ; Van Haaren, Harry
>> 
>> Cc: d...@openvswitch.org; i.maxim...@ovn.org; Flavio Leitner 
>> ;
>> Stokes, Ian 
>> Subject: Re: [ovs-dev] [v4 10/12] dpif-netdev/mfex: Add AVX512 based 
>> optimized
>> miniflow extract
>>
>> This patch was an interesting patch to review and being reminded about 
>> endianness,
>> and this site,
>> https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz
>> _permutexvar_epi8=4315, got me through it ;)
>
> Hah, yes the Intrinsics Guide is very useful for reading/investigating 
> what/how instructions can do.
> Its... almost always open in a browser in some tab here! :)
>
>
>> Some comments below...
>>
>> //Eelco
>
> Thanks for review, I'll snip away large chunks of code to reduce verbosity.
>
> Regards, -Harry
>
>
>> On 17 Jun 2021, at 18:27, Kumar Amber wrote:
>>
>>> From: Harry van Haaren 
>
> 
>
>>> +/* AVX512-BW level permutex2var_epi8 emulation. */
>>> +static inline __m512i
>>> +__attribute__((target("avx512bw")))
>>
>> Are these targets universal enough for all supported compilers, if not we 
>> might need
>> to move them to individual macros in compile.h.
>
> Yes, these are the standard gcc/clang etc compiler -m  switches.
>
> Search for "-mavx512bw" on e.g. this GCC page, lists them all;
> https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html
>
> If a compiler does not understand them, we will have to #ifdef that compiler 
> out,
> as it just doesn't support the ISA.

Guess my concern is with the windows/Microsoft compiler, as I have no windows 
setup, I can not verify this.

>>> +/* Static const instances of profiles. These are compile-time constants,
>>> + * and are specialized into individual miniflow-extract functions.
>>> + */
>>> +static const struct mfex_profile mfex_profiles[PROFILE_COUNT] =
>>> +{
>>> +[PROFILE_ETH_IPV4_UDP] = {
>>> +.probe_mask.u8_data = { PATTERN_ETHERTYPE_MASK PATTERN_IPV4_MASK
>> },
>>> +.probe_data.u8_data = { PATTERN_ETHERTYPE_IPV4 PATTERN_IPV4_UDP},
>>> +
>>> +.store_shuf.u8_data = { PATTERN_IPV4_UDP_SHUFFLE },
>>> +.store_kmsk = PATTERN_IPV4_UDP_KMASK,
>>> +
>>> +.mf_bits = { 0x18a0, 0x00040401},
>>
>> I did some manual translation from these bits, to parts of the flow 
>> structure they
>> represent, but it was not something fun to do. Maybe you still have your 
>> notes and
>> could add some to the code? It might help debugging?
>
> Agree that these bits are "arbitrary" to some degree, they're offsets into 
> the miniflow
> datastructure, with each bit representing 8-bytes of data.
>
> These are derived from the output of the autovalidator, which prints "good" 
> and "test"
> values.

Nice, I forgot about that part ;)
> 
>
>> As we are explicitly manual defining the mf_bits I think we also need to 
>> update the
>> comment in the “struct flow” definition to reflect that if the order change 
>> these
>> specific functions need updating also.
>
> There's an "ABI Macro" in that struct, we can throw one of those build-time 
> asserts into here
> too to be "extra sure", but this would be caught by running MFEX 
> autovalidation unit tests.

Guess they will, but I'm not sure if the DPDK test is part of the standard tests. 
Anyway, this is the comment I think should be updated: 
https://github.com/openvswitch/ovs/blob/e5b5008acdf08e90874f5b4da09ffe162fc762aa/include/openvswitch/flow.h#L97

> 
>
>>> +/* Generic loop to process any mfex profile. This code is specialized into
>>> + * multiple actual MFEX implementation functions. Its marked ALWAYS_INLINE
>>> + * to ensure the compiler specializes each instance. The code is marked 
>>> "hot"
>>> + * to inform the compiler this is a hotspot in the program, encouraging
>>> + * inlining of callee functions such as the permute calls.
>>> + */
>>> +static inline uint32_t ALWAYS_INLINE
>>> +__attribute__ ((hot))
>>
>> Do we need to move this to a macro in compiler.h as OVS_HOT to make sure 
>> it’s not
>> causing issues on other compilers like windows, etc?
>
> I'm not sure, we could I suppose, I'm not strongly for or against. Today this
> patchset doesn't modify compiler.h at all, perhaps cleaner to update in a 
> later patch,
> and consider other functions for tagging with OVS_HOT too in that patchset?
>
> 

I do not have a strong preference either. It looks like this is the only 
patch/place using it, and as you suggested, we could do it in a follow-up patch 
if we start using it in more places.

>>> +/* Copy known dp packet offsets to the dp_packet instance. */
>>> +memcpy(>l2_pad_size, >dp_pkt_offs,
>>> +   sizeof(uint16_t) * 4);
>>> +
>>
>> Here we copy four fields to the packet structure (l2_pad_size, l2_5_ofs, 
>> l3_ofs,
>> l4_ofs). I think we should add some static_assert to make sure the order of 
>> these
>> fields 

Re: [ovs-dev] [PATCH ovn] ovn-controller: Fix port group I-P when they contain non-vif ports.

2021-06-30 Thread Dumitru Ceara
On 6/28/21 8:05 AM, Han Zhou wrote:
> On Fri, Jun 25, 2021 at 2:38 PM Numan Siddique  wrote:
>>
>> On Fri, Jun 25, 2021 at 2:53 PM Han Zhou  wrote:
>>>
>>> On Fri, Jun 25, 2021 at 4:50 AM Dumitru Ceara  wrote:

 It's valid that port_groups contain non-vif ports, they can actually
 contain any type of logical_switch_port.

 Also, there's no need to allocate a new sset containing the local
> ports'
 names every time the I-P engine processes a change, we can maintain a
 sset and incrementally update it when port bindings are added/removed.

>>> Thanks Dumitru for the fix and thanks Numan for the review.
>>>
>>> I battled with myself when deciding to allocate a new sset for the local
>>> ports' names at the I-P main iteration level. I did this because the
>>> current data structures maintaining the local bindings were already very
>>> complex, and the sset was not to maintain any extra information but just
>>> redundant information (for efficiency). So I decided to abstract this
> part
>>> as a helper function so that I don't add more complexity in the binding
>>> data structure, and other I-P engine nodes doesn't need to understand
> the
>>> internals of how the bindings are maintained in the bindings module.
>>> Regarding the cost, the local binding data should be small, and the sset
>>> allocation is at the main loop level, so really nothing to worry about
> the
>>> cost.
>>>
>>> However, I didn't think about the non-VIF use case of port-group, and
> the
>>> local_binding doesn't maintain non-VIF bindings, so came the bug. This
>>> patch fixes it by maintaining a sset that includes all types of lport
>>> names. It looks correct to me, but I have some comments:
>>>
>>> 1) The structures in bindings module is really complex and I spent a
> lot of
>>> time to understand it before, but when I am reviewing this patch I had
> to
>>> spent some time again to understand it. There are fields in binding_ctx
>>> look quite similar, and the comments don't tell exactly the difference:
>>>
>>> - struct local_binding_data *lbinding_data;
>>> - struct sset *local_lports;
>>> - struct sset *local_lport_ids;
>>>
>>
>> I agree with the complexity and the naming confusion.
>>
>> I think local_lports and local_lport_ids have been maintained in the
>> binding code
>> since a long time and lbinding_data was added recently.
>>
>> I think there is a lot of redundant data which can be unified.
>>
>>
>>> According to the code (and also the bug the patch is trying to fix),
>>> lbinding_data is supposed to maintain VIFs only.
>>
>> I agree.  "lbinding_data" is supposed to maintain local vif binding
> information.
>>
>>> local_lports maintains all types, but it maintains *potentially* local
> VIFs
>>> as well (meaning the VIF may not be bound locally yet). I was thinking
> if I
>>> could use local_lports directly. I think it would work, but just not
>>> accurate enough (maybe it doesn't really matter).
>>
>>
>>> The local_lport_ids may look straightforward, which maintains generated
> id
>>> keys for local_lports, but the functions update_local_lport_ids() and
>>> remove_local_lport_ids() are not only updating the local_lport_ids, but
>>> also tracking information of lbinding_data, which is really confusing.
>>>
>>> 2) Now for this patch, the intention is to include non-VIF bindings,
> but it
>>> adds a sset to maintain all types of lports in "lbinding_data", which
> was
>>> supposed to maintain VIF bindings only. I think it is not the right
> place
>>> to maintain this sset. And the
>>> update_local_lport_ids()/remove_local_lport_ids() are not the right
> place
>>> to update them either.
>>>
>>> So here are my suggestions:
>>>
>>> 1) Clarify a little more about the role of each of the above fields in
>>> binding_ctx with some comments.
>>
>> These comments would be super helpful.  But I think it is outside the
>> scope of this bug fix patch.  It's better if it's a separate patch.
> 
> Agree. And I just noticed that the comments for local_lport_ids in
> ovn-controller.c is not correct any more (probably since very long time
> ago):
> /* Contains the same ports as local_lports, but in the format:
> 
>  * _ */
> 
> struct sset local_lport_ids;
> 
> It in fact contains more lports than local_lports, including patch ports,
> and they are used for different purposes. I think we should rename them.
> 
>>
>>> 2) Can we use local_lports directly, instead of maintaining a new sset?
> If
>>> we can't (I am not sure yet), can we generate it on-the-fly, just
> updating
>>> the "binding_collect_local_binding_lports" by adding non-VIF lports from
>>> "local_lports"? I really don't think the cost makes any difference
> overall.
>>> If none of the above work, can we maintain it as a separate field at
>>> binding_ctx level instead of in lbinding_data (with proper comment
> telling
>>> the difference from local_lports)?
>>
>> I think local_lports can be used.  The side effect would be that we
>> 

[ovs-dev] [PATCH ovn v2] ovn-controller: Fix port group I-P when they contain non-vif ports.

2021-06-30 Thread Dumitru Ceara
It's valid that port_groups contain non-vif ports, they can actually
contain any type of logical_switch_port.

Also, there's no need to allocate a new sset containing the local ports'
names every time the I-P engine processes a change.  We were already
maintaining a set of "local_lport_ids".  These correspond to port
bindings that are relevant locally (including non-vif ports).  Extend
it to include the locally relevant lport names too and rename the
structure and its helper functions to related_lport*.

Reported-at: https://github.com/ovn-org/ovn/pull/61#issuecomment-865094163
Reported-by: Antonio Ojea 
Fixes: 0cfeba6b55e3 ("ovn-controller: Fix port group conjunction flow explosion 
problem.")
Signed-off-by: Dumitru Ceara 
---
v2:
- Addressed Numan's and Han's comments:
  - add struct related_lports
  - add test case.
---
 controller/binding.c| 79 ++---
 controller/binding.h| 31 ---
 controller/lflow.c  |  2 +-
 controller/lflow.h  |  2 +-
 controller/ovn-controller.c | 48 +-
 tests/ovn.at| 44 +
 6 files changed, 120 insertions(+), 86 deletions(-)

diff --git a/controller/binding.c b/controller/binding.c
index 7fde0fdbb..594babc98 100644
--- a/controller/binding.c
+++ b/controller/binding.c
@@ -531,38 +531,41 @@ remove_local_lports(const char *iface_id, struct 
binding_ctx_out *b_ctx)
 }
 }
 
-/* Add a port binding ID (of the form "dp-key"_"port-key") to the set of local
- * lport IDs. Also track if the set has changed.
+/* Add a port binding to the set of locally relevant lports.
+ * Also track if the set has changed.
  */
 static void
-update_local_lport_ids(const struct sbrec_port_binding *pb,
-   struct binding_ctx_out *b_ctx)
+update_related_lport(const struct sbrec_port_binding *pb,
+ struct binding_ctx_out *b_ctx)
 {
 char buf[16];
 get_unique_lport_key(pb->datapath->tunnel_key, pb->tunnel_key,
  buf, sizeof(buf));
-if (sset_add(b_ctx->local_lport_ids, buf) != NULL) {
-b_ctx->local_lport_ids_changed = true;
+if (sset_add(&b_ctx->related_lports->lport_ids, buf) != NULL) {
+b_ctx->related_lports_changed = true;
 
 if (b_ctx->tracked_dp_bindings) {
 /* Add the 'pb' to the tracked_datapaths. */
 tracked_binding_datapath_lport_add(pb, b_ctx->tracked_dp_bindings);
 }
 }
+sset_add(&b_ctx->related_lports->lport_names, pb->logical_port);
 }
 
-/* Remove a port binding id from the set of local lport IDs. Also track if
- * the set has changed.
+/* Remove a port binding id from the set of locally relevant lports.
+ * Also track if the set has changed.
  */
 static void
-remove_local_lport_ids(const struct sbrec_port_binding *pb,
-   struct binding_ctx_out *b_ctx)
+remove_related_lport(const struct sbrec_port_binding *pb,
+ struct binding_ctx_out *b_ctx)
 {
 char buf[16];
 get_unique_lport_key(pb->datapath->tunnel_key, pb->tunnel_key,
  buf, sizeof(buf));
-if (sset_find_and_delete(b_ctx->local_lport_ids, buf)) {
-b_ctx->local_lport_ids_changed = true;
+sset_find_and_delete(&b_ctx->related_lports->lport_names,
+ pb->logical_port);
+if (sset_find_and_delete(&b_ctx->related_lports->lport_ids, buf)) {
+b_ctx->related_lports_changed = true;
 
 if (b_ctx->tracked_dp_bindings) {
 /* Add the 'pb' to the tracked_datapaths. */
@@ -678,6 +681,20 @@ static struct binding_lport 
*binding_lport_check_and_cleanup(
 
 static char *get_lport_type_str(enum en_lport_type lport_type);
 
+void
+related_lports_init(struct related_lports *rp)
+{
+sset_init(&rp->lport_names);
+sset_init(&rp->lport_ids);
+}
+
+void
+related_lports_destroy(struct related_lports *rp)
+{
+sset_destroy(&rp->lport_names);
+sset_destroy(&rp->lport_ids);
+}
+
 void
 local_binding_data_init(struct local_binding_data *lbinding_data)
 {
@@ -1172,7 +1189,7 @@ release_binding_lport(const struct sbrec_chassis 
*chassis_rec,
   struct binding_ctx_out *b_ctx_out)
 {
 if (is_binding_lport_this_chassis(b_lport, chassis_rec)) {
-remove_local_lport_ids(b_lport->pb, b_ctx_out);
+remove_related_lport(b_lport->pb, b_ctx_out);
 if (!release_lport(b_lport->pb, sb_readonly,
b_ctx_out->tracked_dp_bindings,
b_ctx_out->if_mgr)) {
@@ -1214,7 +1231,7 @@ consider_vif_lport_(const struct sbrec_port_binding *pb,
pb->datapath, false,
b_ctx_out->local_datapaths,
b_ctx_out->tracked_dp_bindings);
-update_local_lport_ids(pb, b_ctx_out);
+update_related_lport(pb, b_ctx_out);
 update_local_lports(pb->logical_port, b_ctx_out);
 if 

Re: [ovs-dev] [PATCH ovn v3 2/3] ovn-northd: Add useful stopwatches

2021-06-30 Thread Dumitru Ceara
On 6/30/21 2:49 PM, Mark Gray wrote:
> On 24/06/2021 16:33, Dumitru Ceara wrote:
>> On 6/18/21 10:52 AM, Mark Gray wrote:
>>> For performance measurement, it is useful to understand the
>>> length of time required to complete a number of key code paths
>>> in ovn-northd.c. Add stopwatches to measure these timings.
>>>
>>> Signed-off-by: Mark Gray 
>>> ---
>>
>> Acked-by: Dumitru Ceara 
>>
>> I only have one real nit on this patch (below).  Except for that here
>> are some more random thoughts for potential follow ups.
>>
>> I think we might benefit from an even more granular measurement, e.g., in
>> some of the tests I was doing a while ago build_datapaths() was also
>> taking up a significant amount of time.
> 
> Yes, I think so too. However, I don't want to start cluttering the code
> with stopwatches put in arbitrary locations. I would rather it was
> driven by actual need. I don't feel particularly qualified to make that
> decision and I figured this patch is more about establishing the precedent.
> 

You're right, let's take it one step at a time.

>>
>> Some more interesting ones to measure are in
>> build_lswitch_and_lrouter_flows(), the loops that call
>> build_lswitch_*_by_od/op().  I don't know however how we could deal with
>> the parallel case though.
> 
> I could add these as an additional patch if you are convinced of their
> utility?
> 

Same as above, I'd say let's take it one step at a time, unless someone
else has an easy-to-implement idea.

Regards,
Dumitru

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH ovn v3 3/3] tests: Add check-perf target

2021-06-30 Thread Dumitru Ceara
On 6/30/21 2:49 PM, Mark Gray wrote:
> On 24/06/2021 16:34, Dumitru Ceara wrote:
>> On 6/18/21 10:52 AM, Mark Gray wrote:
>>> Add a suite of micro-benchmarks to aid a developer in understanding the
>>> performance impact of any changes that they are making. They can be used to
>>> help to understand the relative performance between two test runs on the 
>>> same
>>> test machine, but are not intended to give the absolute performance of OVN.
>>>
>>> To invoke the performance testsuite, run:
>>>
>>> $ make check-perf
>>>
>>> This will run all available performance tests.
>>>
>>> Additional metrics (e.g. memory, coverage, perf counters) may be added
>>> in the future. Additional tests (e.g. additional topologies,  ovn-controller
>>> tests) may be added in the future.
>>>
>>> Signed-off-by: Mark Gray 
>>> ---
>>
>> Thanks for this, I think this is a very good idea!
> 
> Thanks (and thanks for the review!), I think it is a good starting point
> but it will only be useful if it gets used! Personally, I can see myself
> using this. However, if it doesn't, it is not very invasive and could
> easily be left to rot or removed.
>>
>>>
>>> Notes:
>>> v2:  create results directory to fix build error
>>> v3:  forgot to commit, create results directory to fix build error
>>>
>>>  Documentation/topics/testing.rst |  49 
>>>  tests/.gitignore |   3 +
>>>  tests/automake.mk|  27 
>>>  tests/perf-northd.at | 207 +++
>>>  tests/perf-testsuite.at  |  26 
>>>  5 files changed, 312 insertions(+)
>>>  create mode 100644 tests/perf-northd.at
>>>  create mode 100644 tests/perf-testsuite.at
>>>
>>> diff --git a/Documentation/topics/testing.rst 
>>> b/Documentation/topics/testing.rst
>>> index be9e7c57331c..ccd3278437b1 100644
>>> --- a/Documentation/topics/testing.rst
>>> +++ b/Documentation/topics/testing.rst
>>> @@ -256,3 +256,52 @@ the following::
>>>  All the features documented under `Unit Tests`_ are available for the
>>>  datapath testsuites, except that the datapath testsuites do not
>>>  support running tests in parallel.
>>> +
>>> +Performance testing
>>> +~~~~~~~~~~~~~~~~~~~
>>> +
>>> +OVN includes a suite of micro-benchmarks to aid a developer in 
>>> understanding the
>>> +performance impact of any changes that they are making. They can be used to
>>> +help to understand the relative performance between two test runs on the 
>>> same
>>> +test machine, but are not intended to give the absolute performance of OVN.
>>> +
>>> +To invoke the performance testsuite, run::
>>> +
>>> +$ make check-perf
>>> +
>>> +This will run all available performance tests. Some of these tests may be
>>> +long-running as they need to build complex logical network topologies. In 
>>> order
>>> +to speed up subsequent test runs, some objects (e.g. the Northbound DB) 
>>> may be
>>> +cached. In order to force the tests to rebuild all these objects, run::
>>> +
>>> +$ make check-perf TESTSUITEFLAGS="--rebuild"
>>> +
>>> +A typical workflow for a developer trying to improve the performance of OVN
>>> +would be the following:
>>> +
>>> +0. Optional: Modify/add a performance test to build the topology that you 
>>> are
>>> +   benchmarking, if required.
>>> +1. Run ``make check-perf TESTSUITEFLAGS="--rebuild"`` to generate cached
>>> +   databases.
>>> +
>>> +.. note::
>>> +   This step may take some time depending on the number of tests that are 
>>> being
>>> +   rebuilt, the complexity of the tests and the performance of the test 
>>> machine.
>>> +   If you are only using one test, you can specify the test to run by 
>>> adding the
>>> +   test number to the ``make`` command.
>>> +   (e.g. ``make check-perf TESTSUITEFLAGS="--rebuild <test number>"``)
>>> +
>>> +2. Run ``make check-perf`` to measure the performance metric that you are
>>> +   benchmarking against. If you are only using one test, you can specify 
>>> the test to run by adding the test number to the ``make`` command.
>>> +   (e.g. ``make check-perf TESTSUITEFLAGS="--rebuild <test number>"``)
>>
>> It's not very clear where the user would find the test results:
>>
>> tests/perf-testsuite.dir/results
> 
> It is mentioned further below but I will move it up to here to make it
> clearer.

Ah, OK, I missed it completely.

[...]

>>
>> The file is called perf-northd.at so if we ever decide to add some tests
>> to measure end-to-end performance including ovn-controller I assume
>> we'll need a new file, e.g., perf-controller.at.  Shouldn't we already
>> move all of the generic macros above to a shared file; maybe ovn-macros.at?
>>
> 
> This is exactly what I was thinking but I didn't want to do it yet
> because a) I am not sure if it will be used much b) I am unsure what
> will be common yet.
> 
> As such, I would prefer to hold off but I am happy to do it now if you
> think there is benefit.
> 

Ok, makes sense, let's keep it like this and see where it goes.

Thanks!


Re: [ovs-dev] [PATCH ovn] docs: fix git format-patch command for backports

2021-06-30 Thread Ihar Hrachyshka
On Tue, Jun 29, 2021 at 1:18 PM Ben Pfaff  wrote:
>
> On Tue, Jun 29, 2021 at 12:50:54PM -0400, Ihar Hrachyshka wrote:
> > On Tue, Jun 29, 2021 at 12:46 PM Ben Pfaff  wrote:
> > >
> > > On Tue, Jun 29, 2021 at 12:24:11PM -0400, Ihar Hrachyshka wrote:
> > > > One, HEAD~, not HEAD, should be used to generate any patches. Two, add
> > > > "ovn" to the generated mail topic. Third, update branch name to a
> > > > fresh one.
> > > >
> > > > Signed-off-by: Ihar Hrachyshka 
> > >
> > > Both of these look odd to me:
> > > > -$ git format-patch HEAD --subject-prefix="PATCH branch-2.7"
> > > > +$ git format-patch HEAD~ --subject-prefix="PATCH ovn branch-21.06"
> > >
> > > I think the idea here is to just generate one patch from the tip of the
> > > current branch.  HEAD~ works but I'd normally write -1 instead.
> > >
> >
> > Yes. But HEAD doesn't generate any patches since it's a head and there
> > are no patches "above" it. Am I missing something?
>
> You're correct.  -1 and HEAD~ have the same effect, but -1 is the more
> common way to write it.
>

I can't comment on what's more common (personally using HEAD~ hence
putting it here), but I don't mind if we change to -1 and not HEAD~.
Whatever works.

Ihar

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the best mfex function

2021-06-30 Thread Van Haaren, Harry
> -Original Message-
> From: Eelco Chaudron 
> Sent: Wednesday, June 30, 2021 11:07 AM
> To: Van Haaren, Harry 
> Cc: Flavio Leitner ; Amber, Kumar ;
> d...@openvswitch.org; i.maxim...@ovn.org
> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select 
> the best
> mfex function
> 
> On 30 Jun 2021, at 11:43, Van Haaren, Harry wrote:



> > $ ovs-appctl  dpif-netdev/miniflow-parser-set avx512_ipv4_udp
> >
> > There is an assumption here that all datapath threads handle
> > the same outer traffic type. If that's not the case, we cannot manually
> > set different MFEX impls to different pmd threads today, as your lab
> > to production requirement requests above.
> >
> > If we add an optional PMD thread id parameter, we can support this:
> > $ ovs-appctl  dpif-netdev/miniflow-parser-set avx512_ipv4_udp
>  
> 
> I think if we allow study to set it per PMD thread, we should support the pmd 
> thread
> for manual configuration.
> We also might need to re-think the command to make sure packet_count_to_study
> is only needed for the study command.
> So the help text might become something like:
> 
> dpif-netdev/miniflow-parser-set {miniflow_implementation_name | study 
> [pkt_cnt]}
> [dp] [pmd_core]

Amber has designed & implemented a proposal, with documentation on each. 
Request to
review the next version of the patchset when available, to ensure it meets 
requirements.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [v4 10/12] dpif-netdev/mfex: Add AVX512 based optimized miniflow extract

2021-06-30 Thread Van Haaren, Harry
> -Original Message-
> From: Eelco Chaudron 
> Sent: Wednesday, June 30, 2021 2:12 PM
> To: Amber, Kumar ; Van Haaren, Harry
> 
> Cc: d...@openvswitch.org; i.maxim...@ovn.org; Flavio Leitner 
> ;
> Stokes, Ian 
> Subject: Re: [ovs-dev] [v4 10/12] dpif-netdev/mfex: Add AVX512 based optimized
> miniflow extract
> 
> This patch was an interesting patch to review and being reminded about 
> endianness,
> and this site,
> https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz
> _permutexvar_epi8=4315, got me through it ;)

Hah, yes the Intrinsics Guide is very useful for reading/investigating what/how 
instructions can do.
Its... almost always open in a browser in some tab here! :)


> Some comments below...
> 
> //Eelco

Thanks for review, I'll snip away large chunks of code to reduce verbosity.

Regards, -Harry


> On 17 Jun 2021, at 18:27, Kumar Amber wrote:
> 
> > From: Harry van Haaren 



> > +/* AVX512-BW level permutex2var_epi8 emulation. */
> > +static inline __m512i
> > +__attribute__((target("avx512bw")))
> 
> Are these targets universal enough for all supported compilers, if not we 
> might need
> to move them to individual macros in compile.h.

Yes, these are the standard gcc/clang etc compiler -m  switches.

Search for "-mavx512bw" on e.g. this GCC page, lists them all;
https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html 

If a compiler does not understand them, we will have to #ifdef that compiler 
out,
as it just doesn't support the ISA.


> > +/* Static const instances of profiles. These are compile-time constants,
> > + * and are specialized into individual miniflow-extract functions.
> > + */
> > +static const struct mfex_profile mfex_profiles[PROFILE_COUNT] =
> > +{
> > +[PROFILE_ETH_IPV4_UDP] = {
> > +.probe_mask.u8_data = { PATTERN_ETHERTYPE_MASK PATTERN_IPV4_MASK
> },
> > +.probe_data.u8_data = { PATTERN_ETHERTYPE_IPV4 PATTERN_IPV4_UDP},
> > +
> > +.store_shuf.u8_data = { PATTERN_IPV4_UDP_SHUFFLE },
> > +.store_kmsk = PATTERN_IPV4_UDP_KMASK,
> > +
> > +.mf_bits = { 0x18a0, 0x00040401},
> 
> I did some manual translation from these bits, to parts of the flow structure 
> they
> represent, but it was not something fun to do. Maybe you still have your 
> notes and
> could add some to the code? It might help debugging?

Agree that these bits are "arbitrary" to some degree, they're offsets into the 
miniflow
datastructure, with each bit representing 8-bytes of data.

These are derived from the output of the autovalidator, which prints "good" and 
"test"
values.



> As we are explicitly manual defining the mf_bits I think we also need to 
> update the
> comment in the “struct flow” definition to reflect that if the order change 
> these
> specific functions need updating also.

There's an "ABI Macro" in that struct, we can throw one of those build-time 
asserts into here
too to be "extra sure", but this would be caught by running MFEX autovalidation 
unit tests.




> > +/* Generic loop to process any mfex profile. This code is specialized into
> > + * multiple actual MFEX implementation functions. Its marked ALWAYS_INLINE
> > + * to ensure the compiler specializes each instance. The code is marked 
> > "hot"
> > + * to inform the compiler this is a hotspot in the program, encouraging
> > + * inlining of callee functions such as the permute calls.
> > + */
> > +static inline uint32_t ALWAYS_INLINE
> > +__attribute__ ((hot))
> 
> Do we need to move this to a macro in compiler.h as OVS_HOT to make sure it’s 
> not
> causing issues on other compilers like windows, etc?

I'm not sure, we could I suppose, I'm not strongly for or against. Today this
patchset doesn't modify compiler.h at all, perhaps cleaner to update in a later 
patch,
and consider other functions for tagging with OVS_HOT too in that patchset? 



> > +/* Copy known dp packet offsets to the dp_packet instance. */
> > +memcpy(>l2_pad_size, >dp_pkt_offs,
> > +   sizeof(uint16_t) * 4);
> > +
> 
> Here we copy four fields to the packet structure (l2_pad_size, l2_5_ofs, 
> l3_ofs,
> l4_ofs). I think we should add some static_assert to make sure the order of 
> these
> fields do not change.

Yes, I think Flavio had a similar comment in one of the reviews. Good point,
has been addressed with BUILD_ASSERT_DECL() and offsets into struct by Amber.


___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH ovn v5 2/2] tests: Add check-perf target

2021-06-30 Thread Mark Gray
Add a suite of micro-benchmarks to aid a developer in understanding the
performance impact of any changes that they are making. They can be used to
help to understand the relative performance between two test runs on the same
test machine, but are not intended to give the absolute performance of OVN.

To invoke the performance testsuite, run:

$ make check-perf

This will run all available performance tests.

Additional metrics (e.g. memory, coverage, perf counters) may be added
in the future. Additional tests (e.g. additional topologies,  ovn-controller
tests) may be added in the future.

Signed-off-by: Mark Gray 
---

Notes:
v2:  create results directory to fix build error
v3:  forgot to commit, create results directory to fix build error
v4:  fix 0-day issues
 remove `sudo` in Makefile
 updated documentation

 Documentation/topics/testing.rst |  50 
 tests/.gitignore |   3 +
 tests/automake.mk|  27 
 tests/perf-northd.at | 207 +++
 tests/perf-testsuite.at  |  26 
 5 files changed, 313 insertions(+)
 create mode 100644 tests/perf-northd.at
 create mode 100644 tests/perf-testsuite.at

diff --git a/Documentation/topics/testing.rst b/Documentation/topics/testing.rst
index be9e7c57331c..db265344a507 100644
--- a/Documentation/topics/testing.rst
+++ b/Documentation/topics/testing.rst
@@ -256,3 +256,53 @@ the following::
 All the features documented under `Unit Tests`_ are available for the
 datapath testsuites, except that the datapath testsuites do not
 support running tests in parallel.
+
+Performance testing
+~~~~~~~~~~~~~~~~~~~
+
+OVN includes a suite of micro-benchmarks to aid a developer in understanding
+the performance impact of any changes that they are making. They can be used to
+help to understand the relative performance between two test runs on the same
+test machine, but are not intended to give the absolute performance of OVN.
+
+To invoke the performance testsuite, run::
+
+$ make check-perf
+
+This will run all available performance tests. Some of these tests may be
+long-running as they need to build complex logical network topologies. In order
+to speed up subsequent test runs, some objects (e.g. the Northbound DB) may be
+cached. In order to force the tests to rebuild all these objects, run::
+
+$ make check-perf TESTSUITEFLAGS="--rebuild"
+
+A typical workflow for a developer trying to improve the performance of OVN
+would be the following:
+
+0. Optional: Modify/add a performance test to build the topology that you are
+   benchmarking, if required.
+1. Run ``make check-perf TESTSUITEFLAGS="--rebuild"`` to generate cached
+   databases (and complete a test run). The results of each test run are
+   displayed on the screen at the end of the test run but are also saved in the
+   file ``tests/perf-testsuite.dir/results``.
+
+.. note::
+   This step may take some time depending on the number of tests that are being
+   rebuilt, the complexity of the tests and the performance of the test
+   machine. If you are only using one test, you can specify the test to run by
+   adding the test number to the ``make`` command.
+   (e.g. ``make check-perf TESTSUITEFLAGS="--rebuild <test number>"``)
+
+2. Run ``make check-perf`` to measure the performance metric that you are
+   benchmarking against. If you are only using one test, you can specify the
+   test to run by adding the test number to the ``make`` command.
+   (e.g. ``make check-perf TESTSUITEFLAGS="--rebuild <test number>"``)
+3. Modify OVN code to implement the change that you believe will improve the
+   performance.
+4. Go to Step 2. to continue making improvements.
+
+If, as a developer, you modify a performance test in a way that may change one
+of these cached objects, be sure to rebuild the test.
+
+The cached objects are stored under the relevant folder in
+``tests/perf-testsuite.dir/cached``.
diff --git a/tests/.gitignore b/tests/.gitignore
index 8479f9bb0f8f..65cb1c6e4fad 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -22,6 +22,9 @@
 /system-offloads-testsuite
 /system-offloads-testsuite.dir/
 /system-offloads-testsuite.log
+/perf-testsuite
+/perf-testsuite.dir/
+/perf-testsuite.log
 /test-aes128
 /test-atomic
 /test-bundle
diff --git a/tests/automake.mk b/tests/automake.mk
index a8ec64212791..5b890d644eeb 100644
--- a/tests/automake.mk
+++ b/tests/automake.mk
@@ -4,9 +4,11 @@ EXTRA_DIST += \
$(SYSTEM_TESTSUITE_AT) \
$(SYSTEM_KMOD_TESTSUITE_AT) \
$(SYSTEM_USERSPACE_TESTSUITE_AT) \
+   $(PERF_TESTSUITE_AT) \
$(TESTSUITE) \
$(SYSTEM_KMOD_TESTSUITE) \
$(SYSTEM_USERSPACE_TESTSUITE) \
+   $(PERF_TESTSUITE) \
tests/atlocal.in \
$(srcdir)/package.m4 \
$(srcdir)/tests/testsuite \
@@ -53,6 +55,10 @@ SYSTEM_TESTSUITE_AT = \
tests/system-ovn.at \
tests/system-ovn-kmod.at
 
+PERF_TESTSUITE_AT = \
+   tests/perf-testsuite.at \
+ 

[ovs-dev] [PATCH ovn v5 0/2] tests: Add check-perf target

2021-06-30 Thread Mark Gray
This is a proposal to add some micro-benchmarks to aid developers
in benchmarking optimizations to OVN. It starts by adding simple
metrics for northd but could be expanded in future patches.

Mark Gray (2):
  ovn-northd: Add useful stopwatches
  tests: Add check-perf target

 Documentation/topics/testing.rst |  50 
 lib/automake.mk  |   3 +-
 lib/stopwatch-names.h|  25 
 northd/ovn-northd-ddlog.c|  12 ++
 northd/ovn-northd.c  |  17 +++
 tests/.gitignore |   3 +
 tests/automake.mk|  27 
 tests/perf-northd.at | 207 +++
 tests/perf-testsuite.at  |  26 
 9 files changed, 369 insertions(+), 1 deletion(-)
 create mode 100644 lib/stopwatch-names.h
 create mode 100644 tests/perf-northd.at
 create mode 100644 tests/perf-testsuite.at

-- 
2.27.0


___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH ovn v5 1/2] ovn-northd: Add useful stopwatches

2021-06-30 Thread Mark Gray
For performance measurement, it is useful to understand the
length of time required to complete a number of key code paths
in ovn-northd.c. Add stopwatches to measure these timings.

Signed-off-by: Mark Gray 
Acked-by: Dumitru Ceara 
---

Notes:
v4:  Add common header file for stopwatch names
v5:  Forgot to `git add` a new file. Added this file.

 lib/automake.mk   |  3 ++-
 lib/stopwatch-names.h | 25 +
 northd/ovn-northd-ddlog.c | 12 
 northd/ovn-northd.c   | 17 +
 4 files changed, 56 insertions(+), 1 deletion(-)
 create mode 100644 lib/stopwatch-names.h

diff --git a/lib/automake.mk b/lib/automake.mk
index 917b28e1edf7..f668b791bb81 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -29,7 +29,8 @@ lib_libovn_la_SOURCES = \
lib/inc-proc-eng.c \
lib/inc-proc-eng.h \
lib/lb.c \
-   lib/lb.h
+   lib/lb.h \
+   lib/stopwatch-names.h
 nodist_lib_libovn_la_SOURCES = \
lib/ovn-dirs.c \
lib/ovn-nb-idl.c \
diff --git a/lib/stopwatch-names.h b/lib/stopwatch-names.h
new file mode 100644
index ..06b20272e8cf
--- /dev/null
+++ b/lib/stopwatch-names.h
@@ -0,0 +1,25 @@
+/* Copyright (c) 2021 Red Hat, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef STOPWATCH_NAMES_H
+#define STOPWATCH_NAMES_H 1
+
+/* In order to not duplicate names for stopwatches between ddlog and non-ddlog
+ * we define them in a common header file.
+ */
+#define NORTHD_LOOP_STOPWATCH_NAME "ovn-northd-loop"
+#define OVNNB_DB_RUN_STOPWATCH_NAME "ovnnb_db_run"
+#define OVNSB_DB_RUN_STOPWATCH_NAME "ovnsb_db_run"
+
+#endif
\ No newline at end of file
diff --git a/northd/ovn-northd-ddlog.c b/northd/ovn-northd-ddlog.c
index bc2c75f51bb0..a02949b2d1b7 100644
--- a/northd/ovn-northd-ddlog.c
+++ b/northd/ovn-northd-ddlog.c
@@ -38,6 +38,8 @@
 #include "ovsdb-parser.h"
 #include "ovsdb-types.h"
 #include "simap.h"
+#include "stopwatch.h"
+#include "lib/stopwatch-names.h"
 #include "stream-ssl.h"
 #include "stream.h"
 #include "unixctl.h"
@@ -1267,6 +1269,10 @@ main(int argc, char *argv[])
 
 daemonize_complete();
 
+stopwatch_create(NORTHD_LOOP_STOPWATCH_NAME, SW_MS);
+stopwatch_create(OVNNB_DB_RUN_STOPWATCH_NAME, SW_MS);
+stopwatch_create(OVNSB_DB_RUN_STOPWATCH_NAME, SW_MS);
+
 /* Main loop. */
 exiting = false;
 while (!exiting) {
@@ -1293,8 +1299,12 @@ main(int argc, char *argv[])
 status.locked = has_lock;
 status.pause = sb_ctx->paused;
 
+stopwatch_start(OVNNB_DB_RUN_STOPWATCH_NAME, time_msec());
 northd_run(nb_ctx);
+stopwatch_stop(OVNNB_DB_RUN_STOPWATCH_NAME, time_msec());
+stopwatch_start(OVNSB_DB_RUN_STOPWATCH_NAME, time_msec());
 northd_run(sb_ctx);
+stopwatch_stop(OVNSB_DB_RUN_STOPWATCH_NAME, time_msec());
 northd_update_probe_interval(nb_ctx, sb_ctx);
 if (ovsdb_cs_has_lock(sb_ctx->cs) &&
 sb_ctx->state == S_UPDATE &&
@@ -1305,6 +1315,8 @@ main(int argc, char *argv[])
 northd_send_deltas(sb_ctx);
 }
 
+stopwatch_stop(NORTHD_LOOP_STOPWATCH_NAME, time_msec());
+stopwatch_start(NORTHD_LOOP_STOPWATCH_NAME, time_msec());
 unixctl_server_run(unixctl);
 
 northd_wait(nb_ctx);
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index e96494ba3c7a..2149c8f60459 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -50,6 +50,8 @@
 #include "smap.h"
 #include "sset.h"
 #include "svec.h"
+#include "stopwatch.h"
+#include "lib/stopwatch-names.h"
 #include "stream.h"
 #include "stream-ssl.h"
 #include "timeval.h"
@@ -13261,6 +13263,9 @@ ovnnb_db_run(struct northd_context *ctx,
 if (!ctx->ovnsb_txn || !ctx->ovnnb_txn) {
 return;
 }
+
+stopwatch_start(OVNNB_DB_RUN_STOPWATCH_NAME, time_msec());
+
 struct hmap port_groups;
 struct hmap mcast_groups;
 struct hmap igmp_groups;
@@ -13404,6 +13409,8 @@ ovnnb_db_run(struct northd_context *ctx,
  * as well.
  */
 cleanup_macam();
+
+stopwatch_stop(OVNNB_DB_RUN_STOPWATCH_NAME, time_msec());
 }
 
 /* Stores the list of chassis which references an ha_chassis_group.
@@ -13996,6 +14003,8 @@ ovnsb_db_run(struct northd_context *ctx,
 return;
 }
 
+stopwatch_start(OVNSB_DB_RUN_STOPWATCH_NAME, time_msec());
+
 struct shash ha_ref_chassis_map = SHASH_INITIALIZER(&ha_ref_chassis_map);
 

Re: [ovs-dev] [v4 10/12] dpif-netdev/mfex: Add AVX512 based optimized miniflow extract

2021-06-30 Thread Eelco Chaudron
This patch was an interesting patch to review and being reminded about 
endianness, and this site, 
https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_epi8=4315,
 got me through it ;)

Some comments below...

//Eelco

On 17 Jun 2021, at 18:27, Kumar Amber wrote:

> From: Harry van Haaren 
>
> This commit adds AVX512 implementations of miniflow extract.
> By using the 64 bytes available in an AVX512 register, it is
> possible to convert a packet to a miniflow data-structure in
> a small quantity of instructions.
>
> The implementation here probes for Ether()/IP()/UDP() traffic,
> and builds the appropriate miniflow data-structure for packets
> that match the probe.
>
> The implementation here is auto-validated by the miniflow
> extract autovalidator, hence its correctness can be easily
> tested and verified.
>
> Note that this commit is designed to easily allow addition of new
> traffic profiles in a scalable way, without code duplication for
> each traffic profile.
>
> Signed-off-by: Harry van Haaren 
> ---
>  lib/automake.mk   |   1 +
>  lib/dpif-netdev-extract-avx512.c  | 416 ++
>  lib/dpif-netdev-private-extract.c |  15 ++
>  lib/dpif-netdev-private-extract.h |  19 ++
>  4 files changed, 451 insertions(+)
>  create mode 100644 lib/dpif-netdev-extract-avx512.c
>
> diff --git a/lib/automake.mk b/lib/automake.mk
> index 3080bb04a..2b95d6f92 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -39,6 +39,7 @@ lib_libopenvswitchavx512_la_CFLAGS = \
>   $(AM_CFLAGS)
>  lib_libopenvswitchavx512_la_SOURCES = \
>   lib/dpif-netdev-lookup-avx512-gather.c \
> + lib/dpif-netdev-extract-avx512.c \
>   lib/dpif-netdev-avx512.c
>  lib_libopenvswitchavx512_la_LDFLAGS = \
>   -static
> diff --git a/lib/dpif-netdev-extract-avx512.c 
> b/lib/dpif-netdev-extract-avx512.c
> new file mode 100644
> index 0..1145ac8a9
> --- /dev/null
> +++ b/lib/dpif-netdev-extract-avx512.c
> @@ -0,0 +1,416 @@
> +/*
> + * Copyright (c) 2021 Intel.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + * http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifdef __x86_64__
> +/* Sparse cannot handle the AVX512 instructions. */
> +#if !defined(__CHECKER__)
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "flow.h"
> +#include "dpdk.h"
> +
> +#include "dpif-netdev-private-dpcls.h"
> +#include "dpif-netdev-private-extract.h"
> +#include "dpif-netdev-private-flow.h"
> +
> +/* AVX512-BW level permutex2var_epi8 emulation. */
> +static inline __m512i
> +__attribute__((target("avx512bw")))

Are these targets universal enough for all supported compilers? If not, we
might need to move them to individual macros in compile.h.

> +_mm512_maskz_permutex2var_epi8_skx(__mmask64 k_mask,
> +   __m512i v_data_0,
> +   __m512i v_shuf_idxs,
> +   __m512i v_data_1)
> +{
> +/* Manipulate shuffle indexes for u16 size. */
> +__mmask64 k_mask_odd_lanes = 0x;
> +/* clear away ODD lane bytes. Cannot be done above due to no u8 shift */
> +__m512i v_shuf_idx_evn = _mm512_mask_blend_epi8(k_mask_odd_lanes,
> +v_shuf_idxs, _mm512_setzero_si512());
> +v_shuf_idx_evn = _mm512_srli_epi16(v_shuf_idx_evn, 1);
> +
> +__m512i v_shuf_idx_odd = _mm512_srli_epi16(v_shuf_idxs, 9);
> +
> +/* Shuffle each half at 16-bit width */
> +__m512i v_shuf1 = _mm512_permutex2var_epi16(v_data_0, v_shuf_idx_evn,
> +v_data_1);
> +__m512i v_shuf2 = _mm512_permutex2var_epi16(v_data_0, v_shuf_idx_odd,
> +v_data_1);
> +
> +/* Find if the shuffle index was odd, via mask and compare */
> +uint16_t index_odd_mask = 0x1;
> +const __m512i v_index_mask_u16 = _mm512_set1_epi16(index_odd_mask);
> +
> +/* EVEN lanes, find if u8 index was odd,  result as u16 bitmask */
> +__m512i v_idx_even_masked = _mm512_and_si512(v_shuf_idxs,
> + v_index_mask_u16);
> +__mmask32 evn_rotate_mask = _mm512_cmpeq_epi16_mask(v_idx_even_masked,
> +v_index_mask_u16);
> +
> +/* ODD lanes, find if u8 index was odd, result as u16 bitmask */
> +__m512i v_shuf_idx_srli8 = _mm512_srli_epi16(v_shuf_idxs, 8);
> +

Re: [ovs-dev] [PATCH v2 6/8] ovs-thread: Quiesce when joining pthreads

2021-06-30 Thread Gaëtan Rivet
On Wed, Jun 30, 2021, at 11:09, Ilya Maximets wrote:
> On 5/20/21 3:35 PM, Gaetan Rivet wrote:
> > Joining pthreads makes the caller quiescent. It should register as such,
> > as joined threads may wait on an RCU callback executing before quitting,
> > deadlocking the caller.
> 
> Hi, Gaetan.
> 
> This patch doesn't look right to me.  The problem is that users of this
> function have no idea that the quiescent state will be entered by this
> function.  And this is really hard to track down, because it can be called
> very deep inside some separate part of the code base that user at the
> top level might not even know about.  For example, a lot of call chains
> in ovsdb-server may lead to xpthread_join called from ovsdb/log.c.
> Even though ovsdb-server is single-threaded now, insertion of the
> ovsrcu_quiesce_start() into xpthread_join() will effectively mean that
> ovsdb-server will never be able to use RCU if it will become multi-threaded
> someday, e.g. it will not be able to use CMAPs.
> 
> So, instead of doing that, callers should enter quiescent state before
> joining, and this should be done at the highest level of a call chain
> possible.  We have the same thing with cond_wait() implementation, you
> may add similar comment to the join() function as we have for cond_wait().
> 
> Best regards, Ilya Maximets.
> 

Hi Ilya,

I think you are right, this is dangerous and puts the caller in a position
where they might break RCU without realizing.

Without quiescing, joining on a thread that has RCU callbacks scheduled
has 2 different behaviors: if the RCU is not blocking, this is fine and
joining will resolve; if it is blocking, then both threads enter a deadlock.

The solution would be to explicitly quiesce as you mentioned, but this is what
I tried to avoid with this series: requiring developers to 'fix' code that is
only an issue if this very specific compilation option is used.

I don't yet see a good solution to it.

Documenting alone is bound to let incorrect code through; that would be fine
most of the time but will break for people trying to use RCU blocking.
I could try to detect the deadlock (I don't see how yet), issue a warning and
make the joined thread non-blocking / wake it if already blocked.

Thanks for the review,
-- 
Gaetan Rivet
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH ovn v4 1/2] ovn-northd: Add useful stopwatches

2021-06-30 Thread 0-day Robot
Bleep bloop.  Greetings Mark Gray, I am a robot and I have tried out your patch.
Thanks for your contribution.

I encountered some error that I wasn't expecting.  See the details below.


build:
libtool: link: gcc -std=gnu99 -Wstrict-prototypes -Wall -Wextra 
-Wno-sign-compare -Wpointer-arith -Wformat -Wformat-security -Wswitch-enum 
-Wunused-parameter -Wbad-function-cast -Wcast-align -Wstrict-prototypes 
-Wold-style-definition -Wmissing-prototypes -Wmissing-field-initializers 
-fno-strict-aliasing -Wshadow -Werror -Werror -g -O2 -o 
controller/ovn-controller controller/bfd.o controller/binding.o 
controller/chassis.o controller/encaps.o controller/ha-chassis.o 
controller/if-status.o controller/ip-mcast.o controller/lflow.o 
controller/lflow-cache.o controller/lport.o controller/ofctrl.o 
controller/ofctrl-seqno.o controller/pinctrl.o controller/patch.o 
controller/ovn-controller.o controller/physical.o controller/mac-learn.o  
lib/.libs/libovn.a 
/var/lib/jenkins/jobs/0day_robot_upstream_build_ovn_from_pw/workspace/OVSDIR/lib/.libs/libopenvswitch.a
 -lssl -lcrypto -lcap-ng -lpthread -lrt -lm -lunbound
depbase=`echo controller-vtep/binding.o | sed 's|[^/]*$|.deps/&|;s|\.o$||'`;\
gcc -std=gnu99 -DHAVE_CONFIG_H -I.   -I ./include  -I ./include -I ./ovn -I 
./include -I ./lib -I ./lib -I 
/var/lib/jenkins/jobs/0day_robot_upstream_build_ovn_from_pw/workspace/OVSDIR/include
 -I 
/var/lib/jenkins/jobs/0day_robot_upstream_build_ovn_from_pw/workspace/OVSDIR/include
 -I 
/var/lib/jenkins/jobs/0day_robot_upstream_build_ovn_from_pw/workspace/OVSDIR/lib
 -I 
/var/lib/jenkins/jobs/0day_robot_upstream_build_ovn_from_pw/workspace/OVSDIR/lib
 -I 
/var/lib/jenkins/jobs/0day_robot_upstream_build_ovn_from_pw/workspace/OVSDIR -I 
/var/lib/jenkins/jobs/0day_robot_upstream_build_ovn_from_pw/workspace/OVSDIR
-Wstrict-prototypes -Wall -Wextra -Wno-sign-compare -Wpointer-arith -Wformat 
-Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-cast 
-Wcast-align -Wstrict-prototypes -Wold-style-definition -Wmissing-prototypes 
-Wmissing-field-initializers -fno-strict-aliasing -Wshadow -Werror -Werror  -g 
-O2 -MT controller-vtep/binding.o -MD -MP -MF $depbase.Tpo -c -o 
controller-vtep/bi
 nding.o controller-vtep/binding.c &&\
mv -f $depbase.Tpo $depbase.Po
depbase=`echo controller-vtep/gateway.o | sed 's|[^/]*$|.deps/&|;s|\.o$||'`;\
gcc -std=gnu99 -DHAVE_CONFIG_H -I.   -I ./include  -I ./include -I ./ovn -I 
./include -I ./lib -I ./lib -I 
/var/lib/jenkins/jobs/0day_robot_upstream_build_ovn_from_pw/workspace/OVSDIR/include
 -I 
/var/lib/jenkins/jobs/0day_robot_upstream_build_ovn_from_pw/workspace/OVSDIR/include
 -I 
/var/lib/jenkins/jobs/0day_robot_upstream_build_ovn_from_pw/workspace/OVSDIR/lib
 -I 
/var/lib/jenkins/jobs/0day_robot_upstream_build_ovn_from_pw/workspace/OVSDIR/lib
 -I 
/var/lib/jenkins/jobs/0day_robot_upstream_build_ovn_from_pw/workspace/OVSDIR -I 
/var/lib/jenkins/jobs/0day_robot_upstream_build_ovn_from_pw/workspace/OVSDIR
-Wstrict-prototypes -Wall -Wextra -Wno-sign-compare -Wpointer-arith -Wformat 
-Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-cast 
-Wcast-align -Wstrict-prototypes -Wold-style-definition -Wmissing-prototypes 
-Wmissing-field-initializers -fno-strict-aliasing -Wshadow -Werror -Werror  -g 
-O2 -MT controller-vtep/gateway.o -MD -MP -MF $depbase.Tpo -c -o 
controller-vtep/ga
 teway.o controller-vtep/gateway.c &&\
mv -f $depbase.Tpo $depbase.Po
depbase=`echo controller-vtep/ovn-controller-vtep.o | sed 
's|[^/]*$|.deps/&|;s|\.o$||'`;\
gcc -std=gnu99 -DHAVE_CONFIG_H -I.   -I ./include  -I ./include -I ./ovn -I 
./include -I ./lib -I ./lib -I 
/var/lib/jenkins/jobs/0day_robot_upstream_build_ovn_from_pw/workspace/OVSDIR/include
 -I 
/var/lib/jenkins/jobs/0day_robot_upstream_build_ovn_from_pw/workspace/OVSDIR/include
 -I 
/var/lib/jenkins/jobs/0day_robot_upstream_build_ovn_from_pw/workspace/OVSDIR/lib
 -I 
/var/lib/jenkins/jobs/0day_robot_upstream_build_ovn_from_pw/workspace/OVSDIR/lib
 -I 
/var/lib/jenkins/jobs/0day_robot_upstream_build_ovn_from_pw/workspace/OVSDIR -I 
/var/lib/jenkins/jobs/0day_robot_upstream_build_ovn_from_pw/workspace/OVSDIR
-Wstrict-prototypes -Wall -Wextra -Wno-sign-compare -Wpointer-arith -Wformat 
-Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-cast 
-Wcast-align -Wstrict-prototypes -Wold-style-definition -Wmissing-prototypes 
-Wmissing-field-initializers -fno-strict-aliasing -Wshadow -Werror -Werror  -g 
-O2 -MT controller-vtep/ovn-controller-vtep.o -MD -MP -MF $depbase.Tpo -c -o 
contro
 ller-vtep/ovn-controller-vtep.o controller-vtep/ovn-controller-vtep.c &&\
mv -f $depbase.Tpo $depbase.Po
depbase=`echo controller-vtep/vtep.o | sed 's|[^/]*$|.deps/&|;s|\.o$||'`;\
gcc -std=gnu99 -DHAVE_CONFIG_H -I.   -I ./include  -I ./include -I ./ovn -I 
./include -I ./lib -I ./lib -I 
/var/lib/jenkins/jobs/0day_robot_upstream_build_ovn_from_pw/workspace/OVSDIR/include
 -I 

[ovs-dev] [PATCH ovn v4 2/2] tests: Add check-perf target

2021-06-30 Thread Mark Gray
Add a suite of micro-benchmarks to aid a developer in understanding the
performance impact of any changes that they are making. They can be used to
help to understand the relative performance between two test runs on the same
test machine, but are not intended to give the absolute performance of OVN.

To invoke the performance testsuite, run:

$ make check-perf

This will run all available performance tests.

Additional metrics (e.g. memory, coverage, perf counters) may be added
in the future. Additional tests (e.g. additional topologies,  ovn-controller
tests) may be added in the future.

Signed-off-by: Mark Gray 
---

Notes:
v2:  create results directory to fix build error
v3:  forgot to commit, create results directory to fix build error
v4:  fix 0-day issues
 remove `sudo` in Makefile
 updated documentation

 Documentation/topics/testing.rst |  50 
 tests/.gitignore |   3 +
 tests/automake.mk|  27 
 tests/perf-northd.at | 207 +++
 tests/perf-testsuite.at  |  26 
 5 files changed, 313 insertions(+)
 create mode 100644 tests/perf-northd.at
 create mode 100644 tests/perf-testsuite.at

diff --git a/Documentation/topics/testing.rst b/Documentation/topics/testing.rst
index be9e7c57331c..db265344a507 100644
--- a/Documentation/topics/testing.rst
+++ b/Documentation/topics/testing.rst
@@ -256,3 +256,53 @@ the following::
 All the features documented under `Unit Tests`_ are available for the
 datapath testsuites, except that the datapath testsuites do not
 support running tests in parallel.
+
+Performance testing
+~~~~~~~~~~~~~~~~~~~
+
+OVN includes a suite of micro-benchmarks to aid a developer in understanding
+the performance impact of any changes that they are making. They can be used to
+help to understand the relative performance between two test runs on the same
+test machine, but are not intended to give the absolute performance of OVN.
+
+To invoke the performance testsuite, run::
+
+$ make check-perf
+
+This will run all available performance tests. Some of these tests may be
+long-running as they need to build complex logical network topologies. In order
+to speed up subsequent test runs, some objects (e.g. the Northbound DB) may be
+cached. In order to force the tests to rebuild all these objects, run::
+
+$ make check-perf TESTSUITEFLAGS="--rebuild"
+
+A typical workflow for a developer trying to improve the performance of OVN
+would be the following:
+
+0. Optional: Modify/add a performance test to build the topology that you are
+   benchmarking, if required.
+1. Run ``make check-perf TESTSUITEFLAGS="--rebuild"`` to generate cached
+   databases (and complete a test run). The results of each test run are
+   displayed on the screen at the end of the test run but are also saved in the
+   file ``tests/perf-testsuite.dir/results``.
+
+.. note::
+   This step may take some time depending on the number of tests that are being
+   rebuilt, the complexity of the tests and the performance of the test
+   machine. If you are only using one test, you can specify the test to run by
+   adding the test number to the ``make`` command.
+   (e.g. ``make check-perf TESTSUITEFLAGS="--rebuild <number>"``)
+
+2. Run ``make check-perf`` to measure the performance metric that you are
+   benchmarking against. If you are only using one test, you can specify the
+   test to run by adding the test number to the ``make`` command.
+   (e.g. ``make check-perf TESTSUITEFLAGS="--rebuild <number>"``)
+3. Modify OVN code to implement the change that you believe will improve the
+   performance.
+4. Go to Step 2. to continue making improvements.
+
+If, as a developer, you modify a performance test in a way that may change one
+of these cached objects, be sure to rebuild the test.
+
+The cached objects are stored under the relevant folder in
+``tests/perf-testsuite.dir/cached``.
diff --git a/tests/.gitignore b/tests/.gitignore
index 8479f9bb0f8f..65cb1c6e4fad 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -22,6 +22,9 @@
 /system-offloads-testsuite
 /system-offloads-testsuite.dir/
 /system-offloads-testsuite.log
+/perf-testsuite
+/perf-testsuite.dir/
+/perf-testsuite.log
 /test-aes128
 /test-atomic
 /test-bundle
diff --git a/tests/automake.mk b/tests/automake.mk
index a8ec64212791..5b890d644eeb 100644
--- a/tests/automake.mk
+++ b/tests/automake.mk
@@ -4,9 +4,11 @@ EXTRA_DIST += \
$(SYSTEM_TESTSUITE_AT) \
$(SYSTEM_KMOD_TESTSUITE_AT) \
$(SYSTEM_USERSPACE_TESTSUITE_AT) \
+   $(PERF_TESTSUITE_AT) \
$(TESTSUITE) \
$(SYSTEM_KMOD_TESTSUITE) \
$(SYSTEM_USERSPACE_TESTSUITE) \
+   $(PERF_TESTSUITE) \
tests/atlocal.in \
$(srcdir)/package.m4 \
$(srcdir)/tests/testsuite \
@@ -53,6 +55,10 @@ SYSTEM_TESTSUITE_AT = \
tests/system-ovn.at \
tests/system-ovn-kmod.at
 
+PERF_TESTSUITE_AT = \
+   tests/perf-testsuite.at \
+ 

[ovs-dev] [PATCH ovn v4 1/2] ovn-northd: Add useful stopwatches

2021-06-30 Thread Mark Gray
For performance measurement, it is useful to understand the
length of time required to complete a number of key code paths
in ovn-northd.c. Add stopwatches to measure these timings.

Signed-off-by: Mark Gray 
Acked-by: Dumitru Ceara 
---

Notes:
v4:  Add common header file for stopwatch names

 lib/automake.mk   |  3 ++-
 northd/ovn-northd-ddlog.c | 12 
 northd/ovn-northd.c   | 17 +
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/lib/automake.mk b/lib/automake.mk
index 917b28e1edf7..59345cbf84e6 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -39,7 +39,8 @@ nodist_lib_libovn_la_SOURCES = \
lib/ovn-ic-nb-idl.c \
lib/ovn-ic-nb-idl.h \
lib/ovn-ic-sb-idl.c \
-   lib/ovn-ic-sb-idl.h
+   lib/ovn-ic-sb-idl.h \
+   lib/stopwatch-names.h
 
 CLEANFILES += $(nodist_lib_libovn_la_SOURCES)
 
diff --git a/northd/ovn-northd-ddlog.c b/northd/ovn-northd-ddlog.c
index bc2c75f51bb0..a02949b2d1b7 100644
--- a/northd/ovn-northd-ddlog.c
+++ b/northd/ovn-northd-ddlog.c
@@ -38,6 +38,8 @@
 #include "ovsdb-parser.h"
 #include "ovsdb-types.h"
 #include "simap.h"
+#include "stopwatch.h"
+#include "lib/stopwatch-names.h"
 #include "stream-ssl.h"
 #include "stream.h"
 #include "unixctl.h"
@@ -1267,6 +1269,10 @@ main(int argc, char *argv[])
 
 daemonize_complete();
 
+stopwatch_create(NORTHD_LOOP_STOPWATCH_NAME, SW_MS);
+stopwatch_create(OVNNB_DB_RUN_STOPWATCH_NAME, SW_MS);
+stopwatch_create(OVNSB_DB_RUN_STOPWATCH_NAME, SW_MS);
+
 /* Main loop. */
 exiting = false;
 while (!exiting) {
@@ -1293,8 +1299,12 @@ main(int argc, char *argv[])
 status.locked = has_lock;
 status.pause = sb_ctx->paused;
 
+stopwatch_start(OVNNB_DB_RUN_STOPWATCH_NAME, time_msec());
 northd_run(nb_ctx);
+stopwatch_stop(OVNNB_DB_RUN_STOPWATCH_NAME, time_msec());
+stopwatch_start(OVNSB_DB_RUN_STOPWATCH_NAME, time_msec());
 northd_run(sb_ctx);
+stopwatch_stop(OVNSB_DB_RUN_STOPWATCH_NAME, time_msec());
 northd_update_probe_interval(nb_ctx, sb_ctx);
 if (ovsdb_cs_has_lock(sb_ctx->cs) &&
 sb_ctx->state == S_UPDATE &&
@@ -1305,6 +1315,8 @@ main(int argc, char *argv[])
 northd_send_deltas(sb_ctx);
 }
 
+stopwatch_stop(NORTHD_LOOP_STOPWATCH_NAME, time_msec());
+stopwatch_start(NORTHD_LOOP_STOPWATCH_NAME, time_msec());
 unixctl_server_run(unixctl);
 
 northd_wait(nb_ctx);
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index e96494ba3c7a..2149c8f60459 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -50,6 +50,8 @@
 #include "smap.h"
 #include "sset.h"
 #include "svec.h"
+#include "stopwatch.h"
+#include "lib/stopwatch-names.h"
 #include "stream.h"
 #include "stream-ssl.h"
 #include "timeval.h"
@@ -13261,6 +13263,9 @@ ovnnb_db_run(struct northd_context *ctx,
 if (!ctx->ovnsb_txn || !ctx->ovnnb_txn) {
 return;
 }
+
+stopwatch_start(OVNNB_DB_RUN_STOPWATCH_NAME, time_msec());
+
 struct hmap port_groups;
 struct hmap mcast_groups;
 struct hmap igmp_groups;
@@ -13404,6 +13409,8 @@ ovnnb_db_run(struct northd_context *ctx,
  * as well.
  */
 cleanup_macam();
+
+stopwatch_stop(OVNNB_DB_RUN_STOPWATCH_NAME, time_msec());
 }
 
 /* Stores the list of chassis which references an ha_chassis_group.
@@ -13996,6 +14003,8 @@ ovnsb_db_run(struct northd_context *ctx,
 return;
 }
 
+stopwatch_start(OVNSB_DB_RUN_STOPWATCH_NAME, time_msec());
+
 struct shash ha_ref_chassis_map = SHASH_INITIALIZER(_ref_chassis_map);
 handle_port_binding_changes(ctx, ports, _ref_chassis_map);
 update_northbound_cfg(ctx, sb_loop, loop_start_time);
@@ -14003,6 +14012,8 @@ ovnsb_db_run(struct northd_context *ctx,
 update_sb_ha_group_ref_chassis(_ref_chassis_map);
 }
 shash_destroy(_ref_chassis_map);
+
+stopwatch_stop(OVNSB_DB_RUN_STOPWATCH_NAME, time_msec());
 }
 
 static void
@@ -14457,6 +14468,10 @@ main(int argc, char *argv[])
 char *ovn_internal_version = ovn_get_internal_version();
 VLOG_INFO("OVN internal version is : [%s]", ovn_internal_version);
 
+stopwatch_create(NORTHD_LOOP_STOPWATCH_NAME, SW_MS);
+stopwatch_create(OVNNB_DB_RUN_STOPWATCH_NAME, SW_MS);
+stopwatch_create(OVNSB_DB_RUN_STOPWATCH_NAME, SW_MS);
+
 /* Main loop. */
 exiting = false;
 
@@ -14540,6 +14555,8 @@ main(int argc, char *argv[])
 ovsdb_idl_wait(ovnsb_idl_loop.idl);
 }
 
+stopwatch_stop(NORTHD_LOOP_STOPWATCH_NAME, time_msec());
+stopwatch_start(NORTHD_LOOP_STOPWATCH_NAME, time_msec());
 unixctl_server_run(unixctl);
 unixctl_server_wait(unixctl);
 memory_wait();
-- 
2.27.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH ovn v4 0/2] tests: Add check-perf target

2021-06-30 Thread Mark Gray
This is a proposal to add some micro-benchmarks to aid developers
in benchmarking optimizations to OVN. It starts by adding simple
metrics for northd but could be expanded in future patches.

Mark Gray (2):
  ovn-northd: Add useful stopwatches
  tests: Add check-perf target

 Documentation/topics/testing.rst |  50 
 lib/automake.mk  |   3 +-
 northd/ovn-northd-ddlog.c|  12 ++
 northd/ovn-northd.c  |  17 +++
 tests/.gitignore |   3 +
 tests/automake.mk|  27 
 tests/perf-northd.at | 207 +++
 tests/perf-testsuite.at  |  26 
 8 files changed, 344 insertions(+), 1 deletion(-)
 create mode 100644 tests/perf-northd.at
 create mode 100644 tests/perf-testsuite.at

-- 
2.27.0


___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH ovn v3 3/3] tests: Add check-perf target

2021-06-30 Thread Mark Gray
On 24/06/2021 16:34, Dumitru Ceara wrote:
> On 6/18/21 10:52 AM, Mark Gray wrote:
>> Add a suite of micro-benchmarks to aid a developer in understanding the
>> performance impact of any changes that they are making. They can be used to
>> help to understand the relative performance between two test runs on the same
>> test machine, but are not intended to give the absolute performance of OVN.
>>
>> To invoke the performance testsuite, run:
>>
>> $ make check-perf
>>
>> This will run all available performance tests.
>>
>> Additional metrics (e.g. memory, coverage, perf counters) may be added
>> in the future. Additional tests (e.g. additional topologies,  ovn-controller
>> tests) may be added in the future.
>>
>> Signed-off-by: Mark Gray 
>> ---
> 
> Thanks for this, I think this is a very good idea!

Thanks (and thanks for the review!). I think it is a good starting point,
but it will only be useful if it gets used! Personally, I can see myself
using this. However, if it doesn't get used, it is not very invasive and
could easily be left to rot or removed.
> 
>>
>> Notes:
>> v2:  create results directory to fix build error
>> v3:  forgot to commit, create results directory to fix build error
>>
>>  Documentation/topics/testing.rst |  49 
>>  tests/.gitignore |   3 +
>>  tests/automake.mk|  27 
>>  tests/perf-northd.at | 207 +++
>>  tests/perf-testsuite.at  |  26 
>>  5 files changed, 312 insertions(+)
>>  create mode 100644 tests/perf-northd.at
>>  create mode 100644 tests/perf-testsuite.at
>>
>> diff --git a/Documentation/topics/testing.rst 
>> b/Documentation/topics/testing.rst
>> index be9e7c57331c..ccd3278437b1 100644
>> --- a/Documentation/topics/testing.rst
>> +++ b/Documentation/topics/testing.rst
>> @@ -256,3 +256,52 @@ the following::
>>  All the features documented under `Unit Tests`_ are available for the
>>  datapath testsuites, except that the datapath testsuites do not
>>  support running tests in parallel.
>> +
>> +Performance testing
>> +~~~
>> +
>> +OVN includes a suite of micro-benchmarks to aid a developer in 
>> understanding the
>> +performance impact of any changes that they are making. They can be used to
>> +help to understand the relative performance between two test runs on the 
>> same
>> +test machine, but are not intended to give the absolute performance of OVN.
>> +
>> +To invoke the performance testsuite, run::
>> +
>> +$ make check-perf
>> +
>> +This will run all available performance tests. Some of these tests may be
>> +long-running as they need to build complex logical network topologies. In 
>> order
>> +to speed up subsequent test runs, some objects (e.g. the Northbound DB) may 
>> be
>> +cached. In order to force the tests to rebuild all these objects, run::
>> +
>> +$ make check-perf TESTSUITEFLAGS="--rebuild"
>> +
>> +A typical workflow for a developer trying to improve the performance of OVN
>> +would be the following:
>> +
>> +0. Optional: Modify/add a performance test to buld the topology that you are
>> +   benchmarking, if required.
>> +1. Run ``make check-perf TESTSUITEFLAGS="--rebuild"`` to generate cached
>> +   databases.
>> +
>> +.. note::
>> +   This step may take some time depending on the number of tests that are 
>> being
>> +   rebuilt, the complexity of the tests and the performance of the test 
>> machine.
>> +   If you are only using one test, you can specify the test to run by 
>> adding the
>> +   test number to the ``make`` command.
>> +   (e.g. ``make check-perf TESTSUITEFLAGS="--rebuild "``)
>> +
>> +2. Run ``make check-perf`` to measure the performance metric that you are
>> +   benchmarking against. If you are only using one test, you can specify 
>> the test to run by adding the test number to the ``make`` command.
>> +   (e.g. ``make check-perf TESTSUITEFLAGS="--rebuild "``)
> 
> It's not very clear where the user would find the test results:
> 
> tests/perf-testsuite.dir/results

It is mentioned further below but I will move it up to here to make it
clearer.
> 
>> +3. Modify OVN code to implement the change that you believe will improve the
>> +   performance.
>> +4. Go to Step 2. to continue making improvements.
>> +
>> +If, as a developer, you modify a performance test in a way that may change 
>> one
>> +of these cached objects, be sure to rebuild the test.
>> +
>> +The results of each test run are displayed on the screen at the end of the 
>> test
>> +run but are also saved in the file ``tests/perf-testsuite.dir/results``. The
>> +cached objects are stored under the relevant folder in
>> +``tests/perf-testsuite.dir/cached``.
>> diff --git a/tests/.gitignore b/tests/.gitignore
>> index 8479f9bb0f8f..65cb1c6e4fad 100644
>> --- a/tests/.gitignore
>> +++ b/tests/.gitignore
>> @@ -22,6 +22,9 @@
>>  /system-offloads-testsuite
>>  /system-offloads-testsuite.dir/
>>  /system-offloads-testsuite.log
>> +/perf-testsuite
>> 

Re: [ovs-dev] [PATCH ovn v3 2/3] ovn-northd: Add useful stopwatches

2021-06-30 Thread Mark Gray
On 24/06/2021 16:33, Dumitru Ceara wrote:
> On 6/18/21 10:52 AM, Mark Gray wrote:
>> For performance measurement, it is useful to understand the
>> length of time required to complete a number of key code paths
>> in ovn-northd.c. Add stopwatches to measure these timings.
>>
>> Signed-off-by: Mark Gray 
>> ---
> 
> Acked-by: Dumitru Ceara 
> 
> I only have one real nit on this patch (below).  Except for that here
> are some more random thoughts for potential follow ups.
> 
> I think we might benefit from an even more granular measurement, e.g., in
> some of the tests I was doing a while ago build_datapaths() was also
> taking up a significant amount of time.

Yes, I think so too. However, I don't want to start cluttering the code
with stopwatches put in arbitrary locations. I would rather it was
driven by actual need. I don't feel particularly qualified to make that
decision and I figured this patch is more about establishing the precedent.

> 
> Some more interesting ones to measure are in
> build_lswitch_and_lrouter_flows(), the loops that call
> build_lswitch_*_by_od/op().  I don't know, however, how we could deal with
> the parallel case.

I could add these as an additional patch if you are convinced of their
utility?

> 
> Thanks,
> Dumitru
> 
>>  northd/ovn-northd-ddlog.c | 15 +++
>>  northd/ovn-northd.c   | 20 
>>  2 files changed, 35 insertions(+)
>>
>> diff --git a/northd/ovn-northd-ddlog.c b/northd/ovn-northd-ddlog.c
>> index a4f2960bdcb8..7c552d516550 100644
>> --- a/northd/ovn-northd-ddlog.c
>> +++ b/northd/ovn-northd-ddlog.c
>> @@ -37,6 +37,7 @@
>>  #include "ovsdb-parser.h"
>>  #include "ovsdb-types.h"
>>  #include "simap.h"
>> +#include "stopwatch.h"
>>  #include "stream-ssl.h"
>>  #include "stream.h"
>>  #include "unixctl.h"
>> @@ -50,6 +51,10 @@ VLOG_DEFINE_THIS_MODULE(ovn_northd);
>>  #include "northd/ovn-northd-ddlog-nb.inc"
>>  #include "northd/ovn-northd-ddlog-sb.inc"
>>  
>> +#define NORTHD_LOOP_STOPWATCH_NAME "ovn-northd-loop"
>> +#define OVNNB_DB_RUN_STOPWATCH_NAME "ovnnb_db_run"
>> +#define OVNSB_DB_RUN_STOPWATCH_NAME "ovnsb_db_run"
> 
> A bit of a nit: would it make sense to not duplicate these in both
> northd versions and just add them to a (potentially new) common header file?
> 

Not a nit at all. Makes perfect sense to maintain consistency.

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v2 ovn 0/9] northd: rework ovn-northd lb flow installation

2021-06-30 Thread Lorenzo Bianconi
> On Fri, Jun 18, 2021 at 9:04 AM Lorenzo Bianconi
>  wrote:
> >
> > Rework lb flow logic in order to visit first each load_balancer and then
> > related datapath during lb flow installation.
> > This patch allows to reduce memory footprint and cpu utilization in
> > ovn-northd.
> >
> 
> Hi Lorenzo,
> 
> The ovsrobot CI runs show failures with memory leaks.  Please take a
> look at them - https://github.com/ovsrobot/ovn/runs/2859118644
> 
> Numan

ack, Numan, thx for reporting this. I will fix it in v3.

Regards,
Lorenzo

> 
> 
> northd: move snat_type out of vip loop
> 
> > Testing environment:
> > ovn-nbctl lr-list |wc -l
> > 308
> > ovn-nbctl ls-list |wc -l
> > 615
> > ovn-nbctl lb-list |wc -l
> > 14524
> >
> > Time needed for build_lrouter_lb_flows() to run for all datapaths/lbs 
> > (logical routers)
> > Total samples: 22
> > Maximum: 6937 msec
> > Minimum: 6869 msec
> > 95th percentile: 6933.00 msec
> > Short term average: 6916.599206 msec
> > Long term average: 6914.809656 msec
> >
> > Time needed for build_pre_lb()/build_stateful()[lb-only] to run for all 
> > datapaths/lbs (logical switches)
> >   Total samples: 20
> >   Maximum: 1735 msec
> >   Minimum: 1693 msec
> >   95th percentile: 1735.00 msec
> >   Short term average: 1731.136610 msec
> >   Long term average: 1698.853040 msec
> >
> > Time needed for build_lrouter_flows_for_lb() to run for all lbs/datapaths 
> > (logical routers)
> >Total samples: 22
> >Maximum: 2745 msec
> >Minimum: 2674 msec
> >95th percentile: 2742.00 msec
> >Short term average: 2724.775973 msec
> >Long term average: 2681.334522 msec
> >
> > Time needed for build_lswitch_flows_for_lb() to run for all lbs/datapaths 
> > (logical switches)
> >   Total samples: 20
> >   Maximum: 406 msec
> >   Minimum: 354 msec
> >   95th percentile: 406.00 msec
> >   Short term average: 383.915676 msec
> >   Long term average: 363.318006 msec
> >
> >
> > This series does not introduce any new feature to ovn-northd.
> >
> > Changes since v1:
> > - rebase ontop of ovn-master
> > - add build_lswitch_flows_for_lb routine
> >
> > Lorenzo Bianconi (9):
> >   northd: move snat_type out of vip loop
> >   lib: link logical routers assigned for the same lb
> >   northd: move build_empty_lb_event_flow in build_lrouter_flows_for_lb
> >   northd: move lb_{skip,force}_snat code in
> > build_lrouter_snat_flows_for_lb
> >   northd: get rid of add_router_lb_flow
> >   northd: remove dead code in build_lrouter_nat_defrag_and_lb
> >   lb: link logical switches assigned for the same lb
> >   northd: move build_empty_lb_event_flow in build_lswitch_flows_for_lb
> >   northd: move build_lb_rules in build_lswitch_flows_for_lb
> >
> >  lib/lb.c|  22 ++
> >  lib/lb.h|  12 +
> >  northd/ovn-northd.c | 606 +++-
> >  3 files changed, 403 insertions(+), 237 deletions(-)
> >
> > --
> > 2.31.1
> >
> > ___
> > dev mailing list
> > d...@openvswitch.org
> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> >
> 
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the best mfex function

2021-06-30 Thread Van Haaren, Harry
> -Original Message-
> From: Eelco Chaudron 
> Sent: Wednesday, June 30, 2021 10:52 AM
> To: Van Haaren, Harry 
> Cc: Amber, Kumar ; d...@openvswitch.org;
> i.maxim...@ovn.org
> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select 
> the best
> mfex function
> 
> 
> 
> On 30 Jun 2021, at 11:32, Van Haaren, Harry wrote:
> 
> >> -Original Message-
> >> From: Eelco Chaudron 
> >> Sent: Wednesday, June 30, 2021 10:18 AM
> >> To: Van Haaren, Harry 
> >> Cc: Amber, Kumar ; d...@openvswitch.org;
> >> i.maxim...@ovn.org
> >> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to 
> >> select the
> best
> >> mfex function
> >>
> >>
> >>
> >> On 29 Jun 2021, at 18:32, Van Haaren, Harry wrote:
> >>
>  -Original Message-
>  From: dev  On Behalf Of Eelco Chaudron



>  Maybe we should report the numbers/hits for the other methods, as they 
>  might
> >> be
>  equal, and some might be faster in execution time?
> >>>
> >>> As above, the implementations are sorted in performance order. Performance
> >>> here can be known by micro-benchmarks, and developers of such SIMD
> optimized
> >>> code can be expected to know which impl is fastest.
> >>
> >> Don’t think we can, as it’s not documented in the code, and someone can
> >> just add his own and have no clue about the existing ones.
> >
> > Yes, in theory somebody could add his own, and get this wrong. There are 
> > many
> many
> > things that could go wrong when making code changes. We cannot document
> everything.
> 
> I meant that the code currently does not document that the implementation 
> table,
> mfex_impls[], is in order of preference. So I think this should be added.

Sure we can document that the impl list is iterated & searched in order, hence
code-doc would help there. Will add this to the code.


> >>> In our current code, the avx512_vbmi_* impls are always before the 
> >>> avx512_*
> >>> impls, as the VBMI instruction set allows a faster runtime.
> >>
> >> Guess we need some documentation in the developer's section on how to add
> >> processor optimized functions, and how to benchmark them (and maybe some
> >> benchmark data for the current implementations).
> >> Also, someone can write a sloppy avx512_vbmi* function that might be slower
> than
> >> an avx512_*, right?
> >
> > What are we trying to achieve here? What is the root problem that is being
> addressed?
> >
> > Yes, somebody "could" write sloppy (complex, advanced, ISA specific, SIMD)
> avx512 code,
> > and have it be slower. Who is realistically going to do that?
> >
> > I'm fine with documenting a few things if they make sense to document, but
> > trying to "hand hold" at every level just doesn't work. Adding sections on 
> > how
> > to benchmark code, and how function pointers work and how to add them?
> > These things are documented in various places across the internet.
> >
> > If there's really an interest to learn AVX512 SIMD optimization, reach out 
> > to the
> > OVS community, put me on CC, and I'll be willing to help. Adding 
> > documentation
> > ad nauseam is not the solution, as each optimization is likely to have 
> > subtle
> differences.
>
> I think the problem is that, apart from you and perhaps a small group at
> Intel who know AVX512, for most of the OVS community this is moving back to
> handwritten assembler.

Nitpick but worth mentioning: optimizing with intrinsics is much easier, and 
much
less mental overhead than actual assembler (e.g. register allocation handled by 
compiler).
I agree lots of developers don't see this on a daily basis, but it's really not 
that "crazy".
Once over the 1st level of "reading intrinsics", scalar becomes looped scalar 
becomes vector:

uint64_t x = y & z;

for (int i = 0; i < 8; i++)
   x[i] = y[i] & z[i];

__m512i x = _mm512_and_si512(y, z);

Anyway, this is getting off topic, so I'll stop adding detail here.

> So at least some guidelines on what you should do when
> adding a custom function would help. Like order them in priority, maybe some
> simple example on how to benchmark the runtime of the mfex function. Don't 
> think
> this has to be part of this patch, but a follow-up would be nice.

Honestly I'm still not convinced. Just running the normal OVS benchmarks is 
enough.
If the cycle-counts/packet-rate reported by OVS are better, you're going 
faster. These
things are already documented:
https://docs.openvswitch.org/en/latest/topics/dpdk/pmd/

If you're a developer writing SIMD code, I think it's fair to assume some level 
of knowledge
on profiling. If not, the OVS documentation is IMO still _not_ the place to 
document how
to profile optimized code. There's nothing special about benchmarking these 
AVX512 MFEX
implementations compared to any other datapath (or otherwise) function.


> >>> 
> > 

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the best mfex function

2021-06-30 Thread Eelco Chaudron


On 30 Jun 2021, at 11:43, Van Haaren, Harry wrote:

>> -Original Message-
>> From: Flavio Leitner 
>> Sent: Tuesday, June 29, 2021 7:11 PM
>> To: Van Haaren, Harry 
>> Cc: Eelco Chaudron ; Amber, Kumar
>> ; d...@openvswitch.org; i.maxim...@ovn.org
>> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select 
>> the best
>> mfex function
>>
>> On Tue, Jun 29, 2021 at 04:32:05PM +, Van Haaren, Harry wrote:
 -Original Message-
 From: dev  On Behalf Of Eelco Chaudron
 Sent: Tuesday, June 29, 2021 1:38 PM
 To: Amber, Kumar 
 Cc: d...@openvswitch.org; i.maxim...@ovn.org
 Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to 
 select the
>> best
 mfex function

 More comments below. FYI I’m only reviewing right now, no testing.
>>>
>>> Sure, thanks for reviews.
>>>
 On 17 Jun 2021, at 18:27, Kumar Amber wrote:
>>>
>>> 
>>>
> +/* Allocate per thread PMD pointer space for study_stats. */
> +static inline struct study_stats *
> +get_study_stats(void)
> +{
> +struct study_stats *stats = study_stats_get();
> +if (OVS_UNLIKELY(!stats)) {
> +   stats = xzalloc(sizeof *stats);
> +   study_stats_set_unsafe(stats);
> +}
> +return stats;
> +}
> +

 Just got a mind-meld with the code, and realized that the function might be
>> different
 per PMD thread due to this auto mode (and autovalidator mode in the 
 previous
 patch).

 This only strengthens the case that we need a way to see the currently
 selected mode, and not per datapath, but per PMD per datapath!
>>>
>>> Study depends on the traffic pattern, so yes you're correct that it depends.
>>> The study command was added after community suggested user-experience
>>> would improve if the user doesn't have to provide an exact miniflow profile 
>>> name.
>>>
>>> Study studies the traffic running on that PMD, compares all MFEX impls, and 
>>> prints
>> out
>>> hits. It selects the _first_ implementation that surpasses the threshold of 
>>> packets.
>>>
>>> Users are free to use the more specific names of MFEX impls instead of 
>>> "study"
>>> for fine-grained control over the MFEX impl in use, e.g.
>>>
>>> ovs-appctl dpif-netdev/miniflow-parser-set avx512_vbmi_ipv4_udp
>>>
 Do we also need a way to set this per PMD?
>>>
>>> I don't feel there is real value here, but we could investigate adding an
>>> optional parameter to the command indicating a PMD thread IDX to set?
>>> We have access to "pmd->core_id" in our set() function, so limiting changes
>>> to a specific PMD thread can be done ~ easily... but is it really required?
>>
>> I think the concern here (at least from my side) is that users can
>> set the algorithm globally or per DP, not per PMD. However, the
>> study can set different algorithms per PMD. For example, say that
>> 'study' indicates that alg#1 for PMD#1 and alg#2 for PMD#2 in the
>> lab. Now we want to move to production and make that selection
>> static, how can we do that?
>
> That's a good question. Today the command doesn't give us per-PMD thread
> control. Study can indeed result in different PMDs having different MFEX 
> funcs.
>
>
>> If we set study, how do we tell from the cmdline the algorithm
>> chosen for each PMD? Another example of the same situation: if
>> we always start with 'study' and suddenly there is a traffic
>> processing difference, how can one check what is different in
>> the settings? The logs don't tell which PMD was affected.
>
> Sure they do; the "pmd-cX" and "pmd-cY" below show what datapath thread 
> selects what function.
> Note that the first line is from the OVS command thread, which notes that 
> "study" was selected.
> The following two prints are from each datapath thread, noting the resulting 
> function chosen by study.
>
> 2021-06-30T09:05:41Z|00134|dpif_netdev|INFO|Miniflow implementation set to 
> study.
> 2021-06-30T09:05:41Z|1|dpif_mfex_extract_study(pmd-cX/id:X)|INFO|MFEX 
> study chose impl avx512_vbmi_ipv4_udp: (hits 128/128 pkts)
> 2021-06-30T09:05:41Z|1|dpif_mfex_extract_study(pmd-cY/id:Y)|INFO|MFEX 
> study chose impl avx512_vbmi_ipv4_udp: (hits 128/128 pkts)

And with the updated miniflow-parser-get we should be able to see it after the 
logs have wrapped.

>>> Perfect is the enemy of good... I'd prefer focus on getting existing code 
>>> changes
>> merged,
>>> and add additional (optional) parameters in future if deemed useful in real 
>>> world
>> testing?
>>
>> True. Perhaps we have different use cases in mind. How do you expect
>> users to use this feature? Do you think production users will always
>> start with 'study'?
>
> I was expecting OVS users to be aware of what L2-4 traffic they're
> running, and to per-instance configure that statically for all datapath
> threads, for example by running the command below:
>
> $ ovs-appctl  dpif-netdev/miniflow-parser-set avx512_ipv4_udp
>
> 

Re: [ovs-dev] [PATCH 0/3] dpif-netlink: Introduce per-cpu upcall dispatching

2021-06-30 Thread Mark Gray
On 30/06/2021 10:56, Mark Gray wrote:
> This series proposes a new method of distributing upcalls
> to user space threads attempting to resolve a number of
> issues with the current method.
> 
> Mark Gray (3):
>   ofproto: change type of n_handlers and n_revalidators
>   dpif-netlink: fix report_loss() message
>   dpif-netlink: Introduce per-cpu upcall dispatch
> 
>  .../linux/compat/include/linux/openvswitch.h  |   7 +
>  lib/dpif-netdev.c |   1 +
>  lib/dpif-netlink.c| 456 --
>  lib/dpif-provider.h   |  32 +-
>  lib/dpif.c|  17 +
>  lib/dpif.h|   1 +
>  ofproto/ofproto-dpif-upcall.c |  71 ++-
>  ofproto/ofproto-dpif-upcall.h |   5 +-
>  ofproto/ofproto-provider.h|   2 +-
>  ofproto/ofproto.c |  14 +-
>  vswitchd/vswitch.xml  |  23 +-
>  11 files changed, 519 insertions(+), 110 deletions(-)
> 

The kernel space code can be found at:

https://mail.openvswitch.org/pipermail/ovs-dev/2021-June/384670.html

or

https://marc.info/?l=linux-netdev&m=162504684016825&w=2

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 3/3] dpif-netlink: Introduce per-cpu upcall dispatch

2021-06-30 Thread Mark Gray
The Open vSwitch kernel module uses the upcall mechanism to send
packets from kernel space to user space when it misses in the kernel
space flow table. The upcall sends packets via a Netlink socket.
Currently, a Netlink socket is created for every vport. In this way,
there is a 1:1 mapping between a vport and a Netlink socket.
When a packet is received by a vport, if it needs to be sent to
user space, it is sent via the corresponding Netlink socket.

This mechanism, with various iterations of the corresponding user
space code, has seen some limitations and issues:

* On systems with a large number of vports, there is correspondingly
a large number of Netlink sockets which can limit scaling.
(https://bugzilla.redhat.com/show_bug.cgi?id=1526306)
* Packet reordering on upcalls.
(https://bugzilla.redhat.com/show_bug.cgi?id=1844576)
* A thundering herd issue.
(https://bugzilla.redhat.com/show_bug.cgi?id=183)

This patch introduces an alternative, feature-negotiated, upcall
mode using a per-cpu dispatch rather than a per-vport dispatch.

In this mode, the Netlink socket to be used for the upcall is
selected based on the CPU of the thread that is executing the upcall.
In this way, it resolves the issues above as:

a) The number of Netlink sockets scales with the number of CPUs
rather than the number of vports.
b) Ordering per-flow is maintained as packets are distributed to
CPUs based on mechanisms such as RSS and flows are distributed
to a single user space thread.
c) Packets from a flow can only wake up one user space thread.

Reported-at: https://bugzilla.redhat.com/1844576
Signed-off-by: Mark Gray 
---

Notes:
v1 - Reworked based on Flavio's comments:
 * change DISPATCH_MODE_PER_CPU() to inline function
 * add `ovs-appctl` command to check dispatch mode for datapaths
 * fixed issue with userspace actions (tested using `ovs-ofctl monitor 
br0 65534 -P nxt_packet_in`)
 * update documentation as requested

 .../linux/compat/include/linux/openvswitch.h  |   7 +
 lib/dpif-netdev.c |   1 +
 lib/dpif-netlink.c| 456 --
 lib/dpif-provider.h   |  32 +-
 lib/dpif.c|  17 +
 lib/dpif.h|   1 +
 ofproto/ofproto-dpif-upcall.c |  51 +-
 ofproto/ofproto.c |  12 -
 vswitchd/vswitch.xml  |  23 +-
 9 files changed, 504 insertions(+), 96 deletions(-)

diff --git a/datapath/linux/compat/include/linux/openvswitch.h 
b/datapath/linux/compat/include/linux/openvswitch.h
index 875de20250ce..f29265df055e 100644
--- a/datapath/linux/compat/include/linux/openvswitch.h
+++ b/datapath/linux/compat/include/linux/openvswitch.h
@@ -89,6 +89,8 @@ enum ovs_datapath_cmd {
  * set on the datapath port (for OVS_ACTION_ATTR_MISS).  Only valid on
  * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should
  * not be sent.
+ * OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when
+ * OVS_DP_F_DISPATCH_UPCALL_PER_CPU feature is set.
  * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the
  * datapath.  Always present in notifications.
  * @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for the
@@ -105,6 +107,8 @@ enum ovs_datapath_attr {
OVS_DP_ATTR_MEGAFLOW_STATS, /* struct ovs_dp_megaflow_stats */
OVS_DP_ATTR_USER_FEATURES,  /* OVS_DP_F_*  */
OVS_DP_ATTR_PAD,
+   OVS_DP_ATTR_PAD2,
+   OVS_DP_ATTR_PER_CPU_PIDS,   /* Netlink PIDS to receive upcalls */
__OVS_DP_ATTR_MAX
 };
 
@@ -146,6 +150,9 @@ struct ovs_vport_stats {
 /* Allow tc offload recirc sharing */
 #define OVS_DP_F_TC_RECIRC_SHARING  (1 << 2)
 
+/* Allow per-cpu dispatch of upcalls */
+#define OVS_DP_F_DISPATCH_UPCALL_PER_CPU (1 << 3)
+
 /* Fixed logical ports. */
 #define OVSP_LOCAL  ((__u32)0)
 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index c5ab35d2a5a5..b2c2baadf4f3 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -8562,6 +8562,7 @@ const struct dpif_class dpif_netdev_class = {
 dpif_netdev_operate,
 NULL,   /* recv_set */
 NULL,   /* handlers_set */
+NULL,   /* number_handlers_required */
 dpif_netdev_set_config,
 dpif_netdev_queue_to_priority,
 NULL,   /* recv */
diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
index f92905dd83fd..2399879aea3e 100644
--- a/lib/dpif-netlink.c
+++ b/lib/dpif-netlink.c
@@ -98,6 +98,8 @@ struct dpif_netlink_dp {
 const struct ovs_dp_stats *stats;  /* OVS_DP_ATTR_STATS. */
 const struct ovs_dp_megaflow_stats *megaflow_stats;
/* OVS_DP_ATTR_MEGAFLOW_STATS.*/
+const uint32_t *upcall_pids;   /* OVS_DP_ATTR_PER_CPU_PIDS */
+uint32_t n_upcall_pids;
 };
 
 static void 

[ovs-dev] [PATCH 2/3] dpif-netlink: fix report_loss() message

2021-06-30 Thread Mark Gray
Fixes: 1579cf677fcb ("dpif-linux: Implement the API functions to allow multiple 
handler threads read upcall.")
Signed-off-by: Mark Gray 
---

Notes:
v1 - Reworked based on Flavio's comments:
 * Added "Fixes" tag

 lib/dpif-netlink.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
index 73d5608a81a2..f92905dd83fd 100644
--- a/lib/dpif-netlink.c
+++ b/lib/dpif-netlink.c
@@ -4666,7 +4666,7 @@ report_loss(struct dpif_netlink *dpif, struct 
dpif_channel *ch, uint32_t ch_idx,
   time_msec() - ch->last_poll);
 }
 
-VLOG_WARN("%s: lost packet on port channel %u of handler %u",
-  dpif_name(>dpif), ch_idx, handler_id);
+VLOG_WARN("%s: lost packet on port channel %u of handler %u%s",
+  dpif_name(>dpif), ch_idx, handler_id, ds_cstr());
 ds_destroy();
 }
-- 
2.27.0

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 0/3] dpif-netlink: Introduce per-cpu upcall dispatching

2021-06-30 Thread Mark Gray
This series proposes a new method of distributing upcalls
to user space threads attempting to resolve a number of
issues with the current method.

Mark Gray (3):
  ofproto: change type of n_handlers and n_revalidators
  dpif-netlink: fix report_loss() message
  dpif-netlink: Introduce per-cpu upcall dispatch

 .../linux/compat/include/linux/openvswitch.h  |   7 +
 lib/dpif-netdev.c |   1 +
 lib/dpif-netlink.c| 456 --
 lib/dpif-provider.h   |  32 +-
 lib/dpif.c|  17 +
 lib/dpif.h|   1 +
 ofproto/ofproto-dpif-upcall.c |  71 ++-
 ofproto/ofproto-dpif-upcall.h |   5 +-
 ofproto/ofproto-provider.h|   2 +-
 ofproto/ofproto.c |  14 +-
 vswitchd/vswitch.xml  |  23 +-
 11 files changed, 519 insertions(+), 110 deletions(-)

-- 
2.27.0


___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH 1/3] ofproto: change type of n_handlers and n_revalidators

2021-06-30 Thread Mark Gray
'n_handlers' and 'n_revalidators' are declared as type 'size_t'.
However, dpif_handlers_set() requires parameter 'n_handlers' as
type 'uint32_t'. This patch fixes this type mismatch.

Signed-off-by: Mark Gray 
---

Notes:
v1 - Reworked based on Flavio's comments:
 * fixed inconsistency with change of size_t -> uint32_t

 ofproto/ofproto-dpif-upcall.c | 20 ++--
 ofproto/ofproto-dpif-upcall.h |  5 +++--
 ofproto/ofproto-provider.h|  2 +-
 ofproto/ofproto.c |  2 +-
 4 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
index ccf97266c0b9..d22f7f07361f 100644
--- a/ofproto/ofproto-dpif-upcall.c
+++ b/ofproto/ofproto-dpif-upcall.c
@@ -129,10 +129,10 @@ struct udpif {
 struct dpif_backer *backer;/* Opaque dpif_backer pointer. */
 
 struct handler *handlers;  /* Upcall handlers. */
-size_t n_handlers;
+uint32_t n_handlers;
 
 struct revalidator *revalidators;  /* Flow revalidators. */
-size_t n_revalidators;
+uint32_t n_revalidators;
 
 struct latch exit_latch;   /* Tells child threads to exit. */
 
@@ -335,8 +335,8 @@ static int process_upcall(struct udpif *, struct upcall *,
   struct ofpbuf *odp_actions, struct flow_wildcards *);
 static void handle_upcalls(struct udpif *, struct upcall *, size_t n_upcalls);
 static void udpif_stop_threads(struct udpif *, bool delete_flows);
-static void udpif_start_threads(struct udpif *, size_t n_handlers,
-size_t n_revalidators);
+static void udpif_start_threads(struct udpif *, uint32_t n_handlers,
+uint32_t n_revalidators);
 static void udpif_pause_revalidators(struct udpif *);
 static void udpif_resume_revalidators(struct udpif *);
 static void *udpif_upcall_handler(void *);
@@ -562,8 +562,8 @@ udpif_stop_threads(struct udpif *udpif, bool delete_flows)
 
 /* Starts the handler and revalidator threads. */
 static void
-udpif_start_threads(struct udpif *udpif, size_t n_handlers_,
-size_t n_revalidators_)
+udpif_start_threads(struct udpif *udpif, uint32_t n_handlers_,
+uint32_t n_revalidators_)
 {
 if (udpif && n_handlers_ && n_revalidators_) {
 /* Creating a thread can take a significant amount of time on some
@@ -632,8 +632,8 @@ udpif_resume_revalidators(struct udpif *udpif)
  * datapath handle must have packet reception enabled before starting
  * threads. */
 void
-udpif_set_threads(struct udpif *udpif, size_t n_handlers_,
-  size_t n_revalidators_)
+udpif_set_threads(struct udpif *udpif, uint32_t n_handlers_,
+  uint32_t n_revalidators_)
 {
 ovs_assert(udpif);
 ovs_assert(n_handlers_ && n_revalidators_);
@@ -691,8 +691,8 @@ udpif_get_memory_usage(struct udpif *udpif, struct simap 
*usage)
 void
 udpif_flush(struct udpif *udpif)
 {
-size_t n_handlers_ = udpif->n_handlers;
-size_t n_revalidators_ = udpif->n_revalidators;
+uint32_t n_handlers_ = udpif->n_handlers;
+uint32_t n_revalidators_ = udpif->n_revalidators;
 
 udpif_stop_threads(udpif, true);
 dpif_flow_flush(udpif->dpif);
diff --git a/ofproto/ofproto-dpif-upcall.h b/ofproto/ofproto-dpif-upcall.h
index 693107ae56c1..b4dfed32046e 100644
--- a/ofproto/ofproto-dpif-upcall.h
+++ b/ofproto/ofproto-dpif-upcall.h
@@ -16,6 +16,7 @@
 #define OFPROTO_DPIF_UPCALL_H
 
 #include 
+#include 
 
 struct dpif;
 struct dpif_backer;
@@ -31,8 +32,8 @@ struct simap;
 void udpif_init(void);
 struct udpif *udpif_create(struct dpif_backer *, struct dpif *);
 void udpif_run(struct udpif *udpif);
-void udpif_set_threads(struct udpif *, size_t n_handlers,
-   size_t n_revalidators);
+void udpif_set_threads(struct udpif *, uint32_t n_handlers,
+   uint32_t n_revalidators);
 void udpif_destroy(struct udpif *);
 void udpif_revalidate(struct udpif *);
 void udpif_get_memory_usage(struct udpif *, struct simap *usage);
diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h
index 9ad2b71d23eb..57c7d17cb28f 100644
--- a/ofproto/ofproto-provider.h
+++ b/ofproto/ofproto-provider.h
@@ -534,7 +534,7 @@ extern unsigned ofproto_min_revalidate_pps;
 
 /* Number of upcall handler and revalidator threads. Only affects the
  * ofproto-dpif implementation. */
-extern size_t n_handlers, n_revalidators;
+extern uint32_t n_handlers, n_revalidators;
 
 static inline struct rule *rule_from_cls_rule(const struct cls_rule *);
 
diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c
index 80ec2d9ac9c7..53002f082b52 100644
--- a/ofproto/ofproto.c
+++ b/ofproto/ofproto.c
@@ -309,7 +309,7 @@ unsigned ofproto_max_idle = OFPROTO_MAX_IDLE_DEFAULT;
 unsigned ofproto_max_revalidator = OFPROTO_MAX_REVALIDATOR_DEFAULT;
 unsigned ofproto_min_revalidate_pps = OFPROTO_MIN_REVALIDATE_PPS_DEFAULT;
 
-size_t n_handlers, n_revalidators;
+uint32_t n_handlers, 

Re: [ovs-dev] [v4 01/12] dpif-netdev: Add command line and function pointer for miniflow extract

2021-06-30 Thread Amber, Kumar
Hi Eelco ,

Replies inline.

> -Original Message-
> From: Eelco Chaudron 
> Sent: Wednesday, June 30, 2021 3:17 PM
> To: Amber, Kumar ; Van Haaren, Harry
> 
> Cc: d...@openvswitch.org; i.maxim...@ovn.org; Flavio Leitner
> 
> Subject: Re: [ovs-dev] [v4 01/12] dpif-netdev: Add command line and
> function pointer for miniflow extract
> 
> 
> 
> On 17 Jun 2021, at 18:27, Kumar Amber wrote:
> 
> > This patch introduces the mfex function pointers, which allow the user
> > to switch between the different miniflow extract implementations
> > provided by OVS, each optimized for a specific CPU ISA.
> >
> > The user can query for the miniflow extract variants
> > available for that CPU with the following command:
> >
> > $ovs-appctl dpif-netdev/miniflow-parser-get
> >
> > Similarly, a user can set the miniflow implementation with the following
> > command:
> >
> > $ ovs-appctl dpif-netdev/miniflow-parser-set name
> >
> > This allows for more performance and gives the user the flexibility to
> > choose the miniflow implementation according to their needs.
> >
> > Signed-off-by: Kumar Amber 
> > Co-authored-by: Harry van Haaren 
> > Signed-off-by: Harry van Haaren 
> > ---
> >  lib/automake.mk   |   2 +
> >  lib/dpif-netdev-avx512.c  |  32 ++--
> >  lib/dpif-netdev-private-extract.c |  86 
> > lib/dpif-netdev-private-extract.h |  94 ++
> >  lib/dpif-netdev-private-thread.h  |   4 +
> >  lib/dpif-netdev.c | 126 +-
> >  6 files changed, 337 insertions(+), 7 deletions(-)  create mode
> > 100644 lib/dpif-netdev-private-extract.c  create mode 100644
> > lib/dpif-netdev-private-extract.h
> >
> > diff --git a/lib/automake.mk b/lib/automake.mk index
> > 49f42c2a3..6657b9ae5 100644
> > --- a/lib/automake.mk
> > +++ b/lib/automake.mk
> > @@ -118,6 +118,8 @@ lib_libopenvswitch_la_SOURCES = \
> > lib/dpif-netdev-private-dpcls.h \
> > lib/dpif-netdev-private-dpif.c \
> > lib/dpif-netdev-private-dpif.h \
> > +   lib/dpif-netdev-private-extract.c \
> > +   lib/dpif-netdev-private-extract.h \
> > lib/dpif-netdev-private-flow.h \
> > lib/dpif-netdev-private-hwol.h \
> > lib/dpif-netdev-private-thread.h \
> > diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c index
> > f9b199637..bb99b23ff 100644
> > --- a/lib/dpif-netdev-avx512.c
> > +++ b/lib/dpif-netdev-avx512.c
> > @@ -148,6 +148,15 @@ dp_netdev_input_outer_avx512(struct
> dp_netdev_pmd_thread *pmd,
> >   * // do all processing (HWOL->MFEX->EMC->SMC)
> >   * }
> >   */
> > +
> > +/* Do a batch minfilow extract into keys. */
> > +uint32_t mf_mask = 0;
> > +if (pmd->miniflow_extract_opt) {
> > +mf_mask = pmd->miniflow_extract_opt(packets, keys,
> > +batch_size, in_port,
> > +(void *) pmd);
> > +}
> > +/* Perform first packet interation */
> >  uint32_t lookup_pkts_bitmask = (1ULL << batch_size) - 1;
> >  uint32_t iter = lookup_pkts_bitmask;
> >  while (iter) {
> > @@ -159,6 +168,12 @@ dp_netdev_input_outer_avx512(struct
> dp_netdev_pmd_thread *pmd,
> >  pkt_metadata_init(>md, in_port);
> >
> >  struct dp_netdev_flow *f = NULL;
> > +struct netdev_flow_key *key = [i];
> > +
> > +/* Check the minfiflow mask to see if the packet was correctly
> > +* classifed by vector mfex else do a scalar miniflow extract
> > +* for that packet. */
> > +uint32_t mfex_hit = (mf_mask & (1 << i));
> >
> >  /* Check for partial hardware offload mark. */
> >  uint32_t mark;
> > @@ -166,7 +181,13 @@ dp_netdev_input_outer_avx512(struct
> dp_netdev_pmd_thread *pmd,
> >  f = mark_to_flow_find(pmd, mark);
> >  if (f) {
> >  rules[i] = >cr;
> > -pkt_meta[i].tcp_flags = parse_tcp_flags(packet);
> > +/* If AVX512 MFEX already classified the packet, use it. */
> > +if (mfex_hit) {
> > +pkt_meta[i].tcp_flags = 
> > miniflow_get_tcp_flags(>mf);
> > +} else {
> > +pkt_meta[i].tcp_flags = parse_tcp_flags(packet);
> > +}
> > +
> >  pkt_meta[i].bytes = dp_packet_size(packet);
> >  phwol_hits++;
> >  hwol_emc_smc_hitmask |= (1 << i); @@ -174,11 +195,12
> > @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
> >  }
> >  }
> >
> > -/* Do miniflow extract into keys. */
> > -struct netdev_flow_key *key = [i];
> > -miniflow_extract(packet, >mf);
> > +if (!mfex_hit) {
> > +/* Do a scalar miniflow extract into keys */
> > +miniflow_extract(packet, >mf);
> > +}
> >
> > -/* Cache TCP and byte values for all packets. */
> > +/* Cache TCP and byte values for 

[ovs-dev] [PATCH net-next] openvswitch: Introduce per-cpu upcall dispatch

2021-06-30 Thread Mark Gray
The Open vSwitch kernel module uses the upcall mechanism to send
packets from kernel space to user space when it misses in the kernel
space flow table. The upcall sends packets via a Netlink socket.
Currently, a Netlink socket is created for every vport. In this way,
there is a 1:1 mapping between a vport and a Netlink socket.
When a packet is received by a vport, if it needs to be sent to
user space, it is sent via the corresponding Netlink socket.

This mechanism, with various iterations of the corresponding user
space code, has seen some limitations and issues:

* On systems with a large number of vports, there is a correspondingly
large number of Netlink sockets which can limit scaling.
(https://bugzilla.redhat.com/show_bug.cgi?id=1526306)
* Packet reordering on upcalls.
(https://bugzilla.redhat.com/show_bug.cgi?id=1844576)
* A thundering herd issue.
(https://bugzilla.redhat.com/show_bug.cgi?id=183)

This patch introduces an alternative, feature-negotiated, upcall
mode using a per-cpu dispatch rather than a per-vport dispatch.

In this mode, the Netlink socket to be used for the upcall is
selected based on the CPU of the thread that is executing the upcall.
In this way, it resolves the issues above as:

a) The number of Netlink sockets scales with the number of CPUs
rather than the number of vports.
b) Ordering per-flow is maintained as packets are distributed to
CPUs based on mechanisms such as RSS and flows are distributed
to a single user space thread.
c) Packets from a flow can only wake up one user space thread.

The corresponding user space code can be found at:
https://mail.openvswitch.org/pipermail/ovs-dev/2021-April/382618.html

Bugzilla: https://bugzilla.redhat.com/1844576
Signed-off-by: Mark Gray 
---

Notes:
v1 - Reworked based on Flavio's comments:
 * Fixed handling of userspace action case
 * Renamed 'struct dp_portids'
 * Fixed handling of return from kmalloc()
 * Removed check for dispatch type from ovs_dp_get_upcall_portid()
   - Reworked based on Dan's comments:
 * Fixed handling of return from kmalloc()
   - Reworked based on Pravin's comments:
 * Fixed handling of userspace action case
   - Added kfree() in destroy_dp_rcu() to cleanup netlink port ids

 include/uapi/linux/openvswitch.h |  8 
 net/openvswitch/actions.c|  6 ++-
 net/openvswitch/datapath.c   | 70 +++-
 net/openvswitch/datapath.h   | 20 +
 4 files changed, 101 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 8d16744edc31..6571b57b2268 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -70,6 +70,8 @@ enum ovs_datapath_cmd {
  * set on the datapath port (for OVS_ACTION_ATTR_MISS).  Only valid on
  * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should
  * not be sent.
+ * OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when
+ * OVS_DP_F_DISPATCH_UPCALL_PER_CPU feature is set.
  * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the
  * datapath.  Always present in notifications.
  * @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for the
@@ -87,6 +89,9 @@ enum ovs_datapath_attr {
OVS_DP_ATTR_USER_FEATURES,  /* OVS_DP_F_*  */
OVS_DP_ATTR_PAD,
OVS_DP_ATTR_MASKS_CACHE_SIZE,
+   OVS_DP_ATTR_PER_CPU_PIDS,   /* Netlink PIDS to receive upcalls in 
per-cpu
+* dispatch mode
+*/
__OVS_DP_ATTR_MAX
 };
 
@@ -127,6 +132,9 @@ struct ovs_vport_stats {
 /* Allow tc offload recirc sharing */
 #define OVS_DP_F_TC_RECIRC_SHARING (1 << 2)
 
+/* Allow per-cpu dispatch of upcalls */
+#define OVS_DP_F_DISPATCH_UPCALL_PER_CPU   (1 << 3)
+
 /* Fixed logical ports. */
 #define OVSP_LOCAL  ((__u32)0)
 
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index ef15d9eb4774..f79679746c62 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -924,7 +924,11 @@ static int output_userspace(struct datapath *dp, struct 
sk_buff *skb,
break;
 
case OVS_USERSPACE_ATTR_PID:
-   upcall.portid = nla_get_u32(a);
+   if (dp->user_features & 
OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
+   upcall.portid =
+  ovs_dp_get_upcall_portid(dp, 
smp_processor_id());
+   else
+   upcall.portid = nla_get_u32(a);
break;
 
case OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: {
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index bc164b35e67d..8d54fa323543 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -166,6 +166,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu)
 

Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the best mfex function

2021-06-30 Thread Eelco Chaudron


On 30 Jun 2021, at 11:32, Van Haaren, Harry wrote:

>> -Original Message-
>> From: Eelco Chaudron 
>> Sent: Wednesday, June 30, 2021 10:18 AM
>> To: Van Haaren, Harry 
>> Cc: Amber, Kumar ; d...@openvswitch.org;
>> i.maxim...@ovn.org
>> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select 
>> the best
>> mfex function
>>
>>
>>
>> On 29 Jun 2021, at 18:32, Van Haaren, Harry wrote:
>>
 -Original Message-
 From: dev  On Behalf Of Eelco Chaudron
 Sent: Tuesday, June 29, 2021 1:38 PM
 To: Amber, Kumar 
 Cc: d...@openvswitch.org; i.maxim...@ovn.org
 Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to 
 select the
>> best
 mfex function
>
> 
>
>>> Perfect is the enemy of good... I'd prefer focus on getting existing code 
>>> changes
>> merged,
>>> and add additional (optional) parameters in future if deemed useful in real 
>>> world
>> testing?
>>
>> See Flavio’s reply, as those were the same concerns I thought of.
>
> Yes - thanks for combining threads - I'm writing a detailed reply there as we 
> speak here :)
> I'll send that reply shortly.
>
> 
>
> +if (max_hits >= MFEX_MIN_HIT_COUNT_FOR_USE) {
> +/* Set the implementation to index with max_hits. */
> +pmd->miniflow_extract_opt =
> +miniflow_funcs[best_func_index].extract_func;
> +VLOG_INFO("MFEX study chose impl %s: (hits %d/%d pkts)\n",
> +  miniflow_funcs[best_func_index].name, max_hits,
> +  stats->pkt_count);

 We have no idea which PMD the mode is selected for; I guess we might need to
 add this?

 Maybe we should report the numbers/hits for the other methods, as they 
 might
>> be
 equal, and some might be faster in execution time?
>>>
>>> As above, the implementations are sorted in performance order. Performance
>>> here can be known by micro-benchmarks, and developers of such SIMD optimized
>>> code can be expected to know which impl is fastest.
>>
>> I don’t think we can, as it’s not documented in the code, and someone can
>> just add his own and have no clue about the existing ones.
>
> Yes, in theory somebody could add his own, and get this wrong. There are many 
> many
> things that could go wrong when making code changes. We cannot document 
> everything.

I meant that the code currently does not document that the implementation 
table, mfex_impls[], is in order of preference. So I think this should be added.

>>> In our current code, the avx512_vbmi_* impls are always before the avx512_*
>>> impls, as the VBMI instruction set allows a faster runtime.
>>
>> Guess we need some documentation in the developer's section on how to add
>> processor optimized functions, and how to benchmark them (and maybe some
>> benchmark data for the current implementations).
>> Also, someone can write a sloppy avx512_vbmi* function that might be slower 
>> than
>> an avx512_*, right?
>
> What are we trying to achieve here? What is the root problem that is being 
> addressed?
>
> Yes, somebody "could" write sloppy (complex, advanced, ISA specific, SIMD) 
> avx512 code,
> and have it be slower. Who is realistically going to do that?
>
> I'm fine with documenting a few things if they make sense to document, but
> trying to "hand hold" at every level just doesn't work. Adding sections on how
> to benchmark code, and how function pointers work and how to add them?
> These things are documented in various places across the internet.
>
> If there's really an interest to learn AVX512 SIMD optimization, reach out to 
> the
> OVS community, put me on CC, and I'll be willing to help. Adding documentation
> ad nauseam is not the solution, as each optimization is likely to have subtle 
> differences.
I think the problem is that, apart from you and a small group at Intel, few 
people know AVX512; for most of the OVS community this is like moving back to 
handwritten assembler. So at least some guidelines on what you should do when 
adding a custom function would help, like ordering them by priority, and maybe a 
simple example of how to benchmark the runtime of the mfex function. I don't 
think this has to be part of this patch, but a follow-up would be nice.
>
>
>>> 
> 

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [v4 01/12] dpif-netdev: Add command line and function pointer for miniflow extract

2021-06-30 Thread Eelco Chaudron



On 17 Jun 2021, at 18:27, Kumar Amber wrote:

> This patch introduces the mfex function pointers which allows
> the user to switch between different miniflow extract implementations
> which are provided by the OVS based on optimized ISA CPU.
>
> The user can query the miniflow extract variants available
> for that CPU with the following command:
>
> $ovs-appctl dpif-netdev/miniflow-parser-get
>
> Similarly, a user can set the miniflow implementation with the following
> command:
>
> $ ovs-appctl dpif-netdev/miniflow-parser-set name
>
> This allows for more performance and gives the user the flexibility to choose
> the miniflow implementation according to their needs.
>
> Signed-off-by: Kumar Amber 
> Co-authored-by: Harry van Haaren 
> Signed-off-by: Harry van Haaren 
> ---
>  lib/automake.mk   |   2 +
>  lib/dpif-netdev-avx512.c  |  32 ++--
>  lib/dpif-netdev-private-extract.c |  86 
>  lib/dpif-netdev-private-extract.h |  94 ++
>  lib/dpif-netdev-private-thread.h  |   4 +
>  lib/dpif-netdev.c | 126 +-
>  6 files changed, 337 insertions(+), 7 deletions(-)
>  create mode 100644 lib/dpif-netdev-private-extract.c
>  create mode 100644 lib/dpif-netdev-private-extract.h
>
> diff --git a/lib/automake.mk b/lib/automake.mk
> index 49f42c2a3..6657b9ae5 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -118,6 +118,8 @@ lib_libopenvswitch_la_SOURCES = \
>   lib/dpif-netdev-private-dpcls.h \
>   lib/dpif-netdev-private-dpif.c \
>   lib/dpif-netdev-private-dpif.h \
> + lib/dpif-netdev-private-extract.c \
> + lib/dpif-netdev-private-extract.h \
>   lib/dpif-netdev-private-flow.h \
>   lib/dpif-netdev-private-hwol.h \
>   lib/dpif-netdev-private-thread.h \
> diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
> index f9b199637..bb99b23ff 100644
> --- a/lib/dpif-netdev-avx512.c
> +++ b/lib/dpif-netdev-avx512.c
> @@ -148,6 +148,15 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
> *pmd,
>   * // do all processing (HWOL->MFEX->EMC->SMC)
>   * }
>   */
> +
> +/* Do a batch minfilow extract into keys. */
> +uint32_t mf_mask = 0;
> +if (pmd->miniflow_extract_opt) {
> +mf_mask = pmd->miniflow_extract_opt(packets, keys,
> +batch_size, in_port,
> +(void *) pmd);
> +}
> +/* Perform first packet interation */
>  uint32_t lookup_pkts_bitmask = (1ULL << batch_size) - 1;
>  uint32_t iter = lookup_pkts_bitmask;
>  while (iter) {
> @@ -159,6 +168,12 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
> *pmd,
>  pkt_metadata_init(>md, in_port);
>
>  struct dp_netdev_flow *f = NULL;
> +struct netdev_flow_key *key = [i];
> +
> +/* Check the minfiflow mask to see if the packet was correctly
> +* classifed by vector mfex else do a scalar miniflow extract
> +* for that packet. */
> +uint32_t mfex_hit = (mf_mask & (1 << i));
>
>  /* Check for partial hardware offload mark. */
>  uint32_t mark;
> @@ -166,7 +181,13 @@ dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread 
> *pmd,
>  f = mark_to_flow_find(pmd, mark);
>  if (f) {
>  rules[i] = >cr;
> -pkt_meta[i].tcp_flags = parse_tcp_flags(packet);
> +/* If AVX512 MFEX already classified the packet, use it. */
> +if (mfex_hit) {
> +pkt_meta[i].tcp_flags = miniflow_get_tcp_flags(>mf);
> +} else {
> +pkt_meta[i].tcp_flags = parse_tcp_flags(packet);
> +}
> +
>  pkt_meta[i].bytes = dp_packet_size(packet);
>  phwol_hits++;
>  hwol_emc_smc_hitmask |= (1 << i);
> @@ -174,11 +195,12 @@ dp_netdev_input_outer_avx512(struct 
> dp_netdev_pmd_thread *pmd,
>  }
>  }
>
> -/* Do miniflow extract into keys. */
> -struct netdev_flow_key *key = [i];
> -miniflow_extract(packet, >mf);
> +if (!mfex_hit) {
> +/* Do a scalar miniflow extract into keys */
> +miniflow_extract(packet, >mf);
> +}
>
> -/* Cache TCP and byte values for all packets. */
> +/* Cache TCP and byte values for all packets */
>  pkt_meta[i].bytes = dp_packet_size(packet);
>  pkt_meta[i].tcp_flags = miniflow_get_tcp_flags(>mf);
>
> diff --git a/lib/dpif-netdev-private-extract.c 
> b/lib/dpif-netdev-private-extract.c
> new file mode 100644
> index 0..fcc56ef26
> --- /dev/null
> +++ b/lib/dpif-netdev-private-extract.c
> @@ -0,0 +1,86 @@
> +/*
> + * Copyright (c) 2021 Intel.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * 

Re: [ovs-dev] [RFC net-next] openvswitch: Introduce per-cpu upcall dispatch

2021-06-30 Thread Mark Gray
On 19/05/2021 22:47, Pravin Shelar wrote:

Thanks for the review and sorry for the delay. I will be more responsive
to any further changes from this point on.

> On Fri, Apr 30, 2021 at 8:33 AM Mark Gray  wrote:
>>
>> The Open vSwitch kernel module uses the upcall mechanism to send
>> packets from kernel space to user space when it misses in the kernel
>> space flow table. The upcall sends packets via a Netlink socket.
>> Currently, a Netlink socket is created for every vport. In this way,
>> there is a 1:1 mapping between a vport and a Netlink socket.
>> When a packet is received by a vport, if it needs to be sent to
>> user space, it is sent via the corresponding Netlink socket.
>>
>> This mechanism, with various iterations of the corresponding user
>> space code, has seen some limitations and issues:
>>
>> * On systems with a large number of vports, there is a correspondingly
>> large number of Netlink sockets which can limit scaling.
>> (https://bugzilla.redhat.com/show_bug.cgi?id=1526306)
>> * Packet reordering on upcalls.
>> (https://bugzilla.redhat.com/show_bug.cgi?id=1844576)
>> * A thundering herd issue.
>> (https://bugzilla.redhat.com/show_bug.cgi?id=183)
>>
>> This patch introduces an alternative, feature-negotiated, upcall
>> mode using a per-cpu dispatch rather than a per-vport dispatch.
>>
>> In this mode, the Netlink socket to be used for the upcall is
>> selected based on the CPU of the thread that is executing the upcall.
>> In this way, it resolves the issues above as:
>>
>> a) The number of Netlink sockets scales with the number of CPUs
>> rather than the number of vports.
>> b) Ordering per-flow is maintained as packets are distributed to
>> CPUs based on mechanisms such as RSS and flows are distributed
>> to a single user space thread.
>> c) Packets from a flow can only wake up one user space thread.
>>
>> The corresponding user space code can be found at:
>> https://mail.openvswitch.org/pipermail/ovs-dev/2021-April/382618.html
>>
>> Bugzilla: https://bugzilla.redhat.com/1844576
>> Signed-off-by: Mark Gray 
>> ---
>>  include/uapi/linux/openvswitch.h |  8 
>>  net/openvswitch/datapath.c   | 70 +++-
>>  net/openvswitch/datapath.h   | 18 
>>  net/openvswitch/flow_netlink.c   |  4 --
>>  4 files changed, 94 insertions(+), 6 deletions(-)
>>
>> diff --git a/include/uapi/linux/openvswitch.h 
>> b/include/uapi/linux/openvswitch.h
>> index 8d16744edc31..6571b57b2268 100644
>> --- a/include/uapi/linux/openvswitch.h
>> +++ b/include/uapi/linux/openvswitch.h
>> @@ -70,6 +70,8 @@ enum ovs_datapath_cmd {
>>   * set on the datapath port (for OVS_ACTION_ATTR_MISS).  Only valid on
>>   * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should
>>   * not be sent.
>> + * OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when
>> + * OVS_DP_F_DISPATCH_UPCALL_PER_CPU feature is set.
>>   * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the
>>   * datapath.  Always present in notifications.
>>   * @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for 
>> the
>> @@ -87,6 +89,9 @@ enum ovs_datapath_attr {
>> OVS_DP_ATTR_USER_FEATURES,  /* OVS_DP_F_*  */
>> OVS_DP_ATTR_PAD,
>> OVS_DP_ATTR_MASKS_CACHE_SIZE,
>> +   OVS_DP_ATTR_PER_CPU_PIDS,   /* Netlink PIDS to receive upcalls in 
>> per-cpu
>> +* dispatch mode
>> +*/
>> __OVS_DP_ATTR_MAX
>>  };
>>
>> @@ -127,6 +132,9 @@ struct ovs_vport_stats {
>>  /* Allow tc offload recirc sharing */
>>  #define OVS_DP_F_TC_RECIRC_SHARING (1 << 2)
>>
>> +/* Allow per-cpu dispatch of upcalls */
>> +#define OVS_DP_F_DISPATCH_UPCALL_PER_CPU   (1 << 3)
>> +
>>  /* Fixed logical ports. */
>>  #define OVSP_LOCAL  ((__u32)0)
>>
>> diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
>> index 9d6ef6cb9b26..98d54f41fdaa 100644
>> --- a/net/openvswitch/datapath.c
>> +++ b/net/openvswitch/datapath.c
>> @@ -121,6 +121,8 @@ int lockdep_ovsl_is_held(void)
>>  #endif
>>
>>  static struct vport *new_vport(const struct vport_parms *);
>> +static u32 ovs_dp_get_upcall_portid(const struct datapath *, uint32_t);
>> +static int ovs_dp_set_upcall_portids(struct datapath *, const struct nlattr 
>> *);
>>  static int queue_gso_packets(struct datapath *dp, struct sk_buff *,
>>  const struct sw_flow_key *,
>>  const struct dp_upcall_info *,
>> @@ -238,7 +240,12 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct 
>> sw_flow_key *key)
>>
>> memset(, 0, sizeof(upcall));
>> upcall.cmd = OVS_PACKET_CMD_MISS;
>> -   upcall.portid = ovs_vport_find_upcall_portid(p, skb);
>> +
>> +   if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
>> +   upcall.portid = ovs_dp_get_upcall_portid(dp, 
>> smp_processor_id());
>> 

Re: [ovs-dev] [RFC 1/3] ofproto: change type of n_handlers and n_revalidators

2021-06-30 Thread Mark Gray
On 28/05/2021 20:50, Flavio Leitner wrote:
> On Fri, Apr 30, 2021 at 11:31:27AM -0400, Mark Gray wrote:
>> 'n_handlers' and 'n_revalidators' are declared as type 'size_t'.
>> However, dpif_handlers_set() requires parameter 'n_handlers' as
>> type 'uint32_t'. This patch fixes this type mismatch.
> The change looks good, but I didn't understand the criteria used
> to do the change. For example, at udpif_stop_threads() you changed
> from 'size_t' to 'uint32_t', but variable 'i' is not required
> to be of the same type (marked in line below). However, I could
> find other similar cases left unchanged.
> 
> fbl
> 

Yes. The changes that you highlighted seem a bit arbitrary. I removed
them and just left the updates to the various function signatures and
variable declarations.

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [RFC 2/3] dpif-netlink: fix report_loss() message

2021-06-30 Thread Mark Gray
On 28/05/2021 20:49, Flavio Leitner wrote:
> On Fri, Apr 30, 2021 at 11:31:28AM -0400, Mark Gray wrote:
>> Signed-off-by: Mark Gray 
> This looks like a bug fix for this commit:
> 1579cf677fcb dpif-linux: Implement the API functions to allow multiple ...
> 
> If you agree, please add the Fixes: tag.
> 
> fbl
> 

I agree. I will add the tag.

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [RFC net-next] openvswitch: Introduce per-cpu upcall dispatch

2021-06-30 Thread Mark Gray
On 28/05/2021 20:49, Flavio Leitner wrote:
> 
> Hi Mark,
> 
> I think this patch is going in the right direction but there
> are some points that I think we should address. See below.
> 
> On Fri, Apr 30, 2021 at 11:33:25AM -0400, Mark Gray wrote:
>> The Open vSwitch kernel module uses the upcall mechanism to send
>> packets from kernel space to user space when it misses in the kernel
>> space flow table. The upcall sends packets via a Netlink socket.
>> Currently, a Netlink socket is created for every vport. In this way,
>> there is a 1:1 mapping between a vport and a Netlink socket.
>> When a packet is received by a vport, if it needs to be sent to
>> user space, it is sent via the corresponding Netlink socket.
>>
>> This mechanism, with various iterations of the corresponding user
>> space code, has seen some limitations and issues:
>>
>> * On systems with a large number of vports, there is a correspondingly
>> large number of Netlink sockets which can limit scaling.
>> (https://bugzilla.redhat.com/show_bug.cgi?id=1526306)
>> * Packet reordering on upcalls.
>> (https://bugzilla.redhat.com/show_bug.cgi?id=1844576)
>> * A thundering herd issue.
>> (https://bugzilla.redhat.com/show_bug.cgi?id=183)
>>
>> This patch introduces an alternative, feature-negotiated, upcall
>> mode using a per-cpu dispatch rather than a per-vport dispatch.
>>
>> In this mode, the Netlink socket to be used for the upcall is
>> selected based on the CPU of the thread that is executing the upcall.
>> In this way, it resolves the issues above as:
>>
>> a) The number of Netlink sockets scales with the number of CPUs
>> rather than the number of vports.
>> b) Ordering per-flow is maintained as packets are distributed to
>> CPUs based on mechanisms such as RSS and flows are distributed
>> to a single user space thread.
>> c) Packets from a flow can only wake up one user space thread.
>>
>> The corresponding user space code can be found at:
>> https://mail.openvswitch.org/pipermail/ovs-dev/2021-April/382618.html
> 
> Thanks for writing a nice commit description.
> 
>>
>> Bugzilla: https://bugzilla.redhat.com/1844576
>> Signed-off-by: Mark Gray 
>> ---
>>  include/uapi/linux/openvswitch.h |  8 
>>  net/openvswitch/datapath.c   | 70 +++-
>>  net/openvswitch/datapath.h   | 18 
>>  net/openvswitch/flow_netlink.c   |  4 --
>>  4 files changed, 94 insertions(+), 6 deletions(-)
>>
>> diff --git a/include/uapi/linux/openvswitch.h 
>> b/include/uapi/linux/openvswitch.h
>> index 8d16744edc31..6571b57b2268 100644
>> --- a/include/uapi/linux/openvswitch.h
>> +++ b/include/uapi/linux/openvswitch.h
>> @@ -70,6 +70,8 @@ enum ovs_datapath_cmd {
>>   * set on the datapath port (for OVS_ACTION_ATTR_MISS).  Only valid on
>>   * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should
>>   * not be sent.
>> + * OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when
>> + * OVS_DP_F_DISPATCH_UPCALL_PER_CPU feature is set.
>>   * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the
>>   * datapath.  Always present in notifications.
>>   * @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for 
>> the
>> @@ -87,6 +89,9 @@ enum ovs_datapath_attr {
>>  OVS_DP_ATTR_USER_FEATURES,  /* OVS_DP_F_*  */
>>  OVS_DP_ATTR_PAD,
>>  OVS_DP_ATTR_MASKS_CACHE_SIZE,
>> +OVS_DP_ATTR_PER_CPU_PIDS,   /* Netlink PIDS to receive upcalls in 
>> per-cpu
>> + * dispatch mode
>> + */
>>  __OVS_DP_ATTR_MAX
>>  };
>>  
>> @@ -127,6 +132,9 @@ struct ovs_vport_stats {
>>  /* Allow tc offload recirc sharing */
>>  #define OVS_DP_F_TC_RECIRC_SHARING  (1 << 2)
>>  
>> +/* Allow per-cpu dispatch of upcalls */
>> +#define OVS_DP_F_DISPATCH_UPCALL_PER_CPU(1 << 3)
>> +
>>  /* Fixed logical ports. */
>>  #define OVSP_LOCAL  ((__u32)0)
>>  
>> diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
>> index 9d6ef6cb9b26..98d54f41fdaa 100644
>> --- a/net/openvswitch/datapath.c
>> +++ b/net/openvswitch/datapath.c
>> @@ -121,6 +121,8 @@ int lockdep_ovsl_is_held(void)
>>  #endif
>>  
>>  static struct vport *new_vport(const struct vport_parms *);
>> +static u32 ovs_dp_get_upcall_portid(const struct datapath *, uint32_t);
>> +static int ovs_dp_set_upcall_portids(struct datapath *, const struct nlattr 
>> *);
>>  static int queue_gso_packets(struct datapath *dp, struct sk_buff *,
>>   const struct sw_flow_key *,
>>   const struct dp_upcall_info *,
>> @@ -238,7 +240,12 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct 
>> sw_flow_key *key)
>>  
>>  memset(, 0, sizeof(upcall));
>>  upcall.cmd = OVS_PACKET_CMD_MISS;
>> -upcall.portid = ovs_vport_find_upcall_portid(p, skb);
>> +
>> +if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU)
>> +upcall.portid = 

Re: [ovs-dev] [RFC 3/3] dpif-netlink: Introduce per-cpu upcall dispatch

2021-06-30 Thread Mark Gray
On 08/06/2021 21:07, Flavio Leitner wrote:
> 
> Hi Mark,
> 
> This looks good to me.
> 
> Since the new scheme doesn't allow users to change the number
> of handlers, we must update ovs-vswitchd.conf.db(5) as well.

I updated this documentation

> 
> Some comments below.
> 
> On Fri, Apr 30, 2021 at 11:31:29AM -0400, Mark Gray wrote:
>> The Open vSwitch kernel module uses the upcall mechanism to send
>> packets from kernel space to user space when it misses in the kernel
>> space flow table. The upcall sends packets via a Netlink socket.
>> Currently, a Netlink socket is created for every vport. In this way,
>> there is a 1:1 mapping between a vport and a Netlink socket.
>> When a packet is received by a vport, if it needs to be sent to
>> user space, it is sent via the corresponding Netlink socket.
>>
>> This mechanism, with various iterations of the corresponding user
>> space code, has seen some limitations and issues:
>>
>> * On systems with a large number of vports, there is correspondingly
>> a large number of Netlink sockets which can limit scaling.
>> (https://bugzilla.redhat.com/show_bug.cgi?id=1526306)
>> * Packet reordering on upcalls.
>> (https://bugzilla.redhat.com/show_bug.cgi?id=1844576)
>> * A thundering herd issue.
>> (https://bugzilla.redhat.com/show_bug.cgi?id=183)
>>
>> This patch introduces an alternative, feature-negotiated, upcall
>> mode using a per-cpu dispatch rather than a per-vport dispatch.
>>
>> In this mode, the Netlink socket to be used for the upcall is
>> selected based on the CPU of the thread that is executing the upcall.
>> In this way, it resolves the issues above as:
>>
>> a) The number of Netlink sockets scales with the number of CPUs
>> rather than the number of vports.
>> b) Ordering per-flow is maintained as packets are distributed to
>> CPUs based on mechanisms such as RSS and flows are distributed
>> to a single user space thread.
>> c) Packets from a flow can only wake up one user space thread.
>>
>> Reported-at: https://bugzilla.redhat.com/1844576
>> Signed-off-by: Mark Gray 
>> ---
>>  .../linux/compat/include/linux/openvswitch.h  |   7 +
>>  lib/dpif-netdev.c |   1 +
>>  lib/dpif-netlink.c| 405 +++---
>>  lib/dpif-provider.h   |  10 +
>>  lib/dpif.c|  17 +
>>  lib/dpif.h|   1 +
>>  ofproto/ofproto-dpif-upcall.c |  51 ++-
>>  ofproto/ofproto.c |  12 -
>>  8 files changed, 430 insertions(+), 74 deletions(-)
>>
>> diff --git a/datapath/linux/compat/include/linux/openvswitch.h 
>> b/datapath/linux/compat/include/linux/openvswitch.h
>> index 875de20250ce..f29265df055e 100644
>> --- a/datapath/linux/compat/include/linux/openvswitch.h
>> +++ b/datapath/linux/compat/include/linux/openvswitch.h
>> @@ -89,6 +89,8 @@ enum ovs_datapath_cmd {
>>   * set on the datapath port (for OVS_ACTION_ATTR_MISS).  Only valid on
>>   * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should
>>   * not be sent.
>> + * OVS_DP_ATTR_PER_CPU_PIDS: Per-cpu array of PIDs for upcalls when
>> + * OVS_DP_F_DISPATCH_UPCALL_PER_CPU feature is set.
>>   * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the
>>   * datapath.  Always present in notifications.
>>   * @OVS_DP_ATTR_MEGAFLOW_STATS: Statistics about mega flow masks usage for 
>> the
>> @@ -105,6 +107,8 @@ enum ovs_datapath_attr {
>>  OVS_DP_ATTR_MEGAFLOW_STATS, /* struct ovs_dp_megaflow_stats */
>>  OVS_DP_ATTR_USER_FEATURES,  /* OVS_DP_F_*  */
>>  OVS_DP_ATTR_PAD,
>> +OVS_DP_ATTR_PAD2,
>> +OVS_DP_ATTR_PER_CPU_PIDS,   /* Netlink PIDS to receive upcalls */
>>  __OVS_DP_ATTR_MAX
>>  };
>>  
>> @@ -146,6 +150,9 @@ struct ovs_vport_stats {
>>  /* Allow tc offload recirc sharing */
>>  #define OVS_DP_F_TC_RECIRC_SHARING  (1 << 2)
>>  
>> +/* Allow per-cpu dispatch of upcalls */
>> +#define OVS_DP_F_DISPATCH_UPCALL_PER_CPU (1 << 3)
>> +
>>  /* Fixed logical ports. */
>>  #define OVSP_LOCAL  ((__u32)0)
>>  
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>> index 251788b04965..24e6911dd4ff 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -8488,6 +8488,7 @@ const struct dpif_class dpif_netdev_class = {
>>  dpif_netdev_operate,
>>  NULL,   /* recv_set */
>>  NULL,   /* handlers_set */
>> +NULL,   /* handlers_get */
> 
> That is number_handlers_required.
> 

oops. Thanks for catching this

>>  dpif_netdev_set_config,
>>  dpif_netdev_queue_to_priority,
>>  NULL,   /* recv */
>> diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
>> index 2ded5fdd01b3..349897e70632 100644
>> --- a/lib/dpif-netlink.c
>> +++ b/lib/dpif-netlink.c
>> @@ -80,6 +80,9 @@ enum { MAX_PORTS = USHRT_MAX };
>>  #define FLOW_DUMP_MAX_BATCH 50
>>  #define 

Re: [ovs-dev] [RFC 0/3] dpif-netlink: Introduce per-cpu upcall dispatching

2021-06-30 Thread Mark Gray
On 12/05/2021 03:11, Flavio Leitner wrote:
> The results show that this new patch-set addressed the main
> thundering herd issue and the scalability issue I reported
> during V10 review.
> 
> Unfortunately I can review the patches only next week.

Thanks again for testing and reviewing. Sorry for the delay. I will be
more responsive to any further changes from this point on.

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the best mfex function

2021-06-30 Thread Van Haaren, Harry
> -Original Message-
> From: Flavio Leitner 
> Sent: Tuesday, June 29, 2021 7:11 PM
> To: Van Haaren, Harry 
> Cc: Eelco Chaudron ; Amber, Kumar
> ; d...@openvswitch.org; i.maxim...@ovn.org
> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select 
> the best
> mfex function
> 
> On Tue, Jun 29, 2021 at 04:32:05PM +, Van Haaren, Harry wrote:
> > > -Original Message-
> > > From: dev  On Behalf Of Eelco Chaudron
> > > Sent: Tuesday, June 29, 2021 1:38 PM
> > > To: Amber, Kumar 
> > > Cc: d...@openvswitch.org; i.maxim...@ovn.org
> > > Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to 
> > > select the
> best
> > > mfex function
> > >
> > > More comments below. FYI I’m only reviewing right now, no testing.
> >
> > Sure, thanks for reviews.
> >
> > > On 17 Jun 2021, at 18:27, Kumar Amber wrote:
> >
> > 
> >
> > > > +/* Allocate per thread PMD pointer space for study_stats. */
> > > > +static inline struct study_stats *
> > > > +get_study_stats(void)
> > > > +{
> > > > +struct study_stats *stats = study_stats_get();
> > > > +if (OVS_UNLIKELY(!stats)) {
> > > > +   stats = xzalloc(sizeof *stats);
> > > > +   study_stats_set_unsafe(stats);
> > > > +}
> > > > +return stats;
> > > > +}
> > > > +
> > >
> > > Just got a mind-meld with the code, and realized that the function might 
> > > be
> different
> > > per PMD thread due to this auto mode (and autovalidator mode in the 
> > > previous
> > > patch).
> > >
> > > This only strengthens the case that we need a way to see the currently
> > > selected mode, and not per datapath, but per PMD per datapath!
> > Study depends on the traffic pattern, so yes you're correct that it depends.
> > The study command was added after community suggested user-experience
> > would improve if the user doesn't have to provide an exact miniflow profile 
> > name.
> >
> > Study studies the traffic running on that PMD, compares all MFEX impls, and 
> > prints
> out
> > hits. It selects the _first_ implementation that surpasses the threshold of 
> > packets.
> >
> > Users are free to use the more specific names of MFEX impls instead of 
> > "study"
> > for fine-grained control over the MFEX impl in use, e.g.
> >
> > ovs-appctl dpif-netdev/miniflow-parser-set avx512_vbmi_ipv4_udp
> >
> > > Do we also need a way to set this per PMD?
> >
> > I don't feel there is real value here, but we could investigate adding an
> > optional parameter to the command indicating a PMD thread IDX to set?
> > We have access to "pmd->core_id" in our set() function, so limiting changes
> > to a specific PMD thread can be done ~ easily... but is it really required?
> 
> I think the concern here (at least from my side) is that users can
> set the algorithm globally or per DP, not per PMD. However, the
> study can set different algorithms per PMD. For example, say that
> 'study' indicates that alg#1 for PMD#1 and alg#2 for PMD#2 in the
> lab. Now we want to move to production and make that selection
> static, how can we do that?

That's a good question. Today the command doesn't give us per-PMD thread
control. Study can indeed result in different PMDs having different MFEX funcs.
 

> If we set study, how do we tell from the cmdline which algorithm
> was chosen for each PMD? Another example of the same situation: say
> we always start with 'study' and suddenly there is a traffic
> processing difference. How can one check what is different in
> the settings? The logs don't tell which PMD was affected.

Sure they do; the "pmd-cX" and "pmd-cY" below show what datapath thread selects 
what function.
Note that the first line is from the OVS command thread, which notes that 
"study" was selected.
The following two prints are from each datapath thread, noting the resulting 
function chosen by study.

2021-06-30T09:05:41Z|00134|dpif_netdev|INFO|Miniflow implementation set to 
study.
2021-06-30T09:05:41Z|1|dpif_mfex_extract_study(pmd-cX/id:X)|INFO|MFEX study 
chose impl avx512_vbmi_ipv4_udp: (hits 128/128 pkts)
2021-06-30T09:05:41Z|1|dpif_mfex_extract_study(pmd-cY/id:Y)|INFO|MFEX study 
chose impl avx512_vbmi_ipv4_udp: (hits 128/128 pkts)


> > Perfect is the enemy of good... I'd prefer focus on getting existing code 
> > changes
> merged,
> > and add additional (optional) parameters in future if deemed useful in real 
> > world
> testing?
> 
> True. Perhaps we have different use cases in mind. How do you expect
> users to use this feature? Do you think production users will always
> start with 'study'?

I was expecting OVS users to be aware of what L2-4 traffic they're
running, and to per-instance configure that statically for all datapath
threads, for example by running the command below:

$ ovs-appctl  dpif-netdev/miniflow-parser-set avx512_ipv4_udp

There is an assumption here that all datapath threads handle
the same outer traffic type. If that's not the case, we cannot manually
set different MFEX impls to different pmd threads 

Re: [ovs-dev] [v13 06/12] dpif-netdev: Add command to get dpif implementations.

2021-06-30 Thread Ferriter, Cian
Hi Eelco,

Thanks for your comment. My reply is inline.

Cian

> -Original Message-
> From: Eelco Chaudron 
> Sent: Tuesday 29 June 2021 09:25
> To: Ferriter, Cian ; Van Haaren, Harry 
> 
> Cc: ovs-dev@openvswitch.org; i.maxim...@ovn.org; Flavio Leitner 
> 
> Subject: Re: [ovs-dev] [v13 06/12] dpif-netdev: Add command to get dpif 
> implementations.
> 
> 
> 
> On 17 Jun 2021, at 18:18, Cian Ferriter wrote:
> 
> > From: Harry van Haaren 
> >
> > This commit adds a new command to retrieve the list of available
> > DPIF implementations. This can be used by to check what implementations
> > of the DPIF are available in any given OVS binary.
> >
> > Usage:
> >  $ ovs-appctl dpif-netdev/dpif-get
> >
> 
> This command only shows the actual available options, not the currently 
> selected ones.
> 
> I think from a support perspective, we need this command to also show the 
> current setting for each
> datapath. Having this only in the log will not work in cases where the log is 
> overwritten.

Yes, good idea. More visibility here is a good suggestion, rather than fishing 
through the logs to see what DPIF is set.

> 
> //Eelco
> 
> Guess this also is true for the MFEX patchset (will reply during my review) 
> and the already included
> “ovs-appctl dpif-netdev/subtable-lookup-prio-get”
> 
> 


I'll improve the DPIF get command as per your suggestion. I'm following the 
thread where the DPIF, MFEX and DPCLS get commands are being discussed. I'll 
take the output of this thread into account for the next revision of the 
dpif-impl-get command.

Thread:
https://mail.openvswitch.org/pipermail/ovs-dev/2021-June/384644.html
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v3 ovn 8/9] northd: move build_empty_lb_event_flow in build_lswitch_flows_for_lb

2021-06-30 Thread Lorenzo Bianconi
Introduce the build_lswitch_flows_for_lb routine in order to visit each
load_balancer first and then its related datapaths (logical switches) during
lb flow installation.
This patch reduces the memory footprint and CPU utilization of
ovn-northd.

Signed-off-by: Lorenzo Bianconi 
---
 northd/ovn-northd.c | 140 
 1 file changed, 78 insertions(+), 62 deletions(-)

diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index 37a13c56c..e653b0dd5 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -5198,7 +5198,7 @@ ls_has_lb_vip(struct ovn_datapath *od)
 
 static void
 build_pre_lb(struct ovn_datapath *od, struct hmap *lflows,
- struct shash *meter_groups, struct hmap *lbs)
+ struct hmap *lbs)
 {
 /* Do not send ND packets to conntrack */
 ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB, 110,
@@ -5229,73 +5229,49 @@ build_pre_lb(struct ovn_datapath *od, struct hmap 
*lflows,
  110, lflows);
 }
 
-bool vip_configured = false;
 for (int i = 0; i < od->nbs->n_load_balancer; i++) {
 struct nbrec_load_balancer *nb_lb = od->nbs->load_balancer[i];
 struct ovn_northd_lb *lb =
 ovn_northd_lb_find(lbs, _lb->header_.uuid);
 ovs_assert(lb);
 
-struct ds action = DS_EMPTY_INITIALIZER;
-struct ds match = DS_EMPTY_INITIALIZER;
-
-for (size_t j = 0; j < lb->n_vips; j++) {
-struct ovn_lb_vip *lb_vip = >vips[j];
-
-ds_clear();
-ds_clear();
-if (build_empty_lb_event_flow(lb_vip, nb_lb, meter_groups,
-   , )) {
-ovn_lflow_add_with_hint(lflows, od, S_SWITCH_IN_PRE_LB, 130,
-ds_cstr(), ds_cstr(),
-_lb->header_);
-}
-
-/* Ignore L4 port information in the key because fragmented packets
- * may not have L4 information.  The pre-stateful table will send
- * the packet through ct() action to de-fragment. In stateful
- * table, we will eventually look at L4 information. */
+/* 'REGBIT_CONNTRACK_NAT' is set to let the pre-stateful table send
+ * packet to conntrack for defragmentation and possibly for unNATting.
+ *
+ * Send all the packets to conntrack in the ingress pipeline if the
+ * logical switch has a load balancer with VIP configured. Earlier
+ * we used to set the REGBIT_CONNTRACK_DEFRAG flag in the ingress
+ * pipeline if the IP destination matches the VIP. But this causes
+ * few issues when a logical switch has no ACLs configured with
+ * allow-related.
+ * To understand the issue, lets a take a TCP load balancer -
+ * 10.0.0.10:80=10.0.0.3:80.
+ * If a logical port - p1 with IP - 10.0.0.5 opens a TCP connection
+ * with the VIP - 10.0.0.10, then the packet in the ingress pipeline
+ * of 'p1' is sent to the p1's conntrack zone id and the packet is
+ * load balanced to the backend - 10.0.0.3. For the reply packet from
+ * the backend lport, it is not sent to the conntrack of backend
+ * lport's zone id. This is fine as long as the packet is valid.
+ * Suppose the backend lport sends an invalid TCP packet (like
+ * incorrect sequence number), the packet gets * delivered to the
+ * lport 'p1' without unDNATing the packet to the VIP - 10.0.0.10.
+ * And this causes the connection to be reset by the lport p1's VIF.
+ *
+ * We can't fix this issue by adding a logical flow to drop ct.inv
+ * packets in the egress pipeline since it will drop all other
+ * connections not destined to the load balancers.
+ *
+ * To fix this issue, we send all the packets to the conntrack in the
+ * ingress pipeline if a load balancer is configured. We can now
+ * add a lflow to drop ct.inv packets.
+ */
+if (lb->n_vips) {
+ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB,
+  100, "ip", REGBIT_CONNTRACK_NAT" = 1; next;");
+ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB,
+  100, "ip", REGBIT_CONNTRACK_NAT" = 1; next;");
+break;
 }
-ds_destroy();
-ds_destroy();
-
-vip_configured = (vip_configured || lb->n_vips);
-}
-
-/* 'REGBIT_CONNTRACK_NAT' is set to let the pre-stateful table send
- * packet to conntrack for defragmentation and possibly for unNATting.
- *
- * Send all the packets to conntrack in the ingress pipeline if the
- * logical switch has a load balancer with VIP configured. Earlier
- * we used to set the REGBIT_CONNTRACK_DEFRAG flag in the ingress pipeline
- * if the IP destination matches the VIP. But this causes few issues when
- * a 

[ovs-dev] [PATCH v3 ovn 9/9] northd: move build_lb_rules in build_lswitch_flows_for_lb

2021-06-30 Thread Lorenzo Bianconi
Move the stateful lb rules for logical switches into the
build_lswitch_flows_for_lb routine in order to reduce CPU utilization.

Signed-off-by: Lorenzo Bianconi 
---
 northd/ovn-northd.c | 42 ++
 1 file changed, 18 insertions(+), 24 deletions(-)

diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index e653b0dd5..3ae2b8d9f 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -6028,8 +6028,7 @@ build_qos(struct ovn_datapath *od, struct hmap *lflows) {
 }
 
 static void
-build_lb_rules(struct ovn_datapath *od, struct hmap *lflows,
-   struct ovn_northd_lb *lb)
+build_lb_rules(struct hmap *lflows, struct ovn_northd_lb *lb)
 {
 struct ds action = DS_EMPTY_INITIALIZER;
 struct ds match = DS_EMPTY_INITIALIZER;
@@ -6079,15 +6078,15 @@ build_lb_rules(struct ovn_datapath *od, struct hmap 
*lflows,
 
 ds_put_format(, "ct.new && %s.dst == %s", ip_match,
   lb_vip->vip_str);
+int priority = 110;
 if (lb_vip->vip_port) {
 ds_put_format(, " && %s.dst == %d", proto, lb_vip->vip_port);
-ovn_lflow_add_with_hint(lflows, od, S_SWITCH_IN_STATEFUL, 120,
-ds_cstr(), ds_cstr(),
->nlb->header_);
-} else {
-ovn_lflow_add_with_hint(lflows, od, S_SWITCH_IN_STATEFUL, 110,
-ds_cstr(), ds_cstr(),
->nlb->header_);
+priority += 10;
+}
+for (int j = 0; j < lb->n_nb_ls; j++) {
+ovn_lflow_add_with_hint(lflows, lb->nb_ls[j], S_SWITCH_IN_STATEFUL,
+priority, ds_cstr(),
+ds_cstr(), >nlb->header_);
 }
 }
 ds_destroy();
@@ -6095,7 +6094,7 @@ build_lb_rules(struct ovn_datapath *od, struct hmap 
*lflows,
 }
 
 static void
-build_stateful(struct ovn_datapath *od, struct hmap *lflows, struct hmap *lbs)
+build_stateful(struct ovn_datapath *od, struct hmap *lflows)
 {
 /* Ingress and Egress stateful Table (Priority 0): Packets are
  * allowed by default. */
@@ -6112,19 +6111,6 @@ build_stateful(struct ovn_datapath *od, struct hmap 
*lflows, struct hmap *lbs)
 ovn_lflow_add(lflows, od, S_SWITCH_OUT_STATEFUL, 100,
   REGBIT_CONNTRACK_COMMIT" == 1",
   "ct_commit { ct_label.blocked = 0; }; next;");
-
-/* Load balancing rules for new connections get committed to conntrack
- * table.  So even if REGBIT_CONNTRACK_COMMIT is set in a previous table
- * a higher priority rule for load balancing below also commits the
- * connection, so it is okay if we do not hit the above match on
- * REGBIT_CONNTRACK_COMMIT. */
-for (int i = 0; i < od->nbs->n_load_balancer; i++) {
-struct ovn_northd_lb *lb =
-ovn_northd_lb_find(lbs, >nbs->load_balancer[i]->header_.uuid);
-
-ovs_assert(lb);
-build_lb_rules(od, lflows, lb);
-}
 }
 
 static void
@@ -6892,7 +6878,7 @@ build_lswitch_lflows_pre_acl_and_acl(struct ovn_datapath 
*od,
 build_acl_hints(od, lflows);
 build_acls(od, lflows, port_groups, meter_groups);
 build_qos(od, lflows);
-build_stateful(od, lflows, lbs);
+build_stateful(od, lflows);
 build_lb_hairpin(od, lflows);
 }
 }
@@ -8988,6 +8974,7 @@ build_lswitch_flows_for_lb(struct ovn_northd_lb *lb, 
struct hmap *lflows,
 ds_clear();
 ds_clear();
 
+/* pre-stateful lb */
 if (build_empty_lb_event_flow(lb_vip, lb->nlb, meter_groups,
, )) {
 for (int j = 0; j < lb->n_nb_ls; j++) {
@@ -9006,6 +8993,13 @@ build_lswitch_flows_for_lb(struct ovn_northd_lb *lb, 
struct hmap *lflows,
 ds_destroy();
 ds_destroy();
 
+/* stateful lb
+ * Load balancing rules for new connections get committed to conntrack
+ * table.  So even if REGBIT_CONNTRACK_COMMIT is set in a previous table
+ * a higher priority rule for load balancing below also commits the
+ * connection, so it is okay if we do not hit the above match on
+ * REGBIT_CONNTRACK_COMMIT. */
+build_lb_rules(lflows, lb);
 }
 
 static void
-- 
2.31.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v3 ovn 7/9] lb: link logical switches assigned for the same lb

2021-06-30 Thread Lorenzo Bianconi
As has already been done for logical routers, add logical switch
datapath references to the ovn_northd_lb data structure.
This is a preliminary patch to invert the logic used during lb flow
creation in order to visit each lb first and then its related datapaths.

Signed-off-by: Lorenzo Bianconi 
---
 lib/lb.c| 11 +++
 lib/lb.h|  6 ++
 northd/ovn-northd.c |  1 +
 3 files changed, 18 insertions(+)

diff --git a/lib/lb.c b/lib/lb.c
index d24672b82..e8abdbb20 100644
--- a/lib/lb.c
+++ b/lib/lb.c
@@ -255,6 +255,16 @@ ovn_northd_lb_add_lr(struct ovn_northd_lb *lb, struct 
ovn_datapath *od)
 lb->nb_lr[lb->n_nb_lr++] = od;
 }
 
+void
+ovn_northd_lb_add_ls(struct ovn_northd_lb *lb, struct ovn_datapath *od)
+{
+if (lb->n_allocated_nb_ls == lb->n_nb_ls) {
+lb->nb_ls = x2nrealloc(lb->nb_ls, >n_allocated_nb_ls,
+   sizeof *lb->nb_ls);
+}
+lb->nb_ls[lb->n_nb_ls++] = od;
+}
+
 void
 ovn_northd_lb_destroy(struct ovn_northd_lb *lb)
 {
@@ -269,6 +279,7 @@ ovn_northd_lb_destroy(struct ovn_northd_lb *lb)
 free(lb->selection_fields);
 free(lb->dps);
 free(lb->nb_lr);
+free(lb->nb_ls);
 free(lb);
 }
 
diff --git a/lib/lb.h b/lib/lb.h
index 4e8fd6604..06763a3f2 100644
--- a/lib/lb.h
+++ b/lib/lb.h
@@ -49,6 +49,10 @@ struct ovn_northd_lb {
 size_t n_nb_lr;
 size_t n_allocated_nb_lr;
 struct ovn_datapath **nb_lr;
+
+size_t n_nb_ls;
+size_t n_allocated_nb_ls;
+struct ovn_datapath **nb_ls;
 };
 
 struct ovn_lb_vip {
@@ -91,6 +95,8 @@ void ovn_northd_lb_add_datapath(struct ovn_northd_lb *,
 const struct sbrec_datapath_binding *);
 void
 ovn_northd_lb_add_lr(struct ovn_northd_lb *lb, struct ovn_datapath *od);
+void
+ovn_northd_lb_add_ls(struct ovn_northd_lb *lb, struct ovn_datapath *od);
 
 struct ovn_controller_lb {
 const struct sbrec_load_balancer *slb; /* May be NULL. */
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index d1507d4b5..37a13c56c 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -3435,6 +3435,7 @@ build_ovn_lbs(struct northd_context *ctx, struct hmap 
*datapaths,
 lb = ovn_northd_lb_find(lbs, lb_uuid);
 
 ovn_northd_lb_add_datapath(lb, od->sb);
+ovn_northd_lb_add_ls(lb, od);
 }
 }
 
-- 
2.31.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v3 ovn 5/9] northd: get rid of add_router_lb_flow

2021-06-30 Thread Lorenzo Bianconi
Remove the add_router_lb_flow routine and move the leftover lb flow
installation code into the build_lrouter_snat_flows_for_lb routine.

Signed-off-by: Lorenzo Bianconi 
---
 northd/ovn-northd.c | 258 
 1 file changed, 119 insertions(+), 139 deletions(-)

diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index ddbc6289e..2e69394b3 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -8767,92 +8767,6 @@ enum lb_snat_type {
 SKIP_SNAT,
 };
 
-static void
-add_router_lb_flow(struct hmap *lflows, struct ovn_datapath *od,
-   enum lb_snat_type snat_type, struct ovn_lb_vip *lb_vip,
-   const char *proto, struct nbrec_load_balancer *lb,
-   struct sset *nat_entries)
-{
-const char *ip_match = NULL;
-if (IN6_IS_ADDR_V4MAPPED(_vip->vip)) {
-ip_match = "ip4";
-} else {
-ip_match = "ip6";
-}
-
-if (sset_contains(nat_entries, lb_vip->vip_str)) {
-/* The load balancer vip is also present in the NAT entries.
- * So add a high priority lflow to advance the the packet
- * destined to the vip (and the vip port if defined)
- * in the S_ROUTER_IN_UNSNAT stage.
- * There seems to be an issue with ovs-vswitchd. When the new
- * connection packet destined for the lb vip is received,
- * it is dnat'ed in the S_ROUTER_IN_DNAT stage in the dnat
- * conntrack zone. For the next packet, if it goes through
- * unsnat stage, the conntrack flags are not set properly, and
- * it doesn't hit the established state flows in
- * S_ROUTER_IN_DNAT stage. */
-struct ds unsnat_match = DS_EMPTY_INITIALIZER;
-ds_put_format(_match, "%s && %s.dst == %s && %s",
-  ip_match, ip_match, lb_vip->vip_str, proto);
-if (lb_vip->vip_port) {
-ds_put_format(_match, " && %s.dst == %d", proto,
-  lb_vip->vip_port);
-}
-
-ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_UNSNAT, 120,
-ds_cstr(_match), "next;", >header_);
-
-ds_destroy(_match);
-}
-
-if (!od->l3dgw_port || !od->l3redirect_port || !lb_vip->n_backends) {
-return;
-}
-
-/* Add logical flows to UNDNAT the load balanced reverse traffic in
- * the router egress pipleine stage - S_ROUTER_OUT_UNDNAT if the logical
- * router has a gateway router port associated.
- */
-struct ds undnat_match = DS_EMPTY_INITIALIZER;
-ds_put_format(_match, "%s && (", ip_match);
-
-for (size_t i = 0; i < lb_vip->n_backends; i++) {
-struct ovn_lb_backend *backend = _vip->backends[i];
-ds_put_format(_match, "(%s.src == %s", ip_match,
-  backend->ip_str);
-
-if (backend->port) {
-ds_put_format(_match, " && %s.src == %d) || ",
-  proto, backend->port);
-} else {
-ds_put_cstr(_match, ") || ");
-}
-}
-
-ds_chomp(_match, ' ');
-ds_chomp(_match, '|');
-ds_chomp(_match, '|');
-ds_chomp(_match, ' ');
-ds_put_format(_match, ") && outport == %s && "
- "is_chassis_resident(%s)", od->l3dgw_port->json_key,
- od->l3redirect_port->json_key);
-if (snat_type == FORCE_SNAT || snat_type == SKIP_SNAT) {
-char *action = xasprintf("flags.%s_snat_for_lb = 1; ct_dnat;",
- snat_type == SKIP_SNAT ? "skip" : "force");
-ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_UNDNAT, 120,
-ds_cstr(_match), action,
->header_);
-free(action);
-} else {
-ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_UNDNAT, 120,
-ds_cstr(_match), "ct_dnat;",
->header_);
-}
-
-ds_destroy(_match);
-}
-
 static void
 build_lrouter_snat_flows_for_lb(struct ovn_lb_vip *lb_vip,
 struct ovn_northd_lb *lb,
@@ -8901,9 +8815,88 @@ build_lrouter_snat_flows_for_lb(struct ovn_lb_vip 
*lb_vip,
 new_match = xasprintf("ct.new && %s", ds_cstr());
 est_match = xasprintf("ct.est && %s", ds_cstr());
 
+const char *ip_match = NULL;
+if (IN6_IS_ADDR_V4MAPPED(_vip->vip)) {
+ip_match = "ip4";
+} else {
+ip_match = "ip6";
+}
+
+/* Add logical flows to UNDNAT the load balanced reverse traffic in
+ * the router egress pipleine stage - S_ROUTER_OUT_UNDNAT if the logical
+ * router has a gateway router port associated.
+ */
+struct ds undnat_match = DS_EMPTY_INITIALIZER;
+ds_put_format(_match, "%s && (", ip_match);
+
+for (size_t i = 0; i < lb_vip->n_backends; i++) {
+struct ovn_lb_backend *backend = _vip->backends[i];
+ds_put_format(_match, "(%s.src == %s", ip_match,
+  backend->ip_str);
+
+if 

[ovs-dev] [PATCH v3 ovn 6/9] northd: remove dead code in build_lrouter_nat_defrag_and_lb

2021-06-30 Thread Lorenzo Bianconi
Remove if condition that is never executed in
build_lrouter_nat_defrag_and_lb routine

Signed-off-by: Lorenzo Bianconi 
---
 northd/ovn-northd.c | 7 ---
 1 file changed, 7 deletions(-)

diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index 2e69394b3..d1507d4b5 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -12012,13 +12012,6 @@ build_lrouter_nat_defrag_and_lb(struct ovn_datapath 
*od,
   "ip", "flags.loopback = 1; ct_dnat;");
 }
 
-/* Load balancing and packet defrag are only valid on
- * Gateway routers or router with gateway port. */
-if (!smap_get(>nbr->options, "chassis") && !od->l3dgw_port) {
-sset_destroy(_entries);
-return;
-}
-
 build_lrouter_lb_flows(lflows, od, lbs, match);
 
 sset_destroy(_entries);
-- 
2.31.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v3 ovn 4/9] northd: move lb_{skip, force}_snat code in build_lrouter_snat_flows_for_lb

2021-06-30 Thread Lorenzo Bianconi
Introduce the build_lrouter_snat_flows_for_lb routine to configure
lb_{skip,force}_snat flows for each configured load_balancer.

Signed-off-by: Lorenzo Bianconi 
---
 northd/ovn-northd.c | 205 
 1 file changed, 152 insertions(+), 53 deletions(-)

diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index b6889d887..ddbc6289e 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -3384,6 +3384,30 @@ void build_lb_vip_actions(struct ovn_lb_vip *lb_vip,
 }
 }
 
+static void
+build_ovn_lr_lbs(struct hmap *datapaths, struct hmap *lbs)
+{
+struct ovn_northd_lb *lb;
+struct ovn_datapath *od;
+
+HMAP_FOR_EACH (od, key_node, datapaths) {
+if (!od->nbr) {
+continue;
+}
+if (!smap_get(>nbr->options, "chassis") && !od->l3dgw_port) {
+continue;
+}
+
+for (size_t i = 0; i < od->nbr->n_load_balancer; i++) {
+const struct uuid *lb_uuid =
+>nbr->load_balancer[i]->header_.uuid;
+lb = ovn_northd_lb_find(lbs, lb_uuid);
+
+ovn_northd_lb_add_lr(lb, od);
+}
+}
+}
+
 static void
 build_ovn_lbs(struct northd_context *ctx, struct hmap *datapaths,
   struct hmap *lbs)
@@ -3414,23 +3438,6 @@ build_ovn_lbs(struct northd_context *ctx, struct hmap 
*datapaths,
 }
 }
 
-HMAP_FOR_EACH (od, key_node, datapaths) {
-if (!od->nbr) {
-continue;
-}
-if (!smap_get(>nbr->options, "chassis") && !od->l3dgw_port) {
-continue;
-}
-
-for (size_t i = 0; i < od->nbr->n_load_balancer; i++) {
-const struct uuid *lb_uuid =
->nbr->load_balancer[i]->header_.uuid;
-lb = ovn_northd_lb_find(lbs, lb_uuid);
-
-ovn_northd_lb_add_lr(lb, od);
-}
-}
-
 /* Delete any stale SB load balancer rows. */
 const struct sbrec_load_balancer *sbrec_lb, *next;
 SBREC_LOAD_BALANCER_FOR_EACH_SAFE (sbrec_lb, next, ctx->ovnsb_idl) {
@@ -8762,41 +8769,10 @@ enum lb_snat_type {
 
 static void
 add_router_lb_flow(struct hmap *lflows, struct ovn_datapath *od,
-   struct ds *match, struct ds *actions, int priority,
enum lb_snat_type snat_type, struct ovn_lb_vip *lb_vip,
const char *proto, struct nbrec_load_balancer *lb,
struct sset *nat_entries)
 {
-/* A match and actions for new connections. */
-char *new_match = xasprintf("ct.new && %s", ds_cstr(match));
-if (snat_type == FORCE_SNAT || snat_type == SKIP_SNAT) {
-char *new_actions = xasprintf("flags.%s_snat_for_lb = 1; %s",
-snat_type == SKIP_SNAT ? "skip" : "force",
-ds_cstr(actions));
-ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DNAT, priority,
-new_match, new_actions, >header_);
-free(new_actions);
-} else {
-ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DNAT, priority,
-new_match, ds_cstr(actions), >header_);
-}
-
-/* A match and actions for established connections. */
-char *est_match = xasprintf("ct.est && %s", ds_cstr(match));
-if (snat_type == FORCE_SNAT || snat_type == SKIP_SNAT) {
-char *est_actions = xasprintf("flags.%s_snat_for_lb = 1; ct_dnat;",
-snat_type == SKIP_SNAT ? "skip" : "force");
-ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DNAT, priority,
-est_match, est_actions, >header_);
-free(est_actions);
-} else {
-ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DNAT, priority,
-est_match, "ct_dnat;", >header_);
-}
-
-free(new_match);
-free(est_match);
-
 const char *ip_match = NULL;
 if (IN6_IS_ADDR_V4MAPPED(_vip->vip)) {
 ip_match = "ip4";
@@ -8877,6 +8853,126 @@ add_router_lb_flow(struct hmap *lflows, struct 
ovn_datapath *od,
 ds_destroy(_match);
 }
 
+static void
+build_lrouter_snat_flows_for_lb(struct ovn_lb_vip *lb_vip,
+struct ovn_northd_lb *lb,
+struct ovn_northd_lb_vip *vips_nb,
+struct hmap *lflows)
+{
+char *new_match, *skip_snat_new_action = NULL;
+char *est_match, *skip_snat_est_action = NULL;
+struct ds action = DS_EMPTY_INITIALIZER;
+struct ds match = DS_EMPTY_INITIALIZER;
+
+build_lb_vip_actions(lb_vip, vips_nb, ,
+ lb->selection_fields, false);
+
+/* Higher priority rules are added for load-balancing in DNAT
+ * table.  For every match (on a VIP[:port]), we add two flows.
+ * One flow is for specific matching on ct.new with an action
+ * of "ct_lb($targets);". The other flow is for ct.est with
+ * an action of "ct_dnat;".
+ */
+if (IN6_IS_ADDR_V4MAPPED(_vip->vip)) {
+ds_put_format(, "ip && 

[ovs-dev] [PATCH v3 ovn 3/9] northd: move build_empty_lb_event_flow in build_lrouter_flows_for_lb

2021-06-30 Thread Lorenzo Bianconi
Introduce the build_lrouter_flows_for_lb routine so that lb flow
installation visits each load_balancer first and then its related
datapaths.
This patch reduces the memory footprint and CPU utilization of
ovn-northd.

Signed-off-by: Lorenzo Bianconi 
---
 northd/ovn-northd.c | 109 +---
 1 file changed, 73 insertions(+), 36 deletions(-)

diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index f23b299d8..b6889d887 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -5126,52 +5126,53 @@ ls_has_dns_records(const struct nbrec_logical_switch 
*nbs)
 return false;
 }
 
-static void
-build_empty_lb_event_flow(struct ovn_datapath *od, struct hmap *lflows,
-  struct ovn_lb_vip *lb_vip,
-  struct nbrec_load_balancer *lb,
-  int pl, struct shash *meter_groups)
+static bool
+build_empty_lb_event_flow(struct ovn_lb_vip *lb_vip,
+  const struct nbrec_load_balancer *lb,
+  struct shash *meter_groups,
+  struct ds *match, struct ds *action)
 {
 bool controller_event = smap_get_bool(>options, "event", false) ||
 controller_event_en; /* deprecated */
 if (!controller_event || lb_vip->n_backends ||
 lb_vip->empty_backend_rej) {
-return;
+return false;
 }
 
+ds_clear(action);
+ds_clear(match);
+
 bool ipv4 = IN6_IS_ADDR_V4MAPPED(_vip->vip);
-struct ds match = DS_EMPTY_INITIALIZER;
-char *meter = "", *action;
+char *meter = "";
 
 if (meter_groups && shash_find(meter_groups, "event-elb")) {
 meter = "event-elb";
 }
 
-ds_put_format(, "ip%s.dst == %s && %s",
+ds_put_format(match, "ip%s.dst == %s && %s",
   ipv4 ? "4": "6", lb_vip->vip_str, lb->protocol);
 
 char *vip = lb_vip->vip_str;
 if (lb_vip->vip_port) {
-ds_put_format(, " && %s.dst == %u", lb->protocol,
+ds_put_format(match, " && %s.dst == %u", lb->protocol,
   lb_vip->vip_port);
 vip = xasprintf("%s%s%s:%u", ipv4 ? "" : "[", lb_vip->vip_str,
 ipv4 ? "" : "]", lb_vip->vip_port);
 }
 
-action = xasprintf("trigger_event(event = \"%s\", "
-   "meter = \"%s\", vip = \"%s\", "
-   "protocol = \"%s\", "
-   "load_balancer = \"" UUID_FMT "\");",
-   event_to_string(OVN_EVENT_EMPTY_LB_BACKENDS),
-   meter, vip, lb->protocol,
-   UUID_ARGS(>header_.uuid));
-ovn_lflow_add_with_hint(lflows, od, pl, 130, ds_cstr(), action,
->header_);
-ds_destroy();
+ds_put_format(action,
+  "trigger_event(event = \"%s\", "
+   "meter = \"%s\", vip = \"%s\", "
+   "protocol = \"%s\", "
+   "load_balancer = \"" UUID_FMT "\");",
+   event_to_string(OVN_EVENT_EMPTY_LB_BACKENDS),
+   meter, vip, lb->protocol,
+   UUID_ARGS(>header_.uuid));
 if (lb_vip->vip_port) {
 free(vip);
 }
-free(action);
+
+return true;
 }
 
 static bool
@@ -5227,16 +5228,28 @@ build_pre_lb(struct ovn_datapath *od, struct hmap 
*lflows,
 ovn_northd_lb_find(lbs, _lb->header_.uuid);
 ovs_assert(lb);
 
+struct ds action = DS_EMPTY_INITIALIZER;
+struct ds match = DS_EMPTY_INITIALIZER;
+
 for (size_t j = 0; j < lb->n_vips; j++) {
 struct ovn_lb_vip *lb_vip = >vips[j];
-build_empty_lb_event_flow(od, lflows, lb_vip, nb_lb,
-  S_SWITCH_IN_PRE_LB, meter_groups);
+
+ds_clear();
+ds_clear();
+if (build_empty_lb_event_flow(lb_vip, nb_lb, meter_groups,
+   , )) {
+ovn_lflow_add_with_hint(lflows, od, S_SWITCH_IN_PRE_LB, 130,
+ds_cstr(), ds_cstr(),
+_lb->header_);
+}
 
 /* Ignore L4 port information in the key because fragmented packets
  * may not have L4 information.  The pre-stateful table will send
  * the packet through ct() action to de-fragment. In stateful
  * table, we will eventually look at L4 information. */
 }
+ds_destroy();
+ds_destroy();
 
 vip_configured = (vip_configured || lb->n_vips);
 }
@@ -8752,11 +8765,8 @@ add_router_lb_flow(struct hmap *lflows, struct 
ovn_datapath *od,
struct ds *match, struct ds *actions, int priority,
enum lb_snat_type snat_type, struct ovn_lb_vip *lb_vip,
const char *proto, struct nbrec_load_balancer *lb,
-   struct shash *meter_groups, struct sset 

[ovs-dev] [PATCH v3 ovn 2/9] lib: link logical routers assigned for the same lb

2021-06-30 Thread Lorenzo Bianconi
Add logical router datapath references to the ovn_northd_lb data
structure.
This is a preliminary patch to invert the logic used during lb flow
creation in order to visit each lb first and then its related datapaths.

Signed-off-by: Lorenzo Bianconi 
---
 lib/lb.c| 11 +++
 lib/lb.h|  6 ++
 northd/ovn-northd.c | 17 +
 3 files changed, 34 insertions(+)

diff --git a/lib/lb.c b/lib/lb.c
index 4cb46b346..d24672b82 100644
--- a/lib/lb.c
+++ b/lib/lb.c
@@ -245,6 +245,16 @@ ovn_northd_lb_add_datapath(struct ovn_northd_lb *lb,
 lb->dps[lb->n_dps++] = sb;
 }
 
+void
+ovn_northd_lb_add_lr(struct ovn_northd_lb *lb, struct ovn_datapath *od)
+{
+if (lb->n_allocated_nb_lr == lb->n_nb_lr) {
+lb->nb_lr = x2nrealloc(lb->nb_lr, >n_allocated_nb_lr,
+   sizeof *lb->nb_lr);
+}
+lb->nb_lr[lb->n_nb_lr++] = od;
+}
+
 void
 ovn_northd_lb_destroy(struct ovn_northd_lb *lb)
 {
@@ -258,6 +268,7 @@ ovn_northd_lb_destroy(struct ovn_northd_lb *lb)
 sset_destroy(>ips_v6);
 free(lb->selection_fields);
 free(lb->dps);
+free(lb->nb_lr);
 free(lb);
 }
 
diff --git a/lib/lb.h b/lib/lb.h
index 58e6bb031..4e8fd6604 100644
--- a/lib/lb.h
+++ b/lib/lb.h
@@ -45,6 +45,10 @@ struct ovn_northd_lb {
 size_t n_dps;
 size_t n_allocated_dps;
 const struct sbrec_datapath_binding **dps;
+
+size_t n_nb_lr;
+size_t n_allocated_nb_lr;
+struct ovn_datapath **nb_lr;
 };
 
 struct ovn_lb_vip {
@@ -85,6 +89,8 @@ struct ovn_northd_lb * ovn_northd_lb_find(struct hmap *, 
const struct uuid *);
 void ovn_northd_lb_destroy(struct ovn_northd_lb *);
 void ovn_northd_lb_add_datapath(struct ovn_northd_lb *,
 const struct sbrec_datapath_binding *);
+void
+ovn_northd_lb_add_lr(struct ovn_northd_lb *lb, struct ovn_datapath *od);
 
 struct ovn_controller_lb {
 const struct sbrec_load_balancer *slb; /* May be NULL. */
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index 6e182b1cb..f23b299d8 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -3414,6 +3414,23 @@ build_ovn_lbs(struct northd_context *ctx, struct hmap 
*datapaths,
 }
 }
 
+HMAP_FOR_EACH (od, key_node, datapaths) {
+if (!od->nbr) {
+continue;
+}
+if (!smap_get(>nbr->options, "chassis") && !od->l3dgw_port) {
+continue;
+}
+
+for (size_t i = 0; i < od->nbr->n_load_balancer; i++) {
+const struct uuid *lb_uuid =
+>nbr->load_balancer[i]->header_.uuid;
+lb = ovn_northd_lb_find(lbs, lb_uuid);
+
+ovn_northd_lb_add_lr(lb, od);
+}
+}
+
 /* Delete any stale SB load balancer rows. */
 const struct sbrec_load_balancer *sbrec_lb, *next;
 SBREC_LOAD_BALANCER_FOR_EACH_SAFE (sbrec_lb, next, ctx->ovnsb_idl) {
-- 
2.31.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v3 ovn 1/9] northd: move snat_type out of vip loop

2021-06-30 Thread Lorenzo Bianconi
Move snat_type out of the vip loop in build_lrouter_lb_flows() since it
does not depend on the vip.

Signed-off-by: Lorenzo Bianconi 
---
 northd/ovn-northd.c | 14 +-
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index e96494ba3..6e182b1cb 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -8867,10 +8867,13 @@ build_lrouter_lb_flows(struct hmap *lflows, struct 
ovn_datapath *od,
 ovn_northd_lb_find(lbs, _lb->header_.uuid);
 ovs_assert(lb);
 
-bool lb_skip_snat = smap_get_bool(_lb->options, "skip_snat", false);
-if (lb_skip_snat) {
+enum lb_snat_type snat_type = NO_FORCE_SNAT;
+if (smap_get_bool(_lb->options, "skip_snat", false)) {
 ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT, 120,
   "flags.skip_snat_for_lb == 1 && ip", "next;");
+snat_type = SKIP_SNAT;
+} else if (lb_force_snat_ip || od->lb_force_snat_router_ip) {
+snat_type = FORCE_SNAT;
 }
 
 for (size_t j = 0; j < lb->n_vips; j++) {
@@ -8934,13 +8937,6 @@ build_lrouter_lb_flows(struct hmap *lflows, struct 
ovn_datapath *od,
 ds_put_format(match, " && is_chassis_resident(%s)",
   od->l3redirect_port->json_key);
 }
-
-enum lb_snat_type snat_type = NO_FORCE_SNAT;
-if (lb_skip_snat) {
-snat_type = SKIP_SNAT;
-} else if (lb_force_snat_ip || od->lb_force_snat_router_ip) {
-snat_type = FORCE_SNAT;
-}
 add_router_lb_flow(lflows, od, match, actions, prio,
snat_type, lb_vip, proto, nb_lb,
meter_groups, nat_entries);
-- 
2.31.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH v3 ovn 0/9] northd: rework ovn-northd lb flow installation

2021-06-30 Thread Lorenzo Bianconi
Rework lb flow logic in order to visit each load_balancer first and then
the related datapaths during lb flow installation.
This patch reduces the memory footprint and cpu utilization of
ovn-northd.

Testing environment:
ovn-nbctl lr-list |wc -l
308
ovn-nbctl ls-list |wc -l
615
ovn-nbctl lb-list |wc -l
14524

Time needed for build_lrouter_lb_flows() to run for all datapaths/lbs (logical 
routers)
Total samples: 22
Maximum: 6937 msec
Minimum: 6869 msec
95th percentile: 6933.00 msec
Short term average: 6916.599206 msec
Long term average: 6914.809656 msec

Time needed for build_pre_lb()/build_stateful()[lb-only] to run for all 
datapaths/lbs (logical switches)
  Total samples: 20
  Maximum: 1735 msec
  Minimum: 1693 msec
  95th percentile: 1735.00 msec
  Short term average: 1731.136610 msec
  Long term average: 1698.853040 msec

Time needed for build_lrouter_flows_for_lb() to run for all lbs/datapaths 
(logical routers)
   Total samples: 22
   Maximum: 2745 msec
   Minimum: 2674 msec
   95th percentile: 2742.00 msec
   Short term average: 2724.775973 msec
   Long term average: 2681.334522 msec

Time needed for build_lswitch_flows_for_lb() to run for all lbs/datapaths 
(logical switches)
  Total samples: 20
  Maximum: 406 msec
  Minimum: 354 msec
  95th percentile: 406.00 msec
  Short term average: 383.915676 msec
  Long term average: 363.318006 msec

This series does not introduce any new feature to ovn-northd.

Changes since v2:
- fix memory leak in build_lrouter_snat_flows_for_lb()

Changes since v1:
- rebase on top of ovn-master
- add build_lswitch_flows_for_lb routine

Lorenzo Bianconi (9):
  northd: move snat_type out of vip loop
  lib: link logical routers assigned for the same lb
  northd: move build_empty_lb_event_flow in build_lrouter_flows_for_lb
  northd: move lb_{skip,force}_snat code in
build_lrouter_snat_flows_for_lb
  northd: get rid of add_router_lb_flow
  northd: remove dead code in build_lrouter_nat_defrag_and_lb
  lb: link logical switches assigned for the same lb
  northd: move build_empty_lb_event_flow in build_lswitch_flows_for_lb
  northd: move build_lb_rules in build_lswitch_flows_for_lb

 lib/lb.c|  22 ++
 lib/lb.h|  12 +
 northd/ovn-northd.c | 607 +++-
 3 files changed, 404 insertions(+), 237 deletions(-)

-- 
2.31.1

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the best mfex function

2021-06-30 Thread Van Haaren, Harry
> -Original Message-
> From: Eelco Chaudron 
> Sent: Wednesday, June 30, 2021 10:18 AM
> To: Van Haaren, Harry 
> Cc: Amber, Kumar ; d...@openvswitch.org;
> i.maxim...@ovn.org
> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select 
> the best
> mfex function
> 
> 
> 
> On 29 Jun 2021, at 18:32, Van Haaren, Harry wrote:
> 
> >> -Original Message-
> >> From: dev  On Behalf Of Eelco Chaudron
> >> Sent: Tuesday, June 29, 2021 1:38 PM
> >> To: Amber, Kumar 
> >> Cc: d...@openvswitch.org; i.maxim...@ovn.org
> >> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to 
> >> select the
> best
> >> mfex function



> > Perfect is the enemy of good... I'd prefer focus on getting existing code 
> > changes
> merged,
> > and add additional (optional) parameters in future if deemed useful in real 
> > world
> testing?
> 
> See Flavio’s reply, as those were the same concerns I thought of.

Yes - thanks for combining threads - I'm writing a detailed reply there as we 
speak here :)
I'll send that reply shortly.



> >>> +if (max_hits >= MFEX_MIN_HIT_COUNT_FOR_USE) {
> >>> +/* Set the implementation to index with max_hits. */
> >>> +pmd->miniflow_extract_opt =
> >>> +miniflow_funcs[best_func_index].extract_func;
> >>> +VLOG_INFO("MFEX study chose impl %s: (hits %d/%d pkts)\n",
> >>> +  miniflow_funcs[best_func_index].name, max_hits,
> >>> +  stats->pkt_count);
> >>
> >> We have no idea which PMD the mode is selected for guess we might need to 
> >> add
> >> this?
> >>
> >> Maybe we should report the numbers/hits for the other methods, as they 
> >> might
> be
> >> equal, and some might be faster in execution time?
> >
> > As above, the implementations are sorted in performance order. Performance
> > here can be known by micro-benchmarks, and developers of such SIMD optimized
> > code can be expected to know which impl is fastest.
> 
> Don’t think we can, as it’s not documented in the code, and someone can just 
> add
> his own, and has no clue about the existing ones.

Yes, in theory somebody could add his own, and get this wrong. There are many 
many
things that could go wrong when making code changes. We cannot document 
everything.


> > In our current code, the avx512_vbmi_* impls are always before the avx512_*
> > impls, as the VBMI instruction set allows a faster runtime.
> 
> Guess we need some documentation in the developer's section on how to add
> processor optimized functions, and how to benchmark them (and maybe some
> benchmark data for the current implementations).
> Also, someone can write a sloppy avx512_vbmi* function that might be slower 
> than
> an avx512_*, right?

What are we trying to achieve here? What is the root problem that is being 
addressed?

Yes, somebody "could" write sloppy (complex, advanced, ISA specific, SIMD) 
avx512 code,
and have it be slower. Who is realistically going to do that?

I'm fine with documenting a few things if they make sense to document, but
trying to "hand hold" at every level just doesn't work. Adding sections on how
to benchmark code, and how function pointers work and how to add them?
These things are documented in various places across the internet.

If there's really an interest to learn AVX512 SIMD optimization, reach out to 
the
OVS community, put me on CC, and I'll be willing to help. Adding documentation
ad nauseam is not the solution, as each optimization is likely to have subtle 
differences.


> > 

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [v4 07/12] test/sytem-dpdk: Add unit test for mfex autovalidator

2021-06-30 Thread Eelco Chaudron


On 29 Jun 2021, at 20:49, Amber, Kumar wrote:

> Hi Flavio,
>
> Comments Inline.
>
>> -Original Message-
>> From: Flavio Leitner 
>> Sent: Tuesday, June 29, 2021 11:49 PM
>> To: Amber, Kumar 
>> Cc: Eelco Chaudron ; d...@openvswitch.org;
>> i.maxim...@ovn.org
>> Subject: Re: [ovs-dev] [v4 07/12] test/sytem-dpdk: Add unit test for mfex
>> autovalidator
>>
>> On Tue, Jun 29, 2021 at 05:11:00PM +, Amber, Kumar wrote:
>>> Hi Eelco, Flavio,
>>>
>>> Pls find my replies Inline
>>>
 -Original Message-
 From: Flavio Leitner 
 Sent: Tuesday, June 29, 2021 7:51 PM
 To: Eelco Chaudron 
 Cc: Amber, Kumar ; Van Haaren, Harry
 ; d...@openvswitch.org;
 i.maxim...@ovn.org
 Subject: Re: [ovs-dev] [v4 07/12] test/sytem-dpdk: Add unit test for
 mfex autovalidator

 On Tue, Jun 29, 2021 at 03:50:22PM +0200, Eelco Chaudron wrote:
>
>
> On 28 Jun 2021, at 4:57, Flavio Leitner wrote:
>
>> Hi,
>>
>>
>> On Thu, Jun 17, 2021 at 09:57:49PM +0530, Kumar Amber wrote:
>>> Tests:
>>>   6: OVS-DPDK - MFEX Autovalidator
>>>   7: OVS-DPDK - MFEX Autovalidator Fuzzy
>>>
>>> Added a new directory to store the PCAP file used in the tests
>>> and a script to generate the fuzzy traffic type pcap to be used
>>> in fuzzy unit test.
>>
>>
>> I haven't tried this yet but am I right that these tests are
>> going to pass a pcap to send traffic in a busy loop for 5
>> seconds in the first case and 20 seconds in the second case?
>>
>> I see that when autovalidator is set OVS will crash if one
>> implementation returns a different value, so I wonder why we
>> need to run for that long.
>
> I think we should remove the assert (already suggested by Harry),
> so it will not crash by accident if someone selects autovalidator
> in the field (and runs into an issue).
> Failure will then be detected by the ERROR log entries on shutdown.

 That's true for the testsuite, but not in production as there is
 nothing to disable that.

 Perhaps if autovalidator detects an issue, it should log an ERROR
 level log to report to testsuite, disable the failing mode and make
 sure OVS is either in default or in another functional mode.
>>>
>>> So I have put the following :
>>> Removed the assert
>>> Allow the Auto-validator to run for all implementation and for a full
>> batch
>>> Document error via Vlog_Error
>>> Set the auto-validator to default {Scalar} when returning out in case
>> of failure.
>>
>> Sounds like a plan to me.
>> Is that okay with you Eelco?

ACK, this sounds good to me.

> I’m wondering if there is another way than a simple delay, as
> these tend to
 cause issues later on. Can we check packets processed or something?

 Yeah, maybe we can pass all packets like 5x at least.
>>>
>>> Sure I will try to find something to do it more nicely.
>>> But just a thought keeping it 20sec allows for a full-stabilization and also
>> thorough testing of stability as well.
>>> So keeping it may not be just a bad idea.
>>
>> The issue is that if every test decides to delay seconds, the testsuite
>> becomes impractical. We have removed 'sleep' over time. Instead, we have
>> functions to wait for a certain cmdline output, or some event.
>> Yes, there are still some left to be fixed.
>>
>> Back to the point, maybe there is a signal of some sort we can get that
>> indicates the stability you're looking for.
>>
>
> I agree with the point and I am looking for a signal, but currently due to 
> assert removal we don’t have any marker.
> To Minimize the time, I did analysis of the time taken in each test-case :
>
> 1) for the simple test-case we don’t need the 5sec wait time as PCAP only 
> contains one of each traffic type.
> 2) for fuzzy we do need at least 5sec for all 10k packets to be sent at-least 
> 2x and also stability.

Guess you know the number of packets, so you could check in a 1sec delay loop 
the number of received packets for the flows, and exit if you saw enough. And 
just to be sure, you terminate after 5~10 seconds?

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the best mfex function

2021-06-30 Thread Eelco Chaudron


On 29 Jun 2021, at 18:32, Van Haaren, Harry wrote:

>> -Original Message-
>> From: dev  On Behalf Of Eelco Chaudron
>> Sent: Tuesday, June 29, 2021 1:38 PM
>> To: Amber, Kumar 
>> Cc: d...@openvswitch.org; i.maxim...@ovn.org
>> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select 
>> the best
>> mfex function
>>
>> More comments below. FYI I’m only reviewing right now, no testing.
>
> Sure, thanks for reviews.
>
>> On 17 Jun 2021, at 18:27, Kumar Amber wrote:
>
> 
>
>>> +/* Allocate per thread PMD pointer space for study_stats. */
>>> +static inline struct study_stats *
>>> +get_study_stats(void)
>>> +{
>>> +struct study_stats *stats = study_stats_get();
>>> +if (OVS_UNLIKELY(!stats)) {
>>> +   stats = xzalloc(sizeof *stats);
>>> +   study_stats_set_unsafe(stats);
>>> +}
>>> +return stats;
>>> +}
>>> +
>>
>> Just got a mind-meld with the code, and realized that the function might be 
>> different
>> per PMD thread due to this auto mode (and autovalidator mode in the previous
>> patch).
>>
>> This makes it only stronger that we need a way to see the currently selected 
>> mode,
>> and not per datapath, but per PMD per datapath!
>
> Study depends on the traffic pattern, so yes you're correct that it depends.
> The study command was added after community suggested user-experience
> would improve if the user doesn't have to provide an exact miniflow profile 
> name.
>
> Study studies the traffic running on that PMD, compares all MFEX impls, and 
> prints out
> hits. It selects the _first_ implementation that surpasses the threshold of 
> packets.
>
> Users are free to use the more specific names of MFEX impls instead of "study"
> for fine-grained control over the MFEX impl in use, e.g.
>
> ovs-appctl dpif-netdev/miniflow-parser-set avx512_vbmi_ipv4_udp
>
>> Do we also need a way to set this per PMD?
>
> I don't feel there is real value here, but we could investigate adding an
> optional parameter to the command indicating a PMD thread IDX to set?
> We have access to "pmd->core_id" in our set() function, so limiting changes
> to a specific PMD thread can be done ~ easily... but is it really required?
>
> Perfect is the enemy of good... I'd prefer focus on getting existing code 
> changes merged,
> and add additional (optional) parameters in future if deemed useful in real 
> world testing?

See Flavio’s reply, as those were the same concerns I thought of.

>>> +uint32_t
>>> +mfex_study_traffic(struct dp_packet_batch *packets,
>>> +   struct netdev_flow_key *keys,
>>> +   uint32_t keys_size, odp_port_t in_port,
>>> +   void *pmd_handle)
>>> +{
>>> +uint32_t hitmask = 0;
>>> +uint32_t mask = 0;
>>> +struct dp_netdev_pmd_thread *pmd = pmd_handle;
>>> +struct dpif_miniflow_extract_impl *miniflow_funcs;
>>> +uint32_t impl_count = dpif_miniflow_extract_info_get(_funcs);
>>> +struct study_stats *stats = get_study_stats();
>>> +
>>> +/* Run traffic optimized miniflow_extract to collect the hitmask
>>> + * to be compared after certain packets have been hit to choose
>>> + * the best miniflow_extract version for that traffic. */
>>> +for (int i = MFEX_IMPL_START_IDX; i < impl_count; i++) {
>>> +if (miniflow_funcs[i].available) {
>>> +hitmask = miniflow_funcs[i].extract_func(packets, keys, 
>>> keys_size,
>>> + in_port, pmd_handle);
>>> +stats->impl_hitcount[i] += count_1bits(hitmask);
>>> +
>>> +/* If traffic is not classified than we dont overwrite the keys
>>> + * array in minfiflow implementations so its safe to create a
>>> + * mask for all those packets whose miniflow have been 
>>> created. */
>>> +mask |= hitmask;
>>> +}
>>> +}
>>> +stats->pkt_count += dp_packet_batch_size(packets);
>>> +
>>> +/* Choose the best implementation after a minimum packets have been
>>> + * processed. */
>>> +if (stats->pkt_count >= MFEX_MAX_COUNT) {
>>> +uint32_t best_func_index = MFEX_IMPL_START_IDX;
>>> +uint32_t max_hits = 0;
>>> +for (int i = MFEX_IMPL_START_IDX; i < impl_count; i++) {
>>> +if (stats->impl_hitcount[i] > max_hits) {
>>> +max_hits = stats->impl_hitcount[i];
>>> +best_func_index = i;
>>> +}
>>> +}
>>> +
>>> +if (max_hits >= MFEX_MIN_HIT_COUNT_FOR_USE) {
>>> +/* Set the implementation to index with max_hits. */
>>> +pmd->miniflow_extract_opt =
>>> +miniflow_funcs[best_func_index].extract_func;
>>> +VLOG_INFO("MFEX study chose impl %s: (hits %d/%d pkts)\n",
>>> +  miniflow_funcs[best_func_index].name, max_hits,
>>> +  stats->pkt_count);
>>
>> We have no idea which PMD the mode is selected for guess we might need to add

Re: [ovs-dev] [PATCH v2 6/8] ovs-thread: Quiesce when joining pthreads

2021-06-30 Thread Ilya Maximets
On 5/20/21 3:35 PM, Gaetan Rivet wrote:
> Joining pthreads makes the caller quiescent. It should register as such,
> as joined threads may wait on an RCU callback executing before quitting,
> deadlocking the caller.

Hi, Gaetan.

This patch doesn't look right to me.  The problem is that users of this
function have no idea that the quiescent state will be entered by this
function.  And this is really hard to track down, because it can be called
very deep inside some separate part of the code base that user at the
top level might not even know about.  For example, a lot of call chains
in ovsdb-server may lead to xpthread_join called from ovsdb/log.c.
Even though ovsdb-server is single-threaded now, insertion of the
ovsrcu_quiesce_start() into xpthread_join() will effectively mean that
ovsdb-server will never be able to use RCU if it becomes multi-threaded
someday, e.g. it will not be able to use CMAPs.

So, instead of doing that, callers should enter quiescent state before
joining, and this should be done at the highest level of a call chain
possible.  We have the same thing with cond_wait() implementation, you
may add similar comment to the join() function as we have for cond_wait().

Best regards, Ilya Maximets.
___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [v4 02/12] dpif-netdev: Add auto validation function for miniflow extract

2021-06-30 Thread Eelco Chaudron


On 29 Jun 2021, at 18:15, Amber, Kumar wrote:

> Hi Eelco ,
>
> Sorry the formatting seems broken on this email thread.
> Replies are inlined .

Thanks, looking forward to the v5 (hopefully once I have finished this version of 
the series). I’m currently at patch 10 ;)

> From: Eelco Chaudron 
> Sent: Tuesday, June 29, 2021 7:36 PM
> To: Amber, Kumar 
> Cc: Van Haaren, Harry ; d...@openvswitch.org; 
> i.maxim...@ovn.org; Stokes, Ian ; Flavio Leitner 
> 
> Subject: Re: [ovs-dev] [v4 02/12] dpif-netdev: Add auto validation function 
> for miniflow extract
>
>
> Not sure how you replied, but it’s hard to see which comments are mine, and 
> which are yours.
>
> On 29 Jun 2021, at 14:27, Amber, Kumar wrote:
>
> Hi Eelco,
>
> Thanks Again for reviews , Pls find my replies inline.
>
> From: Eelco Chaudron mailto:echau...@redhat.com>>
> Sent: Tuesday, June 29, 2021 5:14 PM
> To: Van Haaren, Harry 
> mailto:harry.van.haa...@intel.com>> ; Amber, 
> Kumar mailto:kumar.am...@intel.com>>
> Cc: d...@openvswitch.org ; 
> i.maxim...@ovn.org ; Stokes, Ian 
> mailto:ian.sto...@intel.com>> ; Flavio Leitner 
> mailto:f...@sysclose.org>>
> Subject: Re: [ovs-dev] [v4 02/12] dpif-netdev: Add auto validation function 
> for miniflow extract
>
> On 17 Jun 2021, at 18:27, Kumar Amber wrote:
>
> This patch introduced the auto-validation function which
> allows users to compare the batch of packets obtained from
> different miniflow implementations against the linear
> miniflow extract and return a hitmask.
>
> The autovalidator function can be triggered at runtime using the
> following command:
>
> $ ovs-appctl dpif-netdev/miniflow-parser-set autovalidator
>
> Signed-off-by: Kumar Amber 
> mailto:kumar.am...@intel.com>>
> Co-authored-by: Harry van Haaren 
> mailto:harry.van.haa...@intel.com>>
> Signed-off-by: Harry van Haaren 
> mailto:harry.van.haa...@intel.com>>
> ---
> lib/dpif-netdev-private-extract.c | 141 ++
> lib/dpif-netdev-private-extract.h | 15 
> lib/dpif-netdev.c | 2 +-
> 3 files changed, 157 insertions(+), 1 deletion(-)
>
> diff --git a/lib/dpif-netdev-private-extract.c 
> b/lib/dpif-netdev-private-extract.c
> index fcc56ef26..0741c19f9 100644
> --- a/lib/dpif-netdev-private-extract.c
> +++ b/lib/dpif-netdev-private-extract.c
> @@ -32,6 +32,11 @@ VLOG_DEFINE_THIS_MODULE(dpif_netdev_extract);
>
> /* Implementations of available extract options. */
> static struct dpif_miniflow_extract_impl mfex_impls[] = {
> + {
> + .probe = NULL,
> + .extract_func = dpif_miniflow_extract_autovalidator,
> + .name = "autovalidator",
> + },
> {
> .probe = NULL,
> .extract_func = NULL,
> @@ -84,3 +89,139 @@ dpif_miniflow_extract_info_get(struct 
> dpif_miniflow_extract_impl **out_ptr)
> *out_ptr = mfex_impls;
> return ARRAY_SIZE(mfex_impls);
> }
> +
> +uint32_t
> +dpif_miniflow_extract_autovalidator(struct dp_packet_batch *packets,
> + struct netdev_flow_key *keys,
> + uint32_t keys_size, odp_port_t in_port,
> + void *pmd_handle)
> +{
> + const size_t cnt = dp_packet_batch_size(packets);
> + uint16_t good_l2_5_ofs[NETDEV_MAX_BURST];
> + uint16_t good_l3_ofs[NETDEV_MAX_BURST];
> + uint16_t good_l4_ofs[NETDEV_MAX_BURST];
> + uint16_t good_l2_pad_size[NETDEV_MAX_BURST];
> + struct dp_packet *packet;
> + struct dp_netdev_pmd_thread *pmd = pmd_handle;
> + struct dpif_miniflow_extract_impl *miniflow_funcs;
> +
> + int32_t mfunc_count = dpif_miniflow_extract_info_get(_funcs);
> + if (mfunc_count < 0) {
>
> In theory 0 could not be returned, but just to cover the corner case, can we 
> change this to include zero?
>
> The code has been adapted as per Flavio comments so will not be a concern.
>
> + pmd->miniflow_extract_opt = NULL;
>
> Guess the above needs to be atomic.
>
> Removed based on Flavio comments.
>
> + VLOG_ERR("failed to get miniflow extract function implementations\n");
>
> Capital F to be in sync with your other error messages?
>
> Removed based on Flavio comments.
>
> + return 0;
> + }
> + ovs_assert(keys_size >= cnt);
>
>
> I don’t think we should assert here. Just return an error like above, so in 
> production, we get notified, and this implementation gets disabled.
>
> Actually we do else one would most likely be overwriting the assigned array 
> space for keys and will hit a Seg fault at some point.
>
> And hence we would like to know at the compile time if this is the case.
>
> But this is not a compile time check, it will crash OVS. You could just do 
> this:
>
> if (keys_size < cnt) {
> pmd->miniflow_extract_opt = NULL;
> VLOG_ERR(“Invalid key size supplied etc. etc.\n”);
> return 0;
> }
>
> Or you could process up to key_size packets
>
> Reply:   Sure, I have taken the first approach in v5 as it is safe and avoids any 
> risk of a seg fault.
>
> + struct netdev_flow_key test_keys[NETDEV_MAX_BURST];
> +
> + /* Run scalar miniflow_extract to get default result. */
> + DP_PACKET_BATCH_FOR_EACH (i, packet, packets) {
> + pkt_metadata_init(>md, 

Re: [ovs-dev] [v4 01/12] dpif-netdev: Add command line and function pointer for miniflow extract

2021-06-30 Thread Eelco Chaudron


On 29 Jun 2021, at 17:23, Van Haaren, Harry wrote:

>> -Original Message-
>> From: Eelco Chaudron 
>> Sent: Tuesday, June 29, 2021 2:56 PM
>> To: Amber, Kumar 
>> Cc: Van Haaren, Harry ; d...@openvswitch.org;
>> i.maxim...@ovn.org; Flavio Leitner 
>> Subject: Re: [ovs-dev] [v4 01/12] dpif-netdev: Add command line and function
>> pointer for miniflow extract
>>
>>
>>
>> On 29 Jun 2021, at 13:59, Amber, Kumar wrote:
>>
>>> Hi Eelco,
>>>
>>> Thanks a lot for the comments and my replies are inline.
>>>
>>
>> 
>>
> +return;
> +}
> +
> +/* Add all mfex functions to reply string. */
> +struct ds reply = DS_EMPTY_INITIALIZER;
> +ds_put_cstr(, "Available Optimized Miniflow Extracts:\n");
> +for (uint32_t i = 0; i < count; i++) {
> +ds_put_format(, "  %s (available: %s)\n",
> +  mfex_impls[i].name, mfex_impls[i].available ?
> +  "True" : "False");
> +}
> +unixctl_command_reply(conn, ds_cstr());
> +ds_destroy();

 I think this command must output the currently configured values for all
 data paths, or else there is no easy way to see the current setting.

>>>
>>> We are planning to do a separate patch for implementing the same for DPIF,
>>> MFEX and DPCLS.
>>>
>>
>> If you do, please do it ASAP, as I think this feature should not get in 
>> without being
>> able to see in the field what the actual configuration is.
>
> Hi Eelco,
>  
> OK it seems that there's a lot of focus around visibility of implementation 
> used here.
> That's good and makes sense, lets focus to get that improved.
>  
> So moving forward, how about the below output for each command?
> (Note, I had a quick chat with Amber & Cian over IM here to get to the below!)
>  
> The mapping is not always very obvious, as e.g. DPCLS ports can be 
> re-assigned between PMD threads.
> (Note the implementation of DPCLS might be a bit tricky, as specialized 
> subtable searches
> aren't externally exposed. I'm confident we'll find a solution.)
>  
> DPIF and MFEX are enabled per-PMD thread, and are always consistent for all 
> datapath threads.

Not sure what you meant by the “always consistent for all datapath threads”? 
If you mean all PMDs from the same data-path have the same MFEX function, this 
is not true once you run the study mode.

>  
> Today's commands have very similar output, now with (name: value) data points 
> added.
> Example for DPIF:   (pmds: 15,16)  means pmd threads 15 and 16 are running 
> that impl.
>  
> Thoughts on the below commands, and added info?  Regards, -Harry

I think the output for the additional PMDs is what we need. I’ve never looked 
at the DPCLS implementation, so no idea if the number of ports is enough, or we 
need the explicit ports, etc.? I’ll let others who reviewed that reply ;)


Thanks Harry for digging into this!

>  
> $ ovs-appctl dpif-netdev/subtable-lookup-prio-get
> Available lookup functions (priority : name)
>   0 : autovalidator (ports: none)
>   1 : generic (ports: none)
>   3 : avx512_gather (ports: 2) # number of DPCLS ports using this impl
>  
> $ ovs-appctl dpif-netdev/dpif-set-impl
> Available DPIF impls:
>   dpif_scalar (pmds: 15,16)# PMD thread ids using this DPIF impl
>   dpif_avx512 (pmds: none)
>  
> $ ovs-appctl  dpif-netdev/miniflow-parser-get
> Available Optimized Miniflow Extracts:
>   autovalidator (available: True, pmds: none)
>   disable (available: True, pmds: none)
>   study (available: True, pmds: none)
>   avx512_vbmi_ipv4_udp (available: True, pmds: 15,16) # PMD thread 
> ids using this MFEX impl
> . 

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH ovn] ovs-sandbox: Allow specifying initial contents for NB and SB database.

2021-06-30 Thread Dumitru Ceara
On 6/29/21 8:51 PM, Ben Pfaff wrote:
> This makes it easier to test northd behavior with particular database
> contents, like the ones that Dumitru posted to the mailing list:
> https://mail.openvswitch.org/pipermail/ovs-dev/2021-June/384519.html
> 
> You just do something like this:
> make sandbox SANDBOXFLAGS="--nbdb-source=$HOME/Downloads/ovnnb_db.db 
> --sbdb-source=$HOME/Downloads/ovnsb_db.db --ddlog"
> 
> Signed-off-by: Ben Pfaff 
> CC: Dumitru Ceara 
> ---

Hi Ben,

Thanks for this!

Nit: I think the 0-day bot is right about not using tabs in ovs-sandbox.
We currently only use spaces in that file.

With tabs converted to spaces:

Acked-by: Dumitru Ceara 

Regards,
Dumitru

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH v7 1/4] conntrack: handle already natted packets

2021-06-30 Thread Dumitru Ceara
On 6/29/21 6:04 PM, Paolo Valerio wrote:
> Dumitru Ceara  writes:
> 
>> On 6/25/21 2:01 PM, Paolo Valerio wrote:
>>> Dumitru Ceara  writes:
>>>
 On 6/21/21 12:06 PM, Paolo Valerio wrote:
> when a packet gets dnatted and then recirculated, it could be possible
> that it matches another rule that performs another nat action.
> The kernel datapath handles this situation turning to a no-op the
> second nat action, so natting only once the packet.  In the userspace
> datapath instead, when the ct action gets executed, an initial lookup
> of the translated packet fails to retrieve the connection related to
> the packet, leading to the creation of a new entry in ct for the src
> nat action with a subsequent failure of the connection establishment.
>
> with the following flows:
>
> table=0,priority=30,in_port=1,ip,nw_dst=192.168.2.100,actions=ct(commit,nat(dst=10.1.1.2:80),table=1)
> table=0,priority=20,in_port=2,ip,actions=ct(nat,table=1)
> table=0,priority=10,ip,actions=resubmit(,2)
> table=0,priority=10,arp,actions=NORMAL
> table=0,priority=0,actions=drop
> table=1,priority=5,ip,actions=ct(commit,nat(src=10.1.1.240),table=2)
> table=2,in_port=ovs-l0,actions=2
> table=2,in_port=ovs-r0,actions=1
>
> establishing a connection from 10.1.1.1 to 192.168.2.100 the outcome is:
>
> tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=4000,dport=80),reply=(src=10.1.1.2,dst=10.1.1.240,sport=80,dport=4000),protoinfo=(state=ESTABLISHED)
> tcp,orig=(src=10.1.1.1,dst=192.168.2.100,sport=4000,dport=80),reply=(src=10.1.1.2,dst=10.1.1.1,sport=80,dport=4000),protoinfo=(state=ESTABLISHED)
>
> with this patch applied the outcome is:
>
> tcp,orig=(src=10.1.1.1,dst=192.168.2.100,sport=4000,dport=80),reply=(src=10.1.1.2,dst=10.1.1.1,sport=80,dport=4000),protoinfo=(state=ESTABLISHED)
>
> The patch performs, for already natted packets, a lookup of the
> reverse key in order to retrieve the related entry, it also adds a
> test case that besides testing the scenario ensures that the other ct
> actions are executed.
>
> Reported-by: Dumitru Ceara 
> Signed-off-by: Paolo Valerio 
> ---

 Hi Paolo,

 Thanks for the patch!  I tested it and it works fine for OVN.  I have a
 few comments/questions below.

>>>
>>> Thanks for the test and for the review.
>>>
>  lib/conntrack.c |   30 +-
>  tests/system-traffic.at |   35 +++
>  2 files changed, 64 insertions(+), 1 deletion(-)
>
> diff --git a/lib/conntrack.c b/lib/conntrack.c
> index 99198a601..7e8b16a3e 100644
> --- a/lib/conntrack.c
> +++ b/lib/conntrack.c
> @@ -1281,6 +1281,33 @@ process_one_fast(uint16_t zone, const uint32_t 
> *setmark,
>  }
>  }
>  
> +static void
> +initial_conn_lookup(struct conntrack *ct, struct conn_lookup_ctx *ctx,
> + long long now, bool natted)

 Nit: indentation.

>>>
>>> ACK
>>>
> +{
> +bool found;
> +
> +if (natted) {
> +/* if the packet has been already natted (e.g. a previous
> + * action took place), retrieve it performing a lookup of its
> + * reverse key. */
> +conn_key_reverse(>key);
> +}
> +
> +found = conn_key_lookup(ct, >key, ctx->hash,
> +now, >conn, >reply);
> +if (natted) {
> +if (OVS_LIKELY(found)) {
> +ctx->reply = !ctx->reply;
> +ctx->key = ctx->reply ? ctx->conn->rev_key : ctx->conn->key;
> +ctx->hash = conn_key_hash(>key, ct->hash_basis);
> +} else {
> +/* in case of failure restore the initial key. */
> +conn_key_reverse(>key);

 Can the lookup actually fail?  I mean, if the packet was natted, there
 must have been a connection on which it got natted.  Anyway, I think we
 should probably also increment a coverage counter.  I guess dropping
 such packets would be hard, right?

>>>
>>> I agree, it should not fail. If I'm not missing something, if it
>>> happens, it should be because there's been a problem somewhere else
>>> (e.g. a polluted ct_state value or more in general an unexpected
>>> scenario). For this reason, I think it's better not to drop it or even
>>> set it as invalid.
>>
>> I'm not sure, won't this create horrible to debug bugs when packets get
>> forwarded in an unexpected way?  Setting it as invalid isn't good enough
>> in my opinion because there might be flows later in the pipeline that
>> perform actions (other than drop) on packets with ct_state +inv.
>>
>> The problem I have (because I don't know the conntrack code) is that I
>> see no easy way to drop the packet.
>>
>>>
>>> Yes, the coverage counter gives more meaning to the else branch.
>>> 

Re: [ovs-dev] [PATCH v11] ofproto-dpif: APIs and CLI option to add/delete static fdb entry

2021-06-30 Thread Eelco Chaudron



On 29 Jun 2021, at 22:43, Vasu Dasari wrote:

> Currently there is an option to add/flush/show ARP/ND neighbor. This covers L3
> side.  For L2 side, there is only fdb show command. This patch gives an option
> to add/del an fdb entry via ovs-appctl.
>
> CLI command looks like:
>
> To add:
> ovs-appctl fdb/add
> ovs-appctl fdb/add br0 p1 0 50:54:00:00:00:05
>
> To del:
> ovs-appctl fdb/del   
> ovs-appctl fdb/del br0 0 50:54:00:00:00:05
>
> Added two new APIs to provide convenient interface to add and delete 
> static-macs.
> bool xlate_add_static_mac_entry(const struct ofproto_dpif *, ofp_port_t 
> in_port,
>struct eth_addr dl_src, int vlan);
> bool xlate_delete_static_mac_entry(const struct ofproto_dpif *,
>   struct eth_addr dl_src, int vlan);
>
> 1. Static entry should not age. To indicate that entry being programmed is a 
> static entry,
>'expires' field in 'struct mac_entry' will be set to a 
> MAC_ENTRY_AGE_STATIC_ENTRY. A
>check for this value is made while deleting mac entry as part of regular 
> aging process.
> 2. Another change to the mac-update logic: when a packet with same dl_src as 
> that of a
>static-mac entry arrives on any port, the logic will not modify the 
> expires field.
> 3. While flushing fdb entries, made sure static ones are not evicted.
> 4. Updated "ovs-appctl fdb/stats-show br0" to display number of static entries 
> in switch
>
> Added following tests:
>   ofproto-dpif - static-mac add/del/flush
>   ofproto-dpif - static-mac mac moves
>
> Signed-off-by: Vasu Dasari 
> Reported-at: 
> https://mail.openvswitch.org/pipermail/ovs-discuss/2019-June/048894.html
> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1597752
> Tested-by: Eelco Chaudron 
> Acked-by: Eelco Chaudron 
> ---

Small doc change, so I did no testing, only a review.

Acked-by: Eelco Chaudron 

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev