[ovs-dev] [PATCH ovn] northd: Add ACL label

2021-07-12 Thread Priyankar Jain
Allow adding label to an ACL to identify which ACL allowed a particular
flow in the connection tracking table.

The ACL label covers 32 bits at the end of ct_label. Since only allowed
connections are committed, only "allow" and "allow-related" ACLs can
have the label.

If the ACL allowing the connection changes, the label associated with the
new ACL gets updated in the ct_label field. This is achieved by committing
every packet that hits the ACL with the label to the connection tracking
table.
In case the new ACL doesn't have a label, the ct_label field is not
cleared. This is done to prevent any performance change with ACLs that
don't have label set.
For the packets which hits an ACL without label, the behaviour remains the
same as before with respect to the conntrack commit.

Performance:
We used ftrace to measure the time taken by an extra conntrack commit for
the packets hitting the ACL with label. We measured the time taken to
execute the ovs_ct_execute call inside the sock_sendmsg call.

ACL used :-
 from-lport  2000 (tcp && ip4.src == 10.0.0.11 && ip4.dst == 10.0.0.12) 
allow-related --label=0x1234

It was observed that the extra ovs_ct_execute call accounted for 1-2%
of the round trip time (sock_sendmsg duration). The actual percentage
is expected to be lesser since it doesn't take into account the tracing
overhead which is substantial for smaller functions.

Signed-off-by: Priyankar Jain 
---
 lib/logical-fields.c  |   2 +
 northd/ovn-northd.c   |  53 +++
 northd/ovn_northd.dl  |  75 +++---
 ovn-nb.ovsschema  |   3 +-
 ovn-nb.xml|  12 +
 tests/ovn-nbctl.at|  22 
 tests/ovn-northd.at   | 107 --
 tests/ovn.at  |   1 +
 utilities/ovn-nbctl.8.xml |   2 +-
 utilities/ovn-nbctl.c |  32 +++-
 10 files changed, 273 insertions(+), 36 deletions(-)

diff --git a/lib/logical-fields.c b/lib/logical-fields.c
index 72853013e..7b3d431e0 100644
--- a/lib/logical-fields.c
+++ b/lib/logical-fields.c
@@ -146,6 +146,8 @@ ovn_init_symtab(struct shash *symtab)
 "ct_label[32..79]", WR_CT_COMMIT);
 expr_symtab_add_subfield_scoped(symtab, "ct_label.ecmp_reply_port", NULL,
 "ct_label[80..95]", WR_CT_COMMIT);
+expr_symtab_add_subfield_scoped(symtab, "ct_label.label", NULL,
+"ct_label[96..127]", WR_CT_COMMIT);
 
 expr_symtab_add_field(symtab, "ct_state", MFF_CT_STATE, NULL, false);
 
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index 562dc62b2..92590a49d 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -238,6 +238,7 @@ enum ovn_stage {
 #define REGBIT_ACL_HINT_BLOCK "reg0[10]"
 #define REGBIT_LKUP_FDB   "reg0[11]"
 #define REGBIT_HAIRPIN_REPLY  "reg0[12]"
+#define REGBIT_ACL_LABEL  "reg0[13]"
 
 #define REG_ORIG_DIP_IPV4 "reg1"
 #define REG_ORIG_DIP_IPV6 "xxreg1"
@@ -270,6 +271,9 @@ enum ovn_stage {
 #define REG_SRC_IPV4 "reg1"
 #define REG_SRC_IPV6 "xxreg1"
 
+/* Register used for setting a label for ACLs in a Logical Switch. */
+#define REG_LABEL "reg3"
+
 #define FLAGBIT_NOT_VXLAN "flags[1] == 0"
 
 /*
@@ -278,14 +282,15 @@ enum ovn_stage {
  * Logical Switch pipeline:
  * ++--+---+--+
  * | R0 | REGBIT_{CONNTRACK/DHCP/DNS}  |   |  |
- * || REGBIT_{HAIRPIN/HAIRPIN_REPLY}   | X |  |
- * || REGBIT_ACL_HINT_{ALLOW_NEW/ALLOW/DROP/BLOCK} | X |  |
+ * || REGBIT_{HAIRPIN/HAIRPIN_REPLY}   |   |  |
+ * || REGBIT_ACL_HINT_{ALLOW_NEW/ALLOW/DROP/BLOCK} |   |  |
+ * || REGBIT_ACL_LABEL | X |  |
  * ++--+ X |  |
  * | R1 | ORIG_DIP_IPV4 (>= IN_STATEFUL)   | R |  |
  * ++--+ E |  |
  * | R2 | ORIG_TP_DPORT (>= IN_STATEFUL)   | G |  |
  * ++--+ 0 |  |
- * | R3 |   UNUSED |   |  |
+ * | R3 |  ACL LABEL   |   |  |
  * ++--+---+--+
  * | R4 |   UNUSED |   |  |
  * ++--+ X |   ORIG_DIP_IPV6  |
@@ -5732,7 +5737,12 @@ consider_acl(struct hmap *

Re: [ovs-dev] [PATCH ovn] northd: Add ACL label

2021-07-20 Thread Priyankar Jain

ping for review.

On 12/07/21 1:04 pm, Priyankar Jain wrote:

Allow adding label to an ACL to identify which ACL allowed a particular
flow in the connection tracking table.

The ACL label covers 32 bits at the end of ct_label. Since only allowed
connections are committed, only "allow" and "allow-related" ACLs can
have the label.

If the ACL allowing the connection changes, the label associated with the
new ACL gets updated in the ct_label field. This is achieved by committing
every packet that hits the ACL with the label to the connection tracking
table.
In case the new ACL doesn't have a label, the ct_label field is not
cleared. This is done to prevent any performance change with ACLs that
don't have label set.
For the packets which hits an ACL without label, the behaviour remains the
same as before with respect to the conntrack commit.

Performance:
We used ftrace to measure the time taken by an extra conntrack commit for
the packets hitting the ACL with label. We measured the time taken to
execute the ovs_ct_execute call inside the sock_sendmsg call.

ACL used :-
  from-lport  2000 (tcp && ip4.src == 10.0.0.11 && ip4.dst == 10.0.0.12) 
allow-related --label=0x1234

It was observed that the extra ovs_ct_execute call accounted for 1-2%
of the round trip time (sock_sendmsg duration). The actual percentage
is expected to be lesser since it doesn't take into account the tracing
overhead which is substantial for smaller functions.

Signed-off-by: Priyankar Jain 
---
  lib/logical-fields.c  |   2 +
  northd/ovn-northd.c   |  53 +++
  northd/ovn_northd.dl  |  75 +++---
  ovn-nb.ovsschema  |   3 +-
  ovn-nb.xml|  12 +
  tests/ovn-nbctl.at|  22 
  tests/ovn-northd.at   | 107 --
  tests/ovn.at  |   1 +
  utilities/ovn-nbctl.8.xml |   2 +-
  utilities/ovn-nbctl.c |  32 +++-
  10 files changed, 273 insertions(+), 36 deletions(-)

diff --git a/lib/logical-fields.c b/lib/logical-fields.c
index 72853013e..7b3d431e0 100644
--- a/lib/logical-fields.c
+++ b/lib/logical-fields.c
@@ -146,6 +146,8 @@ ovn_init_symtab(struct shash *symtab)
  "ct_label[32..79]", WR_CT_COMMIT);
  expr_symtab_add_subfield_scoped(symtab, "ct_label.ecmp_reply_port", NULL,
  "ct_label[80..95]", WR_CT_COMMIT);
+expr_symtab_add_subfield_scoped(symtab, "ct_label.label", NULL,
+"ct_label[96..127]", WR_CT_COMMIT);
  
  expr_symtab_add_field(symtab, "ct_state", MFF_CT_STATE, NULL, false);
  
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c

index 562dc62b2..92590a49d 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -238,6 +238,7 @@ enum ovn_stage {
  #define REGBIT_ACL_HINT_BLOCK "reg0[10]"
  #define REGBIT_LKUP_FDB   "reg0[11]"
  #define REGBIT_HAIRPIN_REPLY  "reg0[12]"
+#define REGBIT_ACL_LABEL  "reg0[13]"
  
  #define REG_ORIG_DIP_IPV4 "reg1"

  #define REG_ORIG_DIP_IPV6 "xxreg1"
@@ -270,6 +271,9 @@ enum ovn_stage {
  #define REG_SRC_IPV4 "reg1"
  #define REG_SRC_IPV6 "xxreg1"
  
+/* Register used for setting a label for ACLs in a Logical Switch. */

+#define REG_LABEL "reg3"
+
  #define FLAGBIT_NOT_VXLAN "flags[1] == 0"
  
  /*

@@ -278,14 +282,15 @@ enum ovn_stage {
   * Logical Switch pipeline:
   * 
++--+---+--+
   * | R0 | REGBIT_{CONNTRACK/DHCP/DNS}  |   |  
|
- * || REGBIT_{HAIRPIN/HAIRPIN_REPLY}   | X |  |
- * || REGBIT_ACL_HINT_{ALLOW_NEW/ALLOW/DROP/BLOCK} | X |  |
+ * || REGBIT_{HAIRPIN/HAIRPIN_REPLY}   |   |  |
+ * || REGBIT_ACL_HINT_{ALLOW_NEW/ALLOW/DROP/BLOCK} |   |  |
+ * || REGBIT_ACL_LABEL | X |  |
   * ++--+ X |  
|
   * | R1 | ORIG_DIP_IPV4 (>= IN_STATEFUL)   | R |  
|
   * ++--+ E |  
|
   * | R2 | ORIG_TP_DPORT (>= IN_STATEFUL)   | G |  
|
   * ++--+ 0 |  
|
- * | R3 |   UNUSED |   |  |
+ * | R3 |  ACL LABEL   |   |  |
   * 
++--+---+--+
   * | R4 |   UNUSED |   |  
|
   * ++

Re: [ovs-dev] [PATCH ovn] northd: Add ACL label

2021-07-23 Thread Priyankar Jain

Sure. Thanks Numan.

Best Regards,
Priyankar Jain

On 23/07/21 7:57 pm, Numan Siddique wrote:

On Tue, Jul 20, 2021 at 12:17 PM Priyankar Jain
 wrote:


ping for review.


Hi Priyankar,

I'll take a look at the patch and come back to you with comments.

Thanks
Numan



On 12/07/21 1:04 pm, Priyankar Jain wrote:

Allow adding label to an ACL to identify which ACL allowed a particular
flow in the connection tracking table.

The ACL label covers 32 bits at the end of ct_label. Since only allowed
connections are committed, only "allow" and "allow-related" ACLs can
have the label.

If the ACL allowing the connection changes, the label associated with the
new ACL gets updated in the ct_label field. This is achieved by committing
every packet that hits the ACL with the label to the connection tracking
table.
In case the new ACL doesn't have a label, the ct_label field is not
cleared. This is done to prevent any performance change with ACLs that
don't have label set.
For the packets which hits an ACL without label, the behaviour remains the
same as before with respect to the conntrack commit.

Performance:
We used ftrace to measure the time taken by an extra conntrack commit for
the packets hitting the ACL with label. We measured the time taken to
execute the ovs_ct_execute call inside the sock_sendmsg call.

ACL used :-
   from-lport  2000 (tcp && ip4.src == 10.0.0.11 && ip4.dst == 10.0.0.12) 
allow-related --label=0x1234

It was observed that the extra ovs_ct_execute call accounted for 1-2%
of the round trip time (sock_sendmsg duration). The actual percentage
is expected to be lesser since it doesn't take into account the tracing
overhead which is substantial for smaller functions.

Signed-off-by: Priyankar Jain 
---
   lib/logical-fields.c  |   2 +
   northd/ovn-northd.c   |  53 +++
   northd/ovn_northd.dl  |  75 +++---
   ovn-nb.ovsschema  |   3 +-
   ovn-nb.xml|  12 +
   tests/ovn-nbctl.at|  22 
   tests/ovn-northd.at   | 107 --
   tests/ovn.at  |   1 +
   utilities/ovn-nbctl.8.xml |   2 +-
   utilities/ovn-nbctl.c |  32 +++-
   10 files changed, 273 insertions(+), 36 deletions(-)

diff --git a/lib/logical-fields.c b/lib/logical-fields.c
index 72853013e..7b3d431e0 100644
--- a/lib/logical-fields.c
+++ b/lib/logical-fields.c
@@ -146,6 +146,8 @@ ovn_init_symtab(struct shash *symtab)
   "ct_label[32..79]", WR_CT_COMMIT);
   expr_symtab_add_subfield_scoped(symtab, "ct_label.ecmp_reply_port", NULL,
   "ct_label[80..95]", WR_CT_COMMIT);
+expr_symtab_add_subfield_scoped(symtab, "ct_label.label", NULL,
+"ct_label[96..127]", WR_CT_COMMIT);

   expr_symtab_add_field(symtab, "ct_state", MFF_CT_STATE, NULL, false);

diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index 562dc62b2..92590a49d 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -238,6 +238,7 @@ enum ovn_stage {
   #define REGBIT_ACL_HINT_BLOCK "reg0[10]"
   #define REGBIT_LKUP_FDB   "reg0[11]"
   #define REGBIT_HAIRPIN_REPLY  "reg0[12]"
+#define REGBIT_ACL_LABEL  "reg0[13]"

   #define REG_ORIG_DIP_IPV4 "reg1"
   #define REG_ORIG_DIP_IPV6 "xxreg1"
@@ -270,6 +271,9 @@ enum ovn_stage {
   #define REG_SRC_IPV4 "reg1"
   #define REG_SRC_IPV6 "xxreg1"

+/* Register used for setting a label for ACLs in a Logical Switch. */
+#define REG_LABEL "reg3"
+
   #define FLAGBIT_NOT_VXLAN "flags[1] == 0"

   /*
@@ -278,14 +282,15 @@ enum ovn_stage {
* Logical Switch pipeline:
* 
++--+---+--+
* | R0 | REGBIT_{CONNTRACK/DHCP/DNS}  |   | 
 |
- * || REGBIT_{HAIRPIN/HAIRPIN_REPLY}   | X |  |
- * || REGBIT_ACL_HINT_{ALLOW_NEW/ALLOW/DROP/BLOCK} | X |  |
+ * || REGBIT_{HAIRPIN/HAIRPIN_REPLY}   |   |  |
+ * || REGBIT_ACL_HINT_{ALLOW_NEW/ALLOW/DROP/BLOCK} |   |  |
+ * || REGBIT_ACL_LABEL | X |  |
* ++--+ X | 
 |
* | R1 | ORIG_DIP_IPV4 (>= IN_STATEFUL)   | R | 
 |
* ++--+ E | 
 |
* | R2 | ORIG_TP_DPORT (>= IN_STATEFUL)   | G | 
 |
* ++--+ 0 | 
 |
- * | R3 |   UNUSED |   |

[ovs-dev] [PATCH ovn v2] northd: Add ACL label

2021-07-29 Thread Priyankar Jain
Allow adding label to an ACL to identify which ACL allowed a particular
flow in the connection tracking table.

The ACL label covers 32 bits at the end of ct_label. Since only allowed
connections are committed, only "allow" and "allow-related" ACLs can
have the label.

If the ACL allowing the connection changes, the label associated with the
new ACL gets updated in the ct_label field. This is achieved by committing
every packet that hits the ACL with the label to the connection tracking
table.
In case the new ACL doesn't have a label, the ct_label field is not
cleared. This is done to prevent any performance change with ACLs that
don't have label set.
For the packets which hits an ACL without label, the behaviour remains the
same as before with respect to the conntrack commit.

Performance:
We used ftrace to measure the time taken by an extra conntrack commit for
the packets hitting the ACL with label. We measured the time taken to
execute the ovs_ct_execute call inside the sock_sendmsg call.

ACL used :-
 from-lport  2000 (tcp && ip4.src == 10.0.0.11 && ip4.dst == 10.0.0.12) 
allow-related --label=1234

It was observed that the extra ovs_ct_execute call accounted for 1-2%
of the round trip time (sock_sendmsg duration). The actual percentage
is expected to be lesser since it doesn't take into account the tracing
overhead which is substantial for smaller functions.

Signed-off-by: Priyankar Jain 
---

v1 -> v2
* Changed type of label from string (hex) to unsigned integer.
* Added two system tests for checking the correct ct_label programming.
* Updated ovn-northd.8.xml with the new logical flows added as part of
  this patch.
* Rebased with latest master.

 lib/logical-fields.c  |   2 +
 northd/ovn-northd.8.xml   |  26 +++-
 northd/ovn-northd.c   |  53 +++--
 northd/ovn_northd.dl  |  76 +---
 ovn-nb.ovsschema  |   7 +-
 ovn-nb.xml|  12 ++
 tests/ovn-nbctl.at|  19 +++
 tests/ovn-northd.at   | 107 -
 tests/ovn.at  |   1 +
 tests/system-ovn.at   | 244 ++
 utilities/ovn-nbctl.8.xml |   2 +-
 utilities/ovn-nbctl.c |  38 +-
 12 files changed, 545 insertions(+), 42 deletions(-)

diff --git a/lib/logical-fields.c b/lib/logical-fields.c
index 72853013e..7b3d431e0 100644
--- a/lib/logical-fields.c
+++ b/lib/logical-fields.c
@@ -146,6 +146,8 @@ ovn_init_symtab(struct shash *symtab)
 "ct_label[32..79]", WR_CT_COMMIT);
 expr_symtab_add_subfield_scoped(symtab, "ct_label.ecmp_reply_port", NULL,
 "ct_label[80..95]", WR_CT_COMMIT);
+expr_symtab_add_subfield_scoped(symtab, "ct_label.label", NULL,
+"ct_label[96..127]", WR_CT_COMMIT);
 
 expr_symtab_add_field(symtab, "ct_state", MFF_CT_STATE, NULL, false);
 
diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index 99a19f853..641372b77 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -670,13 +670,19 @@
 the next; action.  If there are any stateful ACLs
 on this datapath, then allow ACLs translate to
 ct_commit; next; (which acts as a hint for the next tables
-to commit the connection to conntrack),
+to commit the connection to conntrack). In case the ACL
+has a label then reg3 is loaded with the label value and
+reg0[13] bit is set to 1 (which acts as a hint for the
+next tables to commit the label to conntrack).
   
   
 allow-related ACLs translate into logical
 flows with the ct_commit(ct_label=0/1); next; actions
 for new connections and reg0[1] = 1; next; for existing
-connections.
+connections.  In case the ACL has a label then
+reg3 is loaded with the label value and
+reg0[13] bit is set to 1 (which acts as a hint for the
+next tables to commit the label to conntrack).
   
   
 allow-stateless ACLs translate into logical
@@ -876,9 +882,19 @@
   
 
   
-A priority-100 flow commits packets to connection tracker using
-ct_commit; next; action based on a hint provided by
-the previous tables (with a match for reg0[1] == 1).
+A priority 100 flow is added which commits the packet to the conntrack
+and sets the most significant 32-bits of ct_label with the
+reg3 value based on the hint provided by previous tables
+(with a match for reg0[1] == 1 && reg0[13] == 1).
+This is used by the ACLs with label to commit the label
+value to conntrack.
+  
+
+  
+For ACLs without label, a second priority-100 flow commits
+packets to connection tracker using ct_commit; next;
+action based on a hint provided by the previous tabl

[ovs-dev] [PATCH ovn v3] northd: Add ACL label

2021-08-01 Thread Priyankar Jain
Allow adding label to an ACL to identify which ACL allowed a particular
flow in the connection tracking table.

The ACL label covers 32 bits at the end of ct_label. Since only allowed
connections are committed, only "allow" and "allow-related" ACLs can
have the label.

If the ACL allowing the connection changes, the label associated with the
new ACL gets updated in the ct_label field. This is achieved by committing
every packet that hits the ACL with the label to the connection tracking
table.
In case the new ACL doesn't have a label, the ct_label field is not
cleared. This is done to prevent any performance change with ACLs that
don't have label set.
For the packets which hits an ACL without label, the behaviour remains the
same as before with respect to the conntrack commit.

Performance:
We used ftrace to measure the time taken by an extra conntrack commit for
the packets hitting the ACL with label. We measured the time taken to
execute the ovs_ct_execute call inside the sock_sendmsg call.

ACL used :-
 from-lport  2000 (tcp && ip4.src == 10.0.0.11 && ip4.dst == 10.0.0.12) 
allow-related --label=1234

It was observed that the extra ovs_ct_execute call accounted for 1-2%
of the round trip time (sock_sendmsg duration). The actual percentage
is expected to be lesser since it doesn't take into account the tracing
overhead which is substantial for smaller functions.

Signed-off-by: Priyankar Jain 
Acked-by: Numan Siddique 
---

v2 -> v3
* Added NEWS entry.

v1 -> v2
* Changed type of label from string (hex) to unsigned integer.
* Added two system tests for checking the correct ct_label programming.
* Updated ovn-northd.8.xml with the new logical flows added as part of
  this patch.
* Rebased with latest master.

 NEWS  |   3 +
 lib/logical-fields.c  |   2 +
 northd/ovn-northd.8.xml   |  26 +++-
 northd/ovn-northd.c   |  53 +++--
 northd/ovn_northd.dl  |  76 +---
 ovn-nb.ovsschema  |   7 +-
 ovn-nb.xml|  12 ++
 tests/ovn-nbctl.at|  19 +++
 tests/ovn-northd.at   | 107 -
 tests/ovn.at  |   1 +
 tests/system-ovn.at   | 244 ++
 utilities/ovn-nbctl.8.xml |   2 +-
 utilities/ovn-nbctl.c |  38 +-
 13 files changed, 548 insertions(+), 42 deletions(-)

diff --git a/NEWS b/NEWS
index eefdae9bc..1ba739975 100644
--- a/NEWS
+++ b/NEWS
@@ -1,6 +1,9 @@
 Post-v21.06.0
 -
   - Added Control Plane Protection support (control plane traffic metering).
+  - Introduced a new "label" field for "allow" and "allow-related" ACLs
+which helps in debugging/backtracking the ACL which allowed a particular
+connection.
 
 OVN v21.06.0 - 18 Jun 2021
 -
diff --git a/lib/logical-fields.c b/lib/logical-fields.c
index 72853013e..7b3d431e0 100644
--- a/lib/logical-fields.c
+++ b/lib/logical-fields.c
@@ -146,6 +146,8 @@ ovn_init_symtab(struct shash *symtab)
 "ct_label[32..79]", WR_CT_COMMIT);
 expr_symtab_add_subfield_scoped(symtab, "ct_label.ecmp_reply_port", NULL,
 "ct_label[80..95]", WR_CT_COMMIT);
+expr_symtab_add_subfield_scoped(symtab, "ct_label.label", NULL,
+"ct_label[96..127]", WR_CT_COMMIT);
 
 expr_symtab_add_field(symtab, "ct_state", MFF_CT_STATE, NULL, false);
 
diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index 99a19f853..641372b77 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -670,13 +670,19 @@
 the next; action.  If there are any stateful ACLs
 on this datapath, then allow ACLs translate to
 ct_commit; next; (which acts as a hint for the next tables
-to commit the connection to conntrack),
+to commit the connection to conntrack). In case the ACL
+has a label then reg3 is loaded with the label value and
+reg0[13] bit is set to 1 (which acts as a hint for the
+next tables to commit the label to conntrack).
   
   
 allow-related ACLs translate into logical
 flows with the ct_commit(ct_label=0/1); next; actions
 for new connections and reg0[1] = 1; next; for existing
-connections.
+connections.  In case the ACL has a label then
+reg3 is loaded with the label value and
+reg0[13] bit is set to 1 (which acts as a hint for the
+next tables to commit the label to conntrack).
   
   
 allow-stateless ACLs translate into logical
@@ -876,9 +882,19 @@
   
 
   
-A priority-100 flow commits packets to connection tracker using
-ct_commit; next; action based on a hint provided by
-the previous tables (with a match for reg0[1] == 1).
+A pri

[ovs-dev] [PATCH ovn v4] northd: Add ACL label

2021-08-02 Thread Priyankar Jain
Allow adding label to an ACL to identify which ACL allowed a particular
flow in the connection tracking table.

The ACL label covers 32 bits at the end of ct_label. Since only allowed
connections are committed, only "allow" and "allow-related" ACLs can
have the label.

If the ACL allowing the connection changes, the label associated with the
new ACL gets updated in the ct_label field. This is achieved by committing
every packet that hits the ACL with the label to the connection tracking
table.
In case the new ACL doesn't have a label, the ct_label field is not
cleared. This is done to prevent any performance change with ACLs that
don't have label set.
For the packets which hits an ACL without label, the behaviour remains the
same as before with respect to the conntrack commit.

Performance:
We used ftrace to measure the time taken by an extra conntrack commit for
the packets hitting the ACL with label. We measured the time taken to
execute the ovs_ct_execute call inside the sock_sendmsg call.

ACL used :-
 from-lport  2000 (tcp && ip4.src == 10.0.0.11 && ip4.dst == 10.0.0.12) 
allow-related --label=1234

It was observed that the extra ovs_ct_execute call accounted for 1-2%
of the round trip time (sock_sendmsg duration). The actual percentage
is expected to be lesser since it doesn't take into account the tracing
overhead which is substantial for smaller functions.

Signed-off-by: Priyankar Jain 
Acked-by: Numan Siddique 
---

v3 -> v4
* Resolved merge conflicts with master.

v2 -> v3
* Added NEWS entry.

v1 -> v2
* Changed type of label from string (hex) to unsigned integer.
* Added two system tests for checking the correct ct_label programming.
* Updated ovn-northd.8.xml with the new logical flows added as part of
  this patch.
* Rebased with latest master.

 NEWS  |   3 +
 lib/logical-fields.c  |   2 +
 northd/ovn-northd.8.xml   |  26 +++-
 northd/ovn-northd.c   |  53 +++--
 northd/ovn_northd.dl  |  76 +---
 ovn-nb.ovsschema  |   7 +-
 ovn-nb.xml|  12 ++
 tests/ovn-nbctl.at|  19 +++
 tests/ovn-northd.at   | 107 -
 tests/ovn.at  |   1 +
 tests/system-ovn.at   | 244 ++
 utilities/ovn-nbctl.8.xml |   2 +-
 utilities/ovn-nbctl.c |  38 +-
 13 files changed, 548 insertions(+), 42 deletions(-)

diff --git a/NEWS b/NEWS
index f328666da..45dc45353 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,9 @@ Post-v21.06.0
   - Added Control Plane Protection support (control plane traffic metering).
   - Added path MTU discovery for ingress traffic originated outside of the
 cluster.
+  - Introduced a new "label" field for "allow" and "allow-related" ACLs
+which helps in debugging/backtracking the ACL which allowed a particular
+connection.
 
 OVN v21.06.0 - 18 Jun 2021
 -
diff --git a/lib/logical-fields.c b/lib/logical-fields.c
index 72853013e..7b3d431e0 100644
--- a/lib/logical-fields.c
+++ b/lib/logical-fields.c
@@ -146,6 +146,8 @@ ovn_init_symtab(struct shash *symtab)
 "ct_label[32..79]", WR_CT_COMMIT);
 expr_symtab_add_subfield_scoped(symtab, "ct_label.ecmp_reply_port", NULL,
 "ct_label[80..95]", WR_CT_COMMIT);
+expr_symtab_add_subfield_scoped(symtab, "ct_label.label", NULL,
+"ct_label[96..127]", WR_CT_COMMIT);
 
 expr_symtab_add_field(symtab, "ct_state", MFF_CT_STATE, NULL, false);
 
diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index 08d484760..c5ea15774 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -670,13 +670,19 @@
 the next; action.  If there are any stateful ACLs
 on this datapath, then allow ACLs translate to
 ct_commit; next; (which acts as a hint for the next tables
-to commit the connection to conntrack),
+to commit the connection to conntrack). In case the ACL
+has a label then reg3 is loaded with the label value and
+reg0[13] bit is set to 1 (which acts as a hint for the
+next tables to commit the label to conntrack).
   
   
 allow-related ACLs translate into logical
 flows with the ct_commit(ct_label=0/1); next; actions
 for new connections and reg0[1] = 1; next; for existing
-connections.
+connections.  In case the ACL has a label then
+reg3 is loaded with the label value and
+reg0[13] bit is set to 1 (which acts as a hint for the
+next tables to commit the label to conntrack).
   
   
 allow-stateless ACLs translate into logical
@@ -876,9 +882,19 @@
   
 
   
-A priority-100 flow commits packets to connection tracker using
-ct_commit; next

Re: [ovs-dev] [PATCH ovn v4] northd: Add ACL label

2021-08-11 Thread Priyankar Jain

Hi Numan,

In case the patch looks good to you, can you please merge it in master?

Thanks and Regards,
Priyankar Jain

On 02/08/21 2:02 pm, Priyankar Jain wrote:

Allow adding label to an ACL to identify which ACL allowed a particular
flow in the connection tracking table.

The ACL label covers 32 bits at the end of ct_label. Since only allowed
connections are committed, only "allow" and "allow-related" ACLs can
have the label.

If the ACL allowing the connection changes, the label associated with the
new ACL gets updated in the ct_label field. This is achieved by committing
every packet that hits the ACL with the label to the connection tracking
table.
In case the new ACL doesn't have a label, the ct_label field is not
cleared. This is done to prevent any performance change with ACLs that
don't have label set.
For the packets which hits an ACL without label, the behaviour remains the
same as before with respect to the conntrack commit.

Performance:
We used ftrace to measure the time taken by an extra conntrack commit for
the packets hitting the ACL with label. We measured the time taken to
execute the ovs_ct_execute call inside the sock_sendmsg call.

ACL used :-
  from-lport  2000 (tcp && ip4.src == 10.0.0.11 && ip4.dst == 10.0.0.12) 
allow-related --label=1234

It was observed that the extra ovs_ct_execute call accounted for 1-2%
of the round trip time (sock_sendmsg duration). The actual percentage
is expected to be lesser since it doesn't take into account the tracing
overhead which is substantial for smaller functions.

Signed-off-by: Priyankar Jain 
Acked-by: Numan Siddique 
---

v3 -> v4
* Resolved merge conflicts with master.

v2 -> v3
* Added NEWS entry.

v1 -> v2
* Changed type of label from string (hex) to unsigned integer.
* Added two system tests for checking the correct ct_label programming.
* Updated ovn-northd.8.xml with the new logical flows added as part of
   this patch.
* Rebased with latest master.

  NEWS  |   3 +
  lib/logical-fields.c  |   2 +
  northd/ovn-northd.8.xml   |  26 +++-
  northd/ovn-northd.c   |  53 +++--
  northd/ovn_northd.dl  |  76 +---
  ovn-nb.ovsschema  |   7 +-
  ovn-nb.xml|  12 ++
  tests/ovn-nbctl.at|  19 +++
  tests/ovn-northd.at   | 107 -
  tests/ovn.at  |   1 +
  tests/system-ovn.at   | 244 ++
  utilities/ovn-nbctl.8.xml |   2 +-
  utilities/ovn-nbctl.c |  38 +-
  13 files changed, 548 insertions(+), 42 deletions(-)

diff --git a/NEWS b/NEWS
index f328666da..45dc45353 100644
--- a/NEWS
+++ b/NEWS
@@ -3,6 +3,9 @@ Post-v21.06.0
- Added Control Plane Protection support (control plane traffic metering).
- Added path MTU discovery for ingress traffic originated outside of the
  cluster.
+  - Introduced a new "label" field for "allow" and "allow-related" ACLs
+which helps in debugging/backtracking the ACL which allowed a particular
+connection.
  
  OVN v21.06.0 - 18 Jun 2021

  -
diff --git a/lib/logical-fields.c b/lib/logical-fields.c
index 72853013e..7b3d431e0 100644
--- a/lib/logical-fields.c
+++ b/lib/logical-fields.c
@@ -146,6 +146,8 @@ ovn_init_symtab(struct shash *symtab)
  "ct_label[32..79]", WR_CT_COMMIT);
  expr_symtab_add_subfield_scoped(symtab, "ct_label.ecmp_reply_port", NULL,
  "ct_label[80..95]", WR_CT_COMMIT);
+expr_symtab_add_subfield_scoped(symtab, "ct_label.label", NULL,
+"ct_label[96..127]", WR_CT_COMMIT);
  
  expr_symtab_add_field(symtab, "ct_state", MFF_CT_STATE, NULL, false);
  
diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml

index 08d484760..c5ea15774 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -670,13 +670,19 @@
  the next; action.  If there are any stateful ACLs
  on this datapath, then allow ACLs translate to
  ct_commit; next; (which acts as a hint for the next 
tables
-to commit the connection to conntrack),
+to commit the connection to conntrack). In case the ACL
+has a label then reg3 is loaded with the label value and
+reg0[13] bit is set to 1 (which acts as a hint for the
+next tables to commit the label to conntrack).


  allow-related ACLs translate into logical
  flows with the ct_commit(ct_label=0/1); next; actions
  for new connections and reg0[1] = 1; next; for existing
-connections.
+connections.  In case the ACL has a label then
+reg3 is loaded with the label value and
+reg0[13] bit is set to 1 (which acts as a hint for the
+next tables to commit the label to conntrack).
 

[ovs-dev] [PATCH] python-ovs: [jsonrpc] Maintain Connection.output as bytes instead of string

2022-12-28 Thread Priyankar Jain
Currently _send() in stream.py[L#415] converts the string buffer to bytes in
case the caller passes it a string buffer.

This becomes a problem when retrying this send call (say due to EAGAIN errors)
with SSLStream and underlying OpenSSL implementation.
Converting the buffer from string to bytes effectively changes the buffer
address (even though its content might be same).
But the OpenSSL implementation expects that the buffer contents and address
to remain same when retrying send/ssl_write on retryable errors such
as EAGAIN.

Following is the sample stacktrace seen when stream.send() return EAGAIN error
and the call is retried by jsonrpc.run():

```
File "/usr/local/lib/python3.9/site-packages/ovs/db/idl.py", line 1652,
in commit_block
self.idl.run()
  File "/usr/local/lib/python3.9/site-packages/ovs/db/idl.py", line 240,
in run
self._session.run()
  File "/usr/local/lib/python3.9/site-packages/ovs/jsonrpc.py", line
503, in run
self.rpc.run()
  File "/usr/local/lib/python3.9/site-packages/ovs/jsonrpc.py", line
202, in run
retval = self.stream.send(self.output)
  File "/usr/local/lib/python3.9/site-packages/ovs/stream.py", line 831,
in send
return super(SSLStream, self).send(buf)
  File "/usr/local/lib/python3.9/site-packages/ovs/stream.py", line 413,
in send
return self.socket.send(buf)
  File "/usr/local/lib/python3.9/site-packages/OpenSSL/SSL.py", line
1652, in send
self._raise_ssl_error(self._ssl, result)
  File "/usr/local/lib/python3.9/site-packages/OpenSSL/SSL.py", line
1566, in _raise_ssl_error
_raise_current_error()
  File "/usr/local/lib/python3.9/site-packages/OpenSSL/_util.py", line
57, in exception_from_error_queue
raise exception_type(errors)
OpenSSL.SSL.Error: [('SSL routines', 'ssl3_write_pending', 'bad write
retry')]
```

This patch fixes it by maintaining the data to be sent as bytes instead
of string in jsonrpc.py so that on retries buffer remains unchanged.

Signed-off-by: Priyankar Jain 
---
 python/ovs/jsonrpc.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/ovs/jsonrpc.py b/python/ovs/jsonrpc.py
index d5127268a..60c631874 100644
--- a/python/ovs/jsonrpc.py
+++ b/python/ovs/jsonrpc.py
@@ -186,7 +186,7 @@ class Connection(object):
 self.stream = stream
 self.status = 0
 self.input = ""
-self.output = ""
+self.output = b''
 self.parser = None
 self.received_bytes = 0
 
@@ -238,7 +238,7 @@ class Connection(object):
 self.__log_msg("send", msg)
 
 was_empty = len(self.output) == 0
-self.output += ovs.json.to_string(msg.to_json())
+self.output += ovs.json.to_string(msg.to_json()).encode("utf-8")
 if was_empty:
 self.run()
 return self.status
@@ -365,7 +365,7 @@ class Connection(object):
 if self.status == 0:
 self.status = error
 self.stream.close()
-self.output = ""
+self.output = b''
 
 
 class Session(object):
-- 
2.22.3

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


[ovs-dev] [PATCH ovn] northd: Allow Load balancer to be attached to router with multiple GW ports

2024-07-17 Thread Priyankar Jain
This commit removes the restriction of support LB for router with only
<= 1 Distributed gateway ports.
Added datapath and logical flows validation cases.

Signed-off-by: Priyankar Jain 
---
 northd/en-lr-stateful.c |  12 ---
 northd/northd.c |  12 +--
 tests/ovn-northd.at |  86 +
 tests/ovn.at| 167 
 4 files changed, 260 insertions(+), 17 deletions(-)

diff --git a/northd/en-lr-stateful.c b/northd/en-lr-stateful.c
index baf1bd2f8..f09691af6 100644
--- a/northd/en-lr-stateful.c
+++ b/northd/en-lr-stateful.c
@@ -516,18 +516,6 @@ lr_stateful_record_create(struct lr_stateful_table *table,
 
 table->array[od->index] = lr_stateful_rec;
 
-/* Load balancers are not supported (yet) if a logical router has multiple
- * distributed gateway port.  Log a warning. */
-if (lr_stateful_rec->has_lb_vip && lr_has_multiple_gw_ports(od)) {
-static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
-VLOG_WARN_RL(&rl, "Load-balancers are configured on logical "
- "router %s, which has %"PRIuSIZE" distributed "
- "gateway ports. Load-balancer is not supported "
- "yet when there is more than one distributed "
- "gateway port on the router.",
- od->nbr->name, od->n_l3dgw_ports);
-}
-
 return lr_stateful_rec;
 }
 
diff --git a/northd/northd.c b/northd/northd.c
index 6898daa00..e6f53f361 100644
--- a/northd/northd.c
+++ b/northd/northd.c
@@ -11026,10 +11026,9 @@ static void
 build_distr_lrouter_nat_flows_for_lb(struct lrouter_nat_lb_flows_ctx *ctx,
  enum lrouter_nat_lb_flow_type type,
  struct ovn_datapath *od,
+ struct ovn_port *dgp,
  struct lflow_ref *lflow_ref)
 {
-struct ovn_port *dgp = od->l3dgw_ports[0];
-
 const char *undnat_action;
 
 switch (type) {
@@ -11060,7 +11059,7 @@ build_distr_lrouter_nat_flows_for_lb(struct 
lrouter_nat_lb_flows_ctx *ctx,
 
 if (ctx->lb_vip->n_backends || !ctx->lb_vip->empty_backend_rej) {
 ds_put_format(ctx->new_match, " && is_chassis_resident(%s)",
-  od->l3dgw_ports[0]->cr_port->json_key);
+  dgp->cr_port->json_key);
 }
 
 ovn_lflow_add_with_hint__(ctx->lflows, od, S_ROUTER_IN_DNAT, ctx->prio,
@@ -11263,8 +11262,11 @@ build_lrouter_nat_flows_for_lb(
 if (!od->n_l3dgw_ports) {
 bitmap_set1(gw_dp_bitmap[type], index);
 } else {
-build_distr_lrouter_nat_flows_for_lb(&ctx, type, od,
- lb_dps->lflow_ref);
+for (int i = 0; i < od->n_l3dgw_ports; i++) {
+struct ovn_port *dgp = od->l3dgw_ports[i];
+build_distr_lrouter_nat_flows_for_lb(&ctx, type, od, dgp,
+ lb_dps->lflow_ref);
+}
 }
 
 if (lb->affinity_timeout) {
diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
index a389d1988..5be48f49e 100644
--- a/tests/ovn-northd.at
+++ b/tests/ovn-northd.at
@@ -12721,3 +12721,89 @@ AT_CHECK([ovn-sbctl dump-flows lr | grep lr_in_dnat | 
ovn_strip_lflows], [0], [d
 
 AT_CLEANUP
 ])
+
+OVN_FOR_EACH_NORTHD_NO_HV([
+AT_SETUP([ovn-northd -- LB on Lr with multiple gw ports])
+AT_KEYWORDS([lb-multiple-l3dgw-ports])
+ovn_start
+
+# Logical network:
+# 1 Logical Router, 3 bridged Logical Switches,
+# 1 gateway chassis attached to each corresponding LRP.
+# LB added attached to DR
+#
+#| S1 (gw1)
+#|
+#  ls    DR -- S3 (gw3)
+# (20.0.0.0/24)  |
+#| S2 (gw2)
+#
+# Validate basic LB logical flows.
+
+check ovn-sbctl chassis-add gw1 geneve 127.0.0.1
+check ovn-sbctl chassis-add gw2 geneve 128.0.0.1
+check ovn-sbctl chassis-add gw3 geneve 129.0.0.1
+
+check ovn-nbctl lr-add DR
+check ovn-nbctl lrp-add DR DR-S1 02:ac:10:01:00:01 172.16.1.1/24
+check ovn-nbctl lrp-add DR DR-S2 03:ac:10:01:00:01 172.16.2.1/24
+check ovn-nbctl lrp-add DR DR-S3 04:ac:10:01:00:01 172.16.3.1/24
+check ovn-nbctl lrp-add DR DR-ls 05:ac:10:01:00:01 20.0.0.1/24
+
+check ovn-nbctl ls-add S1
+check ovn-nbctl lsp-add S1 S1-DR
+check ovn-nbctl lsp-set-type S1-DR router
+check ovn-nbctl lsp-set-addresses S1-DR router
+check ovn-nbctl --wait=sb lsp-set-options S1-DR router-port=DR-S1
+
+check ovn-nbctl ls-add S2
+check ovn-nbctl lsp-add S2 S2-DR
+check ovn-nbctl lsp-set-type S2-DR router
+check ovn-nbctl lsp-set-addresses S2-DR router
+check ovn-nbctl --wait=sb lsp-set-options S2-DR router-port=DR-S2
+
+check ovn-nbctl ls-add S3
+check ovn-nbctl lsp-add S3 S3-DR
+check ovn-nbct

Re: [ovs-dev] [PATCH ovn] northd: Allow Load balancer to be attached to router with multiple GW ports

2024-08-06 Thread Priyankar Jain

Hi,
Please find my reply inline.

Thanks,
Priyankar

On 31/07/24 4:46 am, Numan Siddique wrote:

On Wed, Jul 17, 2024 at 5:04 AM Priyankar Jain
 wrote:

This commit removes the restriction of support LB for router with only
<= 1 Distributed gateway ports.
Added datapath and logical flows validation cases.

Signed-off-by: Priyankar Jain 

Hi Priyankar,

Thanks for the patch.

Can you please tell me the use case ?

Lets say a logical router lr1 has sw1 (10.0.0.0/24) , sw2
(20.0.0.0/24) and sw3 (30.0.0.0/24) and the corresponding
gw ports are bound to chassis ch1, ch2 and ch3 respectively.

Let's say we attach load balancers - lb1 (10.0.0.40:80 ->
20.0.0.40:80), lb2 (20.0.0.50:80 -> 30.0.0.50:80) and
lb3 (30.0.0.60:80 -> 10.0.0.60:80) to lr1.

If I understand correctly, routing for sw1 (10.0.0.0/24) is handled on
the chassis ch1 and so on for other switches.

This patch generates the below logical flows in the router pipeline
for load balancer lb1, lb2 and lb3

   table=8 (lr_in_dnat ), priority=110  , match=(ct.new &&
!ct.rel && ip4 && ip4.dst == 10.0.0.40 && tcp && tcp.dst == 80 &&
is_chassis_resident("cr-lr0-sw1")),
action=(ct_lb_mark(backends=20.0.0.40:80);)
   table=8 (lr_in_dnat ), priority=110  , match=(ct.new &&
!ct.rel && ip4 && ip4.dst == 10.0.0.40 && tcp && tcp.dst == 80 &&
is_chassis_resident("cr-lr0-sw2")),
action=(ct_lb_mark(backends=20.0.0.40:80);)
   table=8 (lr_in_dnat ), priority=110  , match=(ct.new &&
!ct.rel && ip4 && ip4.dst == 10.0.0.40 && tcp && tcp.dst == 80 &&
is_chassis_resident("cr-lr0-sw3")),
action=(ct_lb_mark(backends=20.0.0.40:80);)
   table=8 (lr_in_dnat ), priority=110  , match=(ct.new &&
!ct.rel && ip4 && ip4.dst == 20.0.0.50 && tcp && tcp.dst == 80 &&
is_chassis_resident("cr-lr0-sw1")),
action=(ct_lb_mark(backends=30.0.0.50:80);)
   table=8 (lr_in_dnat ), priority=110  , match=(ct.new &&
!ct.rel && ip4 && ip4.dst == 20.0.0.50 && tcp && tcp.dst == 80 &&
is_chassis_resident("cr-lr0-sw2")),
action=(ct_lb_mark(backends=30.0.0.50:80);)
   table=8 (lr_in_dnat ), priority=110  , match=(ct.new &&
!ct.rel && ip4 && ip4.dst == 20.0.0.50 && tcp && tcp.dst == 80 &&
is_chassis_resident("cr-lr0-sw3")),
action=(ct_lb_mark(backends=30.0.0.50:80);)
   table=8 (lr_in_dnat ), priority=110  , match=(ct.new &&
!ct.rel && ip4 && ip4.dst == 30.0.0.60 && tcp && tcp.dst == 80 &&
is_chassis_resident("cr-lr0-sw1")),
action=(ct_lb_mark(backends=10.0.0.60:80);)
   table=8 (lr_in_dnat ), priority=110  , match=(ct.new &&
!ct.rel && ip4 && ip4.dst == 30.0.0.60 && tcp && tcp.dst == 80 &&
is_chassis_resident("cr-lr0-sw2")),
action=(ct_lb_mark(backends=10.0.0.60:80);)
   table=8 (lr_in_dnat ), priority=110  , match=(ct.new &&
!ct.rel && ip4 && ip4.dst == 30.0.0.60 && tcp && tcp.dst == 80 &&
is_chassis_resident("cr-lr0-sw3")),
action=(ct_lb_mark(backends=10.0.0.60:80);)

If a logical port of sw2 (20.0.0.4) sends any packet to 10.0.0.0/24,
then ideally  the routing will be handled in ch1 since lr1-sw1 is
bound on ch1.

Now with the above logical flows,  load balancing could happen in the
source chassis itself. Is that ok ?

In our usecase, sw2 is not expected to handle load balancing for
10.0.0.0/24 CIDR. it is expected to handle LB only for 20.0.0.0/24
(Note: LB is not attached to the logical switch itself, since the
same LS can be connected to multiple such LRs.)

 | S1 (gw1) [10.0.0.0/24]
 |
  ls1    LR -- S2 (gw2) [20.0.0.0/24]
 (40.0.0.0/24)

LB1: 10.0.0.40:80 --> 40.0.0.10:80, 40.0.0.20:80
LB2: 30.0.0.40:80 --> 40.0.0.30:80, 40.0.0.40:80

Routes:
10.0.0.0/24 next-hop via S1
default next-hop via S2


In our usecase, backends are only present on ls1.
S1 and S2 does not have any backends. Similarly,
S1 needs to handle the traffic only for LB: 10.0.0.40
S2 needs to handle the traffic only for LB: 30.0.0.40

Hence, ingress coming from S2 happens on gw2 during the
request part. and egress again happens on gw2 (due to
above routes) for the response.

Although, the patch does not handle all the usecases, but with
existing OVN impl, it does not even program any LB rules.
Hence, this patch acheives the above restricted use-case.
Let me know, if i need to add some logs etc..



Shouldn't this patch generate the flows like this for lb1, lb2 and lb3 ?

   table=8 (lr_in_dnat ), priority=110  , match=(ct.new &&
!ct.rel && ip4 &&

Re: [ovs-dev] [PATCH ovn] northd: Allow Load balancer to be attached to router with multiple GW ports

2024-08-06 Thread Priyankar Jain

Hi,

On 06/08/24 6:15 pm, Roberto Bartzen Acosta wrote:

Hi Priyankar,

So, what is the difference between the patch and the one already proposed
[1]?

Ahh I just got to know about this patch. Looks same to me.

The change you are proposing has already been proposed before, and in
discussion with Mark we came to the conclusion that this breaks the DNAT
case when using multiple chassis. So, I pushed the v4 [2][3] with Stateless
NAT implementation to solve this problem.
I reviewed this patch. If we are using stateless NAT, does that mean 
packet won't
be passed through CT in the egress direction? If so, wouldn't this cause 
TCP state
machine to break wrt CT (and CT table marking the packets as invalid). 
say SYN gets
commited to CT (ingress pipeline ct_lb_mark action), syn-ACK won't be 
commited
as only egress will run on GW node. Again the ACK would be commited. So 
are we
sure wrt sequence numbers/window scaling, we are safe wrt TCP CT state 
machine

here?

Also, as i mentioned in my previous reply, our usecase is much more 
restricted

and routing ensures ingress during request (from outside to LR) and egress (
for undnat) runs on the gw node.

Thanks,
Priyankar

[1] 
https://urldefense.proofpoint.com/v2/url?u=https-3A__mail.openvswitch.org_pipermail_ovs-2Ddev_2024-2DFebruary_411981.html&d=DwIFaQ&c=s883GpUCOChKOHiocYtGcg&r=BWfApgwqQ_BTXd_yEq_kwWLM-KTqkdPvolDMCJmUagg&m=WfKrYrekTgMkBBiSsurQ7bYA31uPfqnE-Ybpq06XApWiO9Ij2MlCi8dO02I-kZf0&s=JR1CKXLiNrDGVt85mXYTutzBqhpCpiABOutl2TMPWEE&e=
[2] 
https://urldefense.proofpoint.com/v2/url?u=https-3A__mail.openvswitch.org_pipermail_ovs-2Ddev_2024-2DJuly_415977.html&d=DwIFaQ&c=s883GpUCOChKOHiocYtGcg&r=BWfApgwqQ_BTXd_yEq_kwWLM-KTqkdPvolDMCJmUagg&m=WfKrYrekTgMkBBiSsurQ7bYA31uPfqnE-Ybpq06XApWiO9Ij2MlCi8dO02I-kZf0&s=QA8-pREOG2X_f-Hk8zDEeXxyV6jOoNx3pRxuKAIhsCk&e=
[3]
https://urldefense.proofpoint.com/v2/url?u=https-3A__github.com_ovsrobot_ovn_commit_9379c81152df2bcd0626269988efb8ac36d4f624&d=DwIFaQ&c=s883GpUCOChKOHiocYtGcg&r=BWfApgwqQ_BTXd_yEq_kwWLM-KTqkdPvolDMCJmUagg&m=WfKrYrekTgMkBBiSsurQ7bYA31uPfqnE-Ybpq06XApWiO9Ij2MlCi8dO02I-kZf0&s=axc06cz9XfMSiEKgrD6QmmoYwrBVxBnm1T2gEx-_byY&e=

Thanks,
Roberto

Em ter., 6 de ago. de 2024 às 07:57, Priyankar Jain <
priyankar.j...@nutanix.com> escreveu:


Hi,
Please find my reply inline.

Thanks,
Priyankar

On 31/07/24 4:46 am, Numan Siddique wrote:

On Wed, Jul 17, 2024 at 5:04 AM Priyankar Jain
 wrote:

This commit removes the restriction of support LB for router with only
<= 1 Distributed gateway ports.
Added datapath and logical flows validation cases.

Signed-off-by: Priyankar Jain 

Hi Priyankar,

Thanks for the patch.

Can you please tell me the use case ?

Lets say a logical router lr1 has sw1 (10.0.0.0/24) , sw2
(20.0.0.0/24) and sw3 (30.0.0.0/24) and the corresponding
gw ports are bound to chassis ch1, ch2 and ch3 respectively.

Let's say we attach load balancers - lb1 (10.0.0.40:80 ->
20.0.0.40:80), lb2 (20.0.0.50:80 -> 30.0.0.50:80) and
lb3 (30.0.0.60:80 -> 10.0.0.60:80) to lr1.

If I understand correctly, routing for sw1 (10.0.0.0/24) is handled on
the chassis ch1 and so on for other switches.

This patch generates the below logical flows in the router pipeline
for load balancer lb1, lb2 and lb3

table=8 (lr_in_dnat ), priority=110  , match=(ct.new &&
!ct.rel && ip4 && ip4.dst == 10.0.0.40 && tcp && tcp.dst == 80 &&
is_chassis_resident("cr-lr0-sw1")),
action=(ct_lb_mark(backends=20.0.0.40:80);)
table=8 (lr_in_dnat ), priority=110  , match=(ct.new &&
!ct.rel && ip4 && ip4.dst == 10.0.0.40 && tcp && tcp.dst == 80 &&
is_chassis_resident("cr-lr0-sw2")),
action=(ct_lb_mark(backends=20.0.0.40:80);)
table=8 (lr_in_dnat ), priority=110  , match=(ct.new &&
!ct.rel && ip4 && ip4.dst == 10.0.0.40 && tcp && tcp.dst == 80 &&
is_chassis_resident("cr-lr0-sw3")),
action=(ct_lb_mark(backends=20.0.0.40:80);)
table=8 (lr_in_dnat ), priority=110  , match=(ct.new &&
!ct.rel && ip4 && ip4.dst == 20.0.0.50 && tcp && tcp.dst == 80 &&
is_chassis_resident("cr-lr0-sw1")),
action=(ct_lb_mark(backends=30.0.0.50:80);)
table=8 (lr_in_dnat ), priority=110  , match=(ct.new &&
!ct.rel && ip4 && ip4.dst == 20.0.0.50 && tcp && tcp.dst == 80 &&
is_chassis_resident("cr-lr0-sw2")),
action=(ct_lb_mark(backends=30.0.0.50:80);)
table=8 (lr_in_dnat ), priority=110  , match=(ct.new &&
!ct.rel && ip4 && ip4.dst == 20.0.0.50 && tcp && tcp.dst == 80 &&
is_chassis_resident("cr-lr0-sw3")),
action=(ct_lb_mark(backends=30.0.0.50:80);)
table=8 (lr_i

Re: [ovs-dev] [PATCH ovn] northd: Allow Load balancer to be attached to router with multiple GW ports

2024-08-13 Thread Priyankar Jain



On 07/08/24 2:33 am, Numan Siddique wrote:

!---|
   CAUTION: External Email

|---!

On Tue, Aug 6, 2024 at 4:42 PM Mark Michelson  wrote:

Hi I just wanted to weigh in.

On 8/6/24 13:26, Roberto Bartzen Acosta via dev wrote:

Hi Priyankar,

Em ter., 6 de ago. de 2024 às 12:35, Priyankar Jain <
priyankar.j...@nutanix.com> escreveu:


Hi,

On 06/08/24 6:15 pm, Roberto Bartzen Acosta wrote:

Hi Priyankar,

So, what is the difference between the patch and the one already proposed
[1]?

Ahh I just got to know about this patch. Looks same to me.

The change you are proposing has already been proposed before, and in
discussion with Mark we came to the conclusion that this breaks the DNAT
case when using multiple chassis. So, I pushed the v4 [2][3] with

Stateless

NAT implementation to solve this problem.

I agree that this patch looks a lot like v1 of Roberto's series. I can't
comment on v4 of the patch yet since I haven't had a chance to look at
it yet.

I have it on my list to get reviewed before the end of the week.


I reviewed this patch. If we are using stateless NAT, does that mean
packet won't
be passed through CT in the egress direction? If so, wouldn't this cause
TCP state
machine to break wrt CT (and CT table marking the packets as invalid).
say SYN gets
commited to CT (ingress pipeline ct_lb_mark action), syn-ACK won't be
commited
as only egress will run on GW node. Again the ACK would be commited. So
are we
sure wrt sequence numbers/window scaling, we are safe wrt TCP CT state
machine
here?


My proposal does not affect LB egress since the output from the backends is
still done integrated with the ls_in_lb pipeline, and the DGP egress flow
depends on how you configure the routing/external NAT (e.g. OVN-IC uses
direct routing for this).

In addition, the v4 patch adds the ability to receive packets in the
lr_in_dnat pipeline via different chassis_resident and, additionally,
deliver the packet to ct_lb_mark to process the CT state in the LB pipeline
before delivering it to the backend IPs (regardless of the chassis that
receives the traffic).



Also, as i mentioned in my previous reply, our usecase is much more
restricted
and routing ensures ingress during request (from outside to LR) and egress
(
for undnat) runs on the gw node.


The problem with this is that there's nothing in the attached patch to
enforce this restricted scneario. It's possible for people to shoot
themselves in the foot.

In the current OVN main code, when a logical router with multiple DGPs
is configured
with load balancers,  then ovn-northd logs a warning, but it also
generates logical flows
for the VIPs and all generated logical flows related to the LBs in the
router pipeline
has a  match - "is_chassis_resident()". Looks like
they are shooting themselves in the foot anyway.

Maybe we should not generate these logical flows OR we just accept this patch
documenting the limitations.  Let the user know that they may shoot
themselves in the foot.
@Mark Michelson  wdyt ?

Thanks
Numan



Hi, Can you please let me know the next steps here? Is there any other 
direction

I should think of solving the above usecase.
In case we want to accept this patch, we can document the usecase where 
it would

work and where it won't.
Also, I am not very sure abt the statless unDNAT for reply direction 
that was

introduced in the other patch series. Its implication on conntrack.

Regards,
Priyankar


Have you ever applied the patch v4 to your use case? Considering the test
cases you write in your patch, the tests passed with my version v4 without
any issues. What do you think?


##  ##
## ovn 24.03.90 test suite. ##
##  ##

OVN end-to-end tests

533: Multiple DGP and LB traffic -- parallelization=yes --
ovn_monitor_all=yes ok
534: Multiple DGP and LB traffic -- parallelization=yes --
ovn_monitor_all=no ok


Thanks,
Roberto




Thanks,
Priyankar

[1]

https://urldefense.proofpoint.com/v2/url?u=https-3A__mail.openvswitch.org_pipermail_ovs-2Ddev_2024-2DFebruary_411981.html&d=DwIFaQ&c=s883GpUCOChKOHiocYtGcg&r=BWfApgwqQ_BTXd_yEq_kwWLM-KTqkdPvolDMCJmUagg&m=WfKrYrekTgMkBBiSsurQ7bYA31uPfqnE-Ybpq06XApWiO9Ij2MlCi8dO02I-kZf0&s=JR1CKXLiNrDGVt85mXYTutzBqhpCpiABOutl2TMPWEE&e=

[2]

https://urldefense.proofpoint.com/v2/url?u=https-3A__mail.openvswitch.org_pipermail_ovs-2Ddev_2024-2DJuly_415977.html&d=DwIFaQ&c=s883GpUCOChKOHiocYtGcg&r=BWfApgwqQ_BTXd_yEq_kwWLM-KTqkdPvolDMCJmUagg&m=WfKrYrekTgMkBBiSsurQ7bYA31uPfqnE-Ybpq06XApWiO9Ij2MlCi8dO02I-kZf0&s=QA8-pREOG2X_f-Hk8zDEeXxyV6jOoNx3pRxuKAIhsCk&e=

[3]


https://urldefense.proofpoint.com/v2/url?u=https-3A__github.com_ovsrobot_ovn_commit_9379c81152df2bcd0626269988efb8ac36d4f624&d=DwIFaQ&c=s883GpUCOChKOHiocYtG

[ovs-dev] [PATCH ovn] northd: Added svc_monitor_ipv4 in NB_Global options.

2024-01-08 Thread Priyankar Jain
This commit adds a new optional svc_monitor_ipv4 config in the NB_Global
options column. This IP address can be used to send the packets to the
OVN controller bypassing most of the logical switches pipelines.

Usage: Currently Load balancer health check requires a source IP address
to be allocated from the subnets where backends are present.
This change removes that requirement and instead user can give one IP
address for the system and set it in NB_Global:options:svc_monitor_ipv4.
While configuring the health check for the load balancer user can
provide this same IP while specifying the ip-port mappings.

Signed-off-by: Priyankar Jain 
---
 northd/en-sync-sb.c | 10 +
 northd/northd.c | 34 --
 northd/northd.h |  1 +
 northd/ovn-northd.8.xml | 40 +++--
 tests/ovn-northd.at | 98 +
 5 files changed, 118 insertions(+), 65 deletions(-)

diff --git a/northd/en-sync-sb.c b/northd/en-sync-sb.c
index 2ec3bf54f..95e9ead29 100644
--- a/northd/en-sync-sb.c
+++ b/northd/en-sync-sb.c
@@ -372,6 +372,16 @@ sync_addr_sets(struct ovsdb_idl_txn *ovnsb_txn,
 sync_addr_set(ovnsb_txn, "svc_monitor_mac", &svc, &sb_address_sets);
 sorted_array_destroy(&svc);
 
+/* Service monitor IP. */
+const char *svc_monitor_ip4 = northd_get_svc_monitor_ip4();
+int num_addr = 0; /* Create empty address-set by default */
+if (svc_monitor_ip4) {
+num_addr = 1;
+}
+struct sorted_array ip_svc = sorted_array_create(&svc_monitor_ip4,
+ num_addr, false);
+sync_addr_set(ovnsb_txn, "svc_monitor_ip4", &ip_svc, &sb_address_sets);
+
 /* sync port group generated address sets first */
 const struct nbrec_port_group *nb_port_group;
 NBREC_PORT_GROUP_TABLE_FOR_EACH (nb_port_group,
diff --git a/northd/northd.c b/northd/northd.c
index db3cd272e..f6d35143a 100644
--- a/northd/northd.c
+++ b/northd/northd.c
@@ -82,6 +82,7 @@ static bool use_common_zone = false;
 static char svc_monitor_mac[ETH_ADDR_STRLEN + 1];
 static struct eth_addr svc_monitor_mac_ea;
 
+static char *svc_monitor_ip4 = NULL;
 /* If this option is 'true' northd will make use of ct.inv match fields.
  * Otherwise, it will avoid using it.  The default is true. */
 static bool use_ct_inv_match = true;
@@ -7197,7 +7198,8 @@ build_pre_acls(struct ovn_datapath *od,
 ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_ACL, 0, "1", "next;");
 
 ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_ACL, 110,
-  "eth.dst == $svc_monitor_mac", "next;");
+  "eth.dst == $svc_monitor_mac || ip4.dst == $svc_monitor_ip4",
+  "next;");
 
 ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_ACL, 110,
   "eth.src == $svc_monitor_mac", "next;");
@@ -7370,7 +7372,8 @@ build_pre_lb(struct ovn_datapath *od, const struct shash 
*meter_groups,
 
 /* Do not send service monitor packets to conntrack. */
 ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB, 110,
-  "eth.dst == $svc_monitor_mac", "next;");
+  "eth.dst == $svc_monitor_mac || ip4.dst == $svc_monitor_ip4",
+  "next;");
 ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB, 110,
   "eth.src == $svc_monitor_mac", "next;");
 
@@ -8238,7 +8241,7 @@ build_acls(struct ovn_datapath *od, const struct 
chassis_features *features,
 /* Add a 34000 priority flow to advance the service monitor reply
 * packets to skip applying ingress ACLs. */
 ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL_EVAL, 34000,
-"eth.dst == $svc_monitor_mac",
+"eth.dst == $svc_monitor_mac || ip4.dst == 
$svc_monitor_ip4",
 REGBIT_ACL_VERDICT_ALLOW" = 1; next;");
 
 /* Add a 34000 priority flow to advance the service monitor packets
@@ -10325,7 +10328,8 @@ build_lswitch_destination_lookup_bmcast(struct 
ovn_datapath *od,
 ovs_assert(od->nbs);
 
 ovn_lflow_metered(lflows, od, S_SWITCH_IN_L2_LKUP, 110,
-  "eth.dst == $svc_monitor_mac && (tcp || icmp || icmp6)",
+  "(eth.dst == $svc_monitor_mac || ip4.dst == 
$svc_monitor_ip4)"
+  " && (tcp || icmp || icmp6)",
   "handle_svc_check(inport);",
   copp_meter_get(COPP_SVC_MONITOR, od->nbs->copp,
  meter_groups));
@@ -17891,6 +17895,22 @@ ovnnb_db_run(struct northd_input *input_data,
 smap_replace(&options, "svc_monitor_mac", svc_monitor_mac);
 }
 
+const char *monitor_ip4 = smap_get(&nb->options, "s

[ovs-dev] [PATCH ovn v2] northd: Added svc_monitor_ipv4 in NB_Global options.

2024-01-08 Thread Priyankar Jain
This commit adds a new optional svc_monitor_ipv4 config in the NB_Global
options column. This IP address can be used to send the packets to the
OVN controller bypassing most of the logical switches pipelines.

Usage: Currently Load balancer health check requires a source IP address
to be allocated from the subnets where backends are present.
This change removes that requirement and instead user can give one IP
address for the system and set it in NB_Global:options:svc_monitor_ipv4.
While configuring the health check for the load balancer user can
provide this same IP while specifying the ip-port mappings.

Signed-off-by: Priyankar Jain 
---

Changes since v1:
 - Fixed one testcase in ovn-northd.at

---
 northd/en-sync-sb.c | 10 +
 northd/northd.c | 34 +--
 northd/northd.h |  1 +
 northd/ovn-northd.8.xml | 40 +++--
 tests/ovn-northd.at | 96 -
 5 files changed, 116 insertions(+), 65 deletions(-)

diff --git a/northd/en-sync-sb.c b/northd/en-sync-sb.c
index 2ec3bf54f..95e9ead29 100644
--- a/northd/en-sync-sb.c
+++ b/northd/en-sync-sb.c
@@ -372,6 +372,16 @@ sync_addr_sets(struct ovsdb_idl_txn *ovnsb_txn,
 sync_addr_set(ovnsb_txn, "svc_monitor_mac", &svc, &sb_address_sets);
 sorted_array_destroy(&svc);
 
+/* Service monitor IP. */
+const char *svc_monitor_ip4 = northd_get_svc_monitor_ip4();
+int num_addr = 0; /* Create empty address-set by default */
+if (svc_monitor_ip4) {
+num_addr = 1;
+}
+struct sorted_array ip_svc = sorted_array_create(&svc_monitor_ip4,
+ num_addr, false);
+sync_addr_set(ovnsb_txn, "svc_monitor_ip4", &ip_svc, &sb_address_sets);
+
 /* sync port group generated address sets first */
 const struct nbrec_port_group *nb_port_group;
 NBREC_PORT_GROUP_TABLE_FOR_EACH (nb_port_group,
diff --git a/northd/northd.c b/northd/northd.c
index db3cd272e..f6d35143a 100644
--- a/northd/northd.c
+++ b/northd/northd.c
@@ -82,6 +82,7 @@ static bool use_common_zone = false;
 static char svc_monitor_mac[ETH_ADDR_STRLEN + 1];
 static struct eth_addr svc_monitor_mac_ea;
 
+static char *svc_monitor_ip4 = NULL;
 /* If this option is 'true' northd will make use of ct.inv match fields.
  * Otherwise, it will avoid using it.  The default is true. */
 static bool use_ct_inv_match = true;
@@ -7197,7 +7198,8 @@ build_pre_acls(struct ovn_datapath *od,
 ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_ACL, 0, "1", "next;");
 
 ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_ACL, 110,
-  "eth.dst == $svc_monitor_mac", "next;");
+  "eth.dst == $svc_monitor_mac || ip4.dst == $svc_monitor_ip4",
+  "next;");
 
 ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_ACL, 110,
   "eth.src == $svc_monitor_mac", "next;");
@@ -7370,7 +7372,8 @@ build_pre_lb(struct ovn_datapath *od, const struct shash 
*meter_groups,
 
 /* Do not send service monitor packets to conntrack. */
 ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB, 110,
-  "eth.dst == $svc_monitor_mac", "next;");
+  "eth.dst == $svc_monitor_mac || ip4.dst == $svc_monitor_ip4",
+  "next;");
 ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB, 110,
   "eth.src == $svc_monitor_mac", "next;");
 
@@ -8238,7 +8241,7 @@ build_acls(struct ovn_datapath *od, const struct 
chassis_features *features,
 /* Add a 34000 priority flow to advance the service monitor reply
 * packets to skip applying ingress ACLs. */
 ovn_lflow_add(lflows, od, S_SWITCH_IN_ACL_EVAL, 34000,
-"eth.dst == $svc_monitor_mac",
+"eth.dst == $svc_monitor_mac || ip4.dst == 
$svc_monitor_ip4",
 REGBIT_ACL_VERDICT_ALLOW" = 1; next;");
 
 /* Add a 34000 priority flow to advance the service monitor packets
@@ -10325,7 +10328,8 @@ build_lswitch_destination_lookup_bmcast(struct 
ovn_datapath *od,
 ovs_assert(od->nbs);
 
 ovn_lflow_metered(lflows, od, S_SWITCH_IN_L2_LKUP, 110,
-  "eth.dst == $svc_monitor_mac && (tcp || icmp || icmp6)",
+  "(eth.dst == $svc_monitor_mac || ip4.dst == 
$svc_monitor_ip4)"
+  " && (tcp || icmp || icmp6)",
   "handle_svc_check(inport);",
   copp_meter_get(COPP_SVC_MONITOR, od->nbs->copp,
  meter_groups));
@@ -17891,6 +17895,22 @@ ovnnb_db_run(struct northd_input *input_data,
 smap_replace(&options, "svc_monitor_mac&q

[ovs-dev] [PATCH ovn] northd: Added lb_vip_mac config option in Logical_Switch.

2024-01-08 Thread Priyankar Jain
Currently load balancer applied to a logical switch has the
following restriction:
- VIP of the load balancer cannot reside in the subnet prefix as the
  clients as OVN does not install ARP responder flows for the LB VIP.

This change adds a new config option "lb_vip_mac" in the logical_switch
table which is expected to be a MAC address. If the logical_switch has
this option configured, northd will program an ARP responder flow for
all the LB VIPs of the logical_switch with this MAC address.

Usecase: With this change, CMS can set the lb_vip_mac value to same as
the default gateway MAC. This allows CMS to allocate VIP of the Load
balancer from any subnet prefix.

Signed-off-by: Priyankar Jain 
---
 northd/northd.c |  71 ++
 northd/northd.h |   2 +
 northd/ovn-northd.8.xml |  49 ++
 tests/ovn.at| 109 
 4 files changed, 231 insertions(+)

diff --git a/northd/northd.c b/northd/northd.c
index db3cd272e..ebca2c073 100644
--- a/northd/northd.c
+++ b/northd/northd.c
@@ -790,8 +790,11 @@ init_lb_for_datapath(struct ovn_datapath *od)
 {
 if (od->nbs) {
 od->has_lb_vip = ls_has_lb_vip(od);
+od->lb_vip_mac = nullable_xstrdup(
+smap_get(&od->nbs->other_config, "lb_vip_mac"));
 } else {
 od->has_lb_vip = lr_has_lb_vip(od);
+od->lb_vip_mac = NULL;
 }
 }
 
@@ -800,6 +803,9 @@ destroy_lb_for_datapath(struct ovn_datapath *od)
 {
 ovn_lb_ip_set_destroy(od->lb_ips);
 od->lb_ips = NULL;
+
+free(od->lb_vip_mac);
+od->lb_vip_mac = NULL;
 }
 
 /* A group of logical router datapaths which are connected - either
@@ -12204,6 +12210,70 @@ build_lrouter_nat_flows_for_lb(struct ovn_lb_vip 
*lb_vip,
 }
 }
 
+static void
+build_lb_rules_arp_nd_rsp(struct hmap *lflows, struct ovn_lb_datapaths *lb_dps,
+  const struct ovn_datapaths *ls_datapaths,
+  struct ds *match, struct ds *actions)
+{
+if (!lb_dps->n_nb_ls) {
+return;
+}
+
+const struct ovn_northd_lb *lb = lb_dps->lb;
+for (size_t i = 0; i < lb->n_vips; i++) {
+struct ovn_lb_vip *lb_vip = &lb->vips[i];
+
+size_t index;
+BITMAP_FOR_EACH_1 (index, ods_size(ls_datapaths), lb_dps->nb_ls_map) {
+struct ovn_datapath *od = ls_datapaths->array[index];
+if (!od->lb_vip_mac) {
+  continue;
+}
+ds_clear(match);
+ds_clear(actions);
+if (IN6_IS_ADDR_V4MAPPED(&lb_vip->vip)) {
+ds_put_format(match, "arp.tpa == %s && arp.op == 1",
+  lb_vip->vip_str);
+ds_put_format(actions,
+"eth.dst = eth.src; "
+"eth.src = %s; "
+"arp.op = 2; /* ARP reply */ "
+"arp.tha = arp.sha; "
+"arp.sha = %s; "
+"arp.tpa = arp.spa; "
+"arp.spa = %s; "
+"outport = inport; "
+"flags.loopback = 1; "
+"output;",
+od->lb_vip_mac, od->lb_vip_mac,
+lb_vip->vip_str);
+} else {
+ds_put_format(match, "nd_ns && nd.target == %s",
+  lb_vip->vip_str);
+ds_put_format(actions,
+"nd_na { "
+"eth.dst = eth.src; "
+"eth.src = %s; "
+"ip6.src = %s; "
+"nd.target = %s; "
+"nd.tll = %s; "
+"outport = inport; "
+"flags.loopback = 1; "
+"output; "
+"};",
+od->lb_vip_mac,
+lb_vip->vip_str,
+lb_vip->vip_str,
+od->lb_vip_mac);
+}
+ovn_lflow_add_with_hint(lflows, od,
+S_SWITCH_IN_ARP_ND_RSP, 130,
+ds_cstr(match), ds_cstr(actions),
+&lb->nlb->header_);
+}
+}
+}
+
 static void
 build_lswitch_flows_for_lb(struct ovn_lb_datapaths *lb_dps,
struct hmap *lflows,
@@ -12255,6 +12325,7 @@ build_lswitch_flows_for_lb(struct ovn_lb_datapaths 
*lb_dps,
 ls_datapaths, match, action);
 build_lb_rules(lflows, lb_dps, ls_datapaths, features, match, 

Re: [ovs-dev] [PATCH ovn v2] northd: Added svc_monitor_ipv4 in NB_Global options.

2024-02-05 Thread Priyankar Jain

Hi,

On 02/02/24 8:44 am, Mark Michelson wrote:

Hi Priyankar,

I'm not sure I understand the purpose of this patch. Even without this 
patch, you could configure whatever IP address you want in the 
ip_port_mappings as the source address for the health checks. I don't 
see anything in the code that restricts the IP to be within the subnet 
of the backend IP address. And even if there is such a restriction, I 
don't see anything in this patch that lifts that restriction.


There is no restriction from any OVN configuration point of view. But 
consider the scenario where the source IP in the ip_port_mappings 
belongs to the subnet outside of the backend VM. In that case, the 
backend VM will receive the health probe correctly, but when it tries to 
send the reply back to the probe, the VM tries to send packet to the 
router GW instead (since IP belongs to diff subnet than that of VM.). 
Hence, the reverse direction packet never reaches the OVN controller 
because dst-mac is set to the VM subnet's GW.


Sample tcpdump on the Backend VM interface:

source ip:169.254.1.21, svc_monitor_mac = 52:89:2d:d7:12:d0, backend VM 
mac: 50:6b:8d:15:c9:0e, backend VM IP: 50.0.0.93


 04:23:18.080058 52:89:2d:d7:12:d0 > 50:6b:8d:15:c9:0e, ethertype IPv4 
(0x0800), length 54: (tos 0x0, ttl 63, id 0, offset 0, flags [DF], proto 
TCP (6), length 40)
    169.254.1.21.54010 > 50.0.0.93.80: Flags [S], cksum 0xa921 
(correct), seq 1988485367, win 65160, length 0
04:23:18.080512 50:6b:8d:15:c9:0e > e0:19:95:6d:37:58, ethertype IPv4 
(0x0800), length 58: (tos 0x0, ttl 64, id 0, offset 0, flags [DF], proto 
TCP (6), length 44)
    50.0.0.93.80 > 169.254.1.21.54010: Flags [S.], cksum 0xdd8e 
(incorrect -> 0x7c1d), seq 2754347589, ack 1988485368, win 28040, 
options [mss 1402], length 0


Notice in the reply from the VM, the destination MAC is set to the 
default GW's MAC. Hence, it is not delivered to the ovn-controller.


With my patch, the user can set an IP address for the service monitor in 
NB_GLOBAL and use the same IP address in the ip-port-mappings. This way 
even if the dst mac is GW, as long as it is destined to svc monitor IP, 
it will be delivered to OVN controller.




The extra matches added in this patch (eth.dst == $svc_monitor_mac || 
ip4.dst == $svc_monitor_ip4) don't make much sense to me either. When 
OVN sends a health check, the reply from the backend will have eth.dst 
set to the service monitor MAC address. Since the existing matches do 
not care what the destination IPv4 address is, the existing match 
should work fine. If there was something in the code that was dropping 
packets addressed to IP addresses outside the subnet of the configured 
load balancer backends, this patch doesn't appear to be doing anything 
to prevent such droppage.


We didn't notice this behaviour. The VM was sending the reply to the 
default gateway mac address.
And finally, if I'm mistaken about my above two points, then my final 
problem with this patch is that it only adds support for IPv4. I 
noticed that in the documentation for the Service_Monitor table in the 
southbound database, it claims that "Service monitoring for IPv6 
services is not supported." And the "src_ip" field is documented as 
"Source IPv4 address to use in the service monitor packet." 
Unfortunately, these are inaccurate. IPv6 service monitoring was added 
as a feature about 3 months after the initial IPv4 service monitoring 
was added. Unfortunately, it looks as though we did not update the 
documentation, and we need to get this fixed. However, the northbound 
load balancer health check documentation makes it clear that IPv6 is 
supported, and the code also has support for IPv6 in it. So if I'm 
incorrect in the first two paragraphs and this option is really 
needed, then it needs to support IPv6 as well.


ohh my bad. I forgot about IPv6. I'll add for it as well.

Thanks,
Priyankar



Thanks,
Mark Michelson

On 1/8/24 08:00, Priyankar Jain wrote:

This commit adds a new optional svc_monitor_ipv4 config in the NB_Global
options column. This IP address can be used to send the packets to the
OVN controller bypassing most of the logical switches pipelines.

Usage: Currently Load balancer health check requires a source IP address
to be allocated from the subnets where backends are present.
This change removes that requirement and instead user can give one IP
address for the system and set it in NB_Global:options:svc_monitor_ipv4.
While configuring the health check for the load balancer user can
provide this same IP while specifying the ip-port mappings.

Signed-off-by: Priyankar Jain 
---

Changes since v1:
  - Fixed one testcase in ovn-northd.at

---
  northd/en-sync-sb.c | 10 +
  northd/northd.c | 34 +--
  northd/northd.h |  1 +
  northd/ovn-northd.8.xml | 40 +++--
  tests/ovn-northd.at | 96 

Re: [ovs-dev] [PATCH ovn] northd: Added lb_vip_mac config option in Logical_Switch.

2024-02-05 Thread Priyankar Jain

Hi,

On 01/02/24 5:15 am, Numan Siddique wrote:

On Mon, Jan 8, 2024 at 8:13 AM Priyankar Jain
 wrote:

Currently load balancer applied to a logical switch has the
following restriction:
- VIP of the load balancer cannot reside in the subnet prefix as the
   clients as OVN does not install ARP responder flows for the LB VIP.


Hi Priyankar,

Sorry for the late reviews.

No worries. Thanks for you comments on the patch.


The above statement is actually not correct.  OVN does allow the VIP
of the load balancer
to be from the same subnet prefix.   But in order for this to work,
this logical switch
has to be associated with a logical router.

My bad. I would repurpose the commit message to make it more clearer.


Eg.
--
ovn-nbctl ls-add sw0

ovn-nbctl lsp-add sw0 sw0-p1
ovn-nbctl lsp-set-addresses sw0-p1 "50:54:00:00:00:03 10.0.0.3"
ovn-nbctl lsp-set-port-security sw0-p1 "50:54:00:00:00:03 10.0.0.3"

ovn-nbctl lr-add lr0
ovn-nbctl lrp-add lr0 lr0-sw0 00:00:00:00:ff:01 10.0.0.1/24
ovn-nbctl lsp-add sw0 sw0-lr0
ovn-nbctl lsp-set-type sw0-lr0 router
ovn-nbctl lsp-set-addresses sw0-lr0 router
ovn-nbctl lsp-set-options sw0-lr0 router-port=lr0-sw0

ovn-nbctl lb-add lb1 10.0.0.10:80 10.0.0.3:80,20.0.0.3:80
ovn-nbctl --wait=sb ls-lb-add sw0 lb1

# This will work once you do
ovn-nbctl lr-lb-add lr0 lb1
---

Do you have a use case where you attach a load balancer to a logical switch
and this logical switch doesn't connect to any logical router ?   If
so, then perhaps we can consider
this patch.  However if you always attach a logical router to a
logical switch (like how its
done in the test case added in this patch),  then just attaching the
lb to the router would do.


There are actually 2 usecases for us:

1. Logical switch (with localnet port) is not connected to any logical 
router. Routing is handled by the underlay here. And we have LB attached 
to logical switch.


2. Load balancer only applied to logical switches. Say we don't want the 
load balancer to be accessible from outside world (NS connectivity). 
Initially i thought it logical_router_policy can help but these gets 
applied (lr_in_policy) only after the LB (lr_in_defrag). Hence, policies 
see only translated addresses. In this case the LB is only applied to 
the logical switches and not to the logical routers.




Can you please confirm if this is sufficient for your use case ?
Perhaps we should document it in OVN :)

Thanks
Numan



This change adds a new config option "lb_vip_mac" in the logical_switch
table which is expected to be a MAC address. If the logical_switch has
this option configured, northd will program an ARP responder flow for
all the LB VIPs of the logical_switch with this MAC address.

Usecase: With this change, CMS can set the lb_vip_mac value to same as
the default gateway MAC. This allows CMS to allocate VIP of the Load
balancer from any subnet prefix.

Signed-off-by: Priyankar Jain 





---
  northd/northd.c |  71 ++
  northd/northd.h |   2 +
  northd/ovn-northd.8.xml |  49 ++
  tests/ovn.at| 109 
  4 files changed, 231 insertions(+)

diff --git a/northd/northd.c b/northd/northd.c
index db3cd272e..ebca2c073 100644
--- a/northd/northd.c
+++ b/northd/northd.c
@@ -790,8 +790,11 @@ init_lb_for_datapath(struct ovn_datapath *od)
  {
  if (od->nbs) {
  od->has_lb_vip = ls_has_lb_vip(od);
+od->lb_vip_mac = nullable_xstrdup(
+smap_get(&od->nbs->other_config, "lb_vip_mac"));
  } else {
  od->has_lb_vip = lr_has_lb_vip(od);
+od->lb_vip_mac = NULL;
  }
  }

@@ -800,6 +803,9 @@ destroy_lb_for_datapath(struct ovn_datapath *od)
  {
  ovn_lb_ip_set_destroy(od->lb_ips);
  od->lb_ips = NULL;
+
+free(od->lb_vip_mac);
+od->lb_vip_mac = NULL;
  }

  /* A group of logical router datapaths which are connected - either
@@ -12204,6 +12210,70 @@ build_lrouter_nat_flows_for_lb(struct ovn_lb_vip 
*lb_vip,
  }
  }

+static void
+build_lb_rules_arp_nd_rsp(struct hmap *lflows, struct ovn_lb_datapaths *lb_dps,
+  const struct ovn_datapaths *ls_datapaths,
+  struct ds *match, struct ds *actions)
+{
+if (!lb_dps->n_nb_ls) {
+return;
+}
+
+const struct ovn_northd_lb *lb = lb_dps->lb;
+for (size_t i = 0; i < lb->n_vips; i++) {
+struct ovn_lb_vip *lb_vip = &lb->vips[i];
+
+size_t index;
+BITMAP_FOR_EACH_1 (index, ods_size(ls_datapaths), lb_dps->nb_ls_map) {
+struct ovn_datapath *od = ls_datapaths->array[index];
+if (!od->lb_vip_mac) {
+  continue;
+}
+ds_clear(match);
+ds_clear(actions);
+if (IN6_IS_ADDR_V4MAPPED(&lb_vip->vi

[ovs-dev] [PATCH ovn] Remove lport from related_lports if there is no binding

2023-05-30 Thread Priyankar Jain
Currently during port migration, two chassis (source and destination)
can try to claim the same logical switch port simultaneously for a
short-period of time until the tap is deleted on source hypervisor.
ovn-controllers on these 2 hosts constantly receives port-binding
updates about other chassis claiming the port and as a result it tries
to claim the port again (because its chassis has a tap interface
referencing the LSP). This flapping ends once CMS cleans up tap
interface from the source chassis.

Now following steps occur during a single iteration inc-proc-eng during
flapping:

1. PB update received on OVN controller about other chassis owning the
   port.
2. ovn-controller tries to claim the port.
3. It installs the OVS flows for the port and updates the runtime_data
   to include this port in locally relevant ports.
4. If some change to runtime data happens as part of 3, port-groups
   containing the affected ports are recomputed. It uses related_lports
   runtime data to compute the port-groups.

Finally, ovn-controller sends a port-binding update to SB changing the
chassis to itself.
At a later point of time, SB sends the notification to ovn-controller
about (4) being completed.

Once CMS deletes the tap interface, ovn-controller receives the
notification and updates the runtime data accordingly.

Issue: ovs-flows are (sometimes)not cleaned up upon port migration.

If the notification of OVS interface deletion is received before SB
acks the PortBinding update, then ovn-controller does not cleanup
related_lports leading to incorrect port-groups computation.

i.e if the order of events is as follows:

1. PB update received on OVN controller about other chassis owning the
   port.
2. ovn-controller claims the port, installs OVS flows and sends the
   PortBinding update to SB.
3. OVS interface deletion notification received by ovn-controller.
4. SB ack received for step-2 PB update.

This commit fixes this issue by removing the logical_port from related
port even in case there is no binding available locally.

Signed-off-by: Priyankar Jain 
---
 controller/binding.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/controller/binding.c b/controller/binding.c
index 9b0647b70..9889be5c7 100644
--- a/controller/binding.c
+++ b/controller/binding.c
@@ -1568,6 +1568,7 @@ consider_vif_lport_(const struct sbrec_port_binding *pb,
 || is_additional_chassis(pb, b_ctx_in->chassis_rec)) {
 /* Release the lport if there is no lbinding. */
 if (!lbinding_set || !can_bind) {
+remove_related_lport(pb, b_ctx_out);
 return release_lport(pb, b_ctx_in->chassis_rec,
  !b_ctx_in->ovnsb_idl_txn,
  b_ctx_out->tracked_dp_bindings,
-- 
2.37.1 (Apple Git-137.1)

___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH ovn] Remove lport from related_lports if there is no binding

2023-06-01 Thread Priyankar Jain

Hi Mark,

Thanks for the review.

On 01/06/23 12:29 am, Mark Michelson wrote:

Hi Priyankar,

The description makes the issue crystal clear, and you appear to be 
solving the race condition that can happen between the OVS interface 
table and the southbound port_binding table.


Acked-by: Mark Michelson 

Just to let you know, the flapping problem you mention can be avoided 
altogether by using options:requested-chassis on the northbound logical 
switch port. When you migrate the port to a new chassis, place the new 
chassis's name or hostname as this option, and ovn-controller will only 
claim the logical switch port on that chassis. The old chassis will not 
try to claim the port even if the tap is still present.




Thanks for the suggestion. I'll definitely try out this.
Appreciate all the help!

Regards,
Priyankar

I wouldn't be surprised if there were other ways to trigger this race 
condition as well. I suspect the port-flapping scenario is most likely 
to trigger it, though.


On 5/31/23 01:35, Priyankar Jain wrote:

Currently during port migration, two chassis (source and destination)
can try to claim the same logical switch port simultaneously for a
short-period of time until the tap is deleted on source hypervisor.
ovn-controllers on these 2 hosts constantly receives port-binding
updates about other chassis claiming the port and as a result it tries
to claim the port again (because its chassis has a tap interface
referencing the LSP). This flapping ends once CMS cleans up tap
interface from the source chassis.

Now following steps occur during a single iteration inc-proc-eng during
flapping:

1. PB update received on OVN controller about other chassis owning the
    port.
2. ovn-controller tries to claim the port.
3. It installs the OVS flows for the port and updates the runtime_data
    to include this port in locally relevant ports.
4. If some change to runtime data happens as part of 3, port-groups
    containing the affected ports are recomputed. It uses related_lports
    runtime data to compute the port-groups.

Finally, ovn-controller sends a port-binding update to SB changing the
chassis to itself.
At a later point of time, SB sends the notification to ovn-controller
about (4) being completed.

Once CMS deletes the tap interface, ovn-controller receives the
notification and updates the runtime data accordingly.

Issue: ovs-flows are (sometimes)not cleaned up upon port migration.

If the notification of OVS interface deletion is received before SB
acks the PortBinding update, then ovn-controller does not cleanup
related_lports leading to incorrect port-groups computation.

i.e if the order of events is as follows:

1. PB update received on OVN controller about other chassis owning the
    port.
2. ovn-controller claims the port, installs OVS flows and sends the
    PortBinding update to SB.
3. OVS interface deletion notification received by ovn-controller.
4. SB ack received for step-2 PB update.

This commit fixes this issue by removing the logical_port from related
port even in case there is no binding available locally.

Signed-off-by: Priyankar Jain 
---
  controller/binding.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/controller/binding.c b/controller/binding.c
index 9b0647b70..9889be5c7 100644
--- a/controller/binding.c
+++ b/controller/binding.c
@@ -1568,6 +1568,7 @@ consider_vif_lport_(const struct 
sbrec_port_binding *pb,

  || is_additional_chassis(pb, b_ctx_in->chassis_rec)) {
  /* Release the lport if there is no lbinding. */
  if (!lbinding_set || !can_bind) {
+    remove_related_lport(pb, b_ctx_out);
  return release_lport(pb, b_ctx_in->chassis_rec,
   !b_ctx_in->ovnsb_idl_txn,
   b_ctx_out->tracked_dp_bindings,



___
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev


Re: [ovs-dev] [PATCH ovn] Remove lport from related_lports if there is no binding

2023-07-13 Thread Priyankar Jain

Hi Dumitru,

On 12/07/23 4:12 pm, Dumitru Ceara wrote:

On 6/6/23 14:10, Xavier Simonart wrote:

Hi Priyankar, Mark

Thanks for the patch. I agree with Mark - the description is really great !
Based on your description, I tried creating a unit-test reproducing the
issue, and checking that your patch fixes it.
I came up with [0]. It reproduces some issues (flows not deleted) on
origin/main, and the issues fixed using your patch. So, it looks good.

However, if, in [0] I remove the "sleep 2" (see below), then it seems that
there are still some issues.
It might not be exactly the same issue you saw, but is very similar - the
same flow does not get properly deleted.
I think that the (new) issue is the following:
When a port is claimed by two different chassis (as part of the migration),
ovn-controllers try to avoid a "war" between themselves, and postpone port
claiming if the port got claimed very recently.
This works fine. But, if, while a port claim is postponed, the interface is
deleted, it seems that some flows are not properly removed.
Checking that the port is postponed in is_binding_lport_this_chassis might
be enough, but this requires additional check.
(if we want to add this unit test to the patch, then we probably need to
move some of the functions to ovn-macros to avoid duplication, as I steal
them from the system tests)


Hi Priyankar,

Thanks for the fix!

I guess we have two options:

(1) go ahead with your patch but also include Xavier's test.
(2) figure out a more generic solution which covers the second problem
reported by Xavier too and respin this patch as a v2 that fixes all cases.

I'd prefer we go for (2) mainly because we know there's a problem but
also because the "sleep 2" in the test case makes me uneasy.

Do you have some input on this?


I concur. We should go ahead with (2). Lately I was not able to spend 
time on the patch. I'll try to spend some cycles working on the generic 
fix for the issues forementioned.


P.S: Thanks a lot Xavier for providing an automated testcase. Appreciate 
your support.


Thanks,
Priyankar


Regards,
Dumitru



Thanks
Xavier

[0]
OVN_FOR_EACH_NORTHD([
AT_SETUP([)
ovn_start

ovn-nbctl ls-add ls0
ovn-nbctl lsp-add ls0 lsp0
ovn-nbctl lsp-add ls0 lsp1

net_add n1
for i in 1 2; do
 sim_add hv$i
 as hv$i
 ovs-vsctl add-br br-phys
 ovn_attach n1 br-phys 192.168.0.$i
 ovn-appctl vlog/set dbg
done

sleep_sb() {
   echo SB going to sleep
   AT_CHECK([kill -STOP $(cat ovn-sb/ovsdb-server.pid)])
}
wake_up_sb() {
   echo SB waking up
   AT_CHECK([kill -CONT $(cat ovn-sb/ovsdb-server.pid)])
}
sleep_controller() {
   hv=$1
   echo Controller $hv going to sleep
   as $hv ovn-appctl debug/pause
   OVS_WAIT_UNTIL([test x$(as $hv ovn-appctl -t ovn-controller debug/status)
= "xpaused"])
}

wake_up_controller() {
   hv=$1
   echo Controller $hv going to wake up
   as $hv ovn-appctl debug/resume
   OVS_WAIT_UNTIL([test x$(as $hv ovn-appctl -t ovn-controller debug/status)
= "xrunning"])
}
get_flows()
{
 hv=$1
 out=oflows${2}
 as $hv ovs-ofctl dump-flows br-int |
 sed -e 's/cookie=[[0-9.]]*s, //g' |
 sed -e 's/duration=[[0-9.]]*s, //g' |
 sed -e 's/idle_age=[[0-9]]*, //g' |
 sed -e 's/n_packets=[[0-9]]*, //g' |
 sed -e 's/n_bytes=[[0-9]]*, //g' | sort -k2 > $out
 AT_CAPTURE_FILE([$out])
}

check ovn-nbctl --wait=hv sync
hv1_uuid=$(ovn-sbctl --bare --columns _uuid list chassis hv1)
hv2_uuid=$(ovn-sbctl --bare --columns _uuid list chassis hv2)

as hv1 ovs-vsctl -- add-port br-int vif1 -- set Interface vif1
external-ids:iface-id=lsp1
as hv1 ovs-vsctl -- add-port br-int vif -- set Interface vif
external-ids:iface-id=lsp0
check ovn-nbctl pg-add pg1 lsp0 lsp1
wait_for_ports_up
check ovn-nbctl --wait=hv acl-add ls0 to-lport 1000 'eth.type == 0x1234 &&
outport == @pg1' drop

# Delete vif => store flows w/ only vif1, and no vif
as hv1 ovs-vsctl -- del-port br-int vif
check ovn-nbctl --wait=hv sync
get_flows hv1 1

AS_BOX([sleeping hv1, binding hv1 and hv2])
sleep_controller hv1
as hv2 ovs-vsctl -- add-port br-int vif -- set Interface vif
external-ids:iface-id=lsp0
as hv1 ovs-vsctl -- add-port br-int vif -- set Interface vif
external-ids:iface-id=lsp0

OVS_WAIT_UNTIL([
 chassis=$(ovn-sbctl --bare --columns chassis list port_binding lsp0)
 test "$chassis" = $hv2_uuid
])

AS_BOX([sleeping sb, waking up hv1, Sleeping hv2])
sleep_sb
wake_up_controller hv1
sleep_controller hv2
*sleep 2*
AS_BOX([Unbinding hv1, waking up sb, waking up hv2])

as hv1 ovs-vsctl -- del-port br-int vif
wake_up_sb
wake_up_controller hv2

AS_BOX([Unbinding hv2])
as hv2 ovs-vsctl -- del-port br-int vif
check ovn-nbctl --wait=hv sync

get_flows hv1 2

check diff oflows1 oflows2
OVN_CLEANUP([hv1],[hv2])

AT_CLEANUP
])

~




On Thu, Jun 1, 2023 at 8:21 PM Priyankar Jain 
wro

[ovs-dev] [PATCH ovn] controller: Fix an issue wrt cleanup of stale patch port.

2024-03-28 Thread Priyankar Jain
Issue:
Upon updating the network_name option of localnet port from one physical
bridge to another, ovn-controller fails to cleanup the peer localnet
patch port from the old bridge and ends up creating a duplicate peer
localnet patch port which fails in the following ovsdb transaction:

```
"Transaction causes multiple rows in \"Interface\" table to have
identical values
(patch-localnet_a7d47490-8a90-4c4d-8266-2915ad07c185-to-br-int)
```

Current workflow:
1. Keep a set of all existing localnet ports on br-int. Let us call this
   set: existing_ports.
2. For each localnet port in SB:
   2.1 Create a patch port on br-int. (if it already exists on br-int,
   do not create but remove the entry from exisitng_ports set).
   2.2 Create a peer patch port on br-phys-x. (if it already exists on the
   bridge, do not create but remove the entry from exisitng_ports set).
3. Finally, cleanup all the ports and its peers from exisiting_ports.

With the above workflow, upon network_name change of localnet LSP, since
ports on br-int does not change, only peer port needs to be move from
br-phys-x to br-phys-y, the localnet port is removed from
exisiting_ports in #2.1 and its peer never becomes eligible for cleanup.

Fix:
Include both patch port on br-int and its peer port in the
exisiting_ports set as part of #1.
This make sures that the peer port is only removed from existing_ports
only if it is already present on the correct bridge. (#2.1/#2.2)
Otherwise, during the cleanup in #3 it will be considered.

Fixes: b600316 ("Don't delete patch ports that don't belong to br-int")
Signed-off-by: Priyankar Jain 
---
 controller/patch.c |  32 +++--
 tests/ovn.at   | 109 +
 2 files changed, 124 insertions(+), 17 deletions(-)

diff --git a/controller/patch.c b/controller/patch.c
index c1cd5060d..4fed6e375 100644
--- a/controller/patch.c
+++ b/controller/patch.c
@@ -314,6 +314,21 @@ patch_run(struct ovsdb_idl_txn *ovs_idl_txn,
 || smap_get(&port->external_ids, "ovn-l3gateway-port")
 || smap_get(&port->external_ids, "ovn-logical-patch-port")) {
 shash_add(&existing_ports, port->name, port);
+/* Also add peer ports to the list. */
+for (size_t j = 0; j < port->n_interfaces; j++) {
+struct ovsrec_interface *p_iface = port->interfaces[j];
+if (strcmp(p_iface->type, "patch")) {
+continue;
+}
+const char *peer_name = smap_get(&p_iface->options, "peer");
+if (peer_name) {
+const struct ovsrec_port *peer_port =
+get_port(ovsrec_port_by_name, peer_name);
+if (peer_port) {
+shash_add(&existing_ports, peer_port->name, peer_port);
+}
+}
+}
 }
 }
 
@@ -336,23 +351,6 @@ patch_run(struct ovsdb_idl_txn *ovs_idl_txn,
  * ovn-controller.  Otherwise it may cause unncessary dataplane
  * interruption during restart/upgrade. */
 if (!daemon_started_recently()) {
-/* delete peer patch port first */
-for (size_t i = 0; i < port->n_interfaces; i++) {
-struct ovsrec_interface *iface = port->interfaces[i];
-if (strcmp(iface->type, "patch")) {
-continue;
-}
-const char *iface_peer = smap_get(&iface->options, "peer");
-if (iface_peer) {
-const struct ovsrec_port *peer_port =
-get_port(ovsrec_port_by_name, iface_peer);
-if (peer_port) {
-remove_port(bridge_table, peer_port);
-}
-}
-}
-
-/* now delete integration bridge patch port */
 remove_port(bridge_table, port);
 }
 }
diff --git a/tests/ovn.at b/tests/ovn.at
index 4d0c7ad53..487e727c0 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -36454,6 +36454,115 @@ OVN_CLEANUP([hv1])
 AT_CLEANUP
 ])
 
+OVN_FOR_EACH_NORTHD([
+AT_SETUP([ovn-controller update network_name option for localnet port])
+ovn_start
+net_add n1
+
+sim_add hv1
+as hv1
+
+# Bridge Topology
+# Initial: br-int and br-phys-1 connected through ovn-localnet patch port.
+#
+# br-phys-1 -- br-int
+#
+# Final: br-int and br-phys-3 connected through ovn-localnet patch port.
+# Similarly br-int-2 and br-hys-2 connected through localnet patch port
+# but not owned by this controller.
+#
+# br-phys-1  br-int -- br-phys-3
+# br-int-2 -- br-phys-2
+
+# Create bridges
+ovs-vsctl add-br br-int
+ovs-vsctl add-br br-int-2
+ovs-vsctl add-br br-phys-1
+ovs-vsctl add-br br-phys-2
+ovs-vsctl add-br br-phys-3
+ovs-vsctl set open . 

[ovs-dev] [PATCH ovn v2] controller: Fix an issue wrt cleanup of stale patch port.

2024-04-21 Thread Priyankar Jain
Issue:
Upon updating the network_name option of localnet port from one physical
bridge to another, ovn-controller fails to cleanup the peer localnet
patch port from the old bridge and ends up creating a duplicate peer
localnet patch port which fails in the following ovsdb transaction:

```
"Transaction causes multiple rows in \"Interface\" table to have
identical values
(patch-localnet_a7d47490-8a90-4c4d-8266-2915ad07c185-to-br-int)
```

Current workflow:
1. Keep a set of all existing localnet ports on br-int. Let us call this
   set: existing_ports.
2. For each localnet port in SB:
   2.1 Create a patch port on br-int. (if it already exists on br-int,
   do not create but remove the entry from exisitng_ports set).
   2.2 Create a peer patch port on br-phys-x. (if it already exists on the
   bridge, do not create but remove the entry from exisitng_ports set).
3. Finally, cleanup all the ports and its peers from exisiting_ports.

With the above workflow, upon network_name change of localnet LSP, since
ports on br-int does not change, only peer port needs to be move from
br-phys-x to br-phys-y, the localnet port is removed from
exisiting_ports in #2.1 and its peer never becomes eligible for cleanup.

Fix:
Include both patch port on br-int and its peer port in the
exisiting_ports set as part of #1.
This make sures that the peer port is only removed from existing_ports
only if it is already present on the correct bridge. (#2.1/#2.2)
Otherwise, during the cleanup in #3 it will be considered.

Fixes: b600316f252a ("Don't delete patch ports that don't belong to br-int")
Signed-off-by: Priyankar Jain 
---
 controller/patch.c |  32 +++--
 tests/ovn.at   | 109 +
 2 files changed, 124 insertions(+), 17 deletions(-)

diff --git a/controller/patch.c b/controller/patch.c
index c1cd5060d..4fed6e375 100644
--- a/controller/patch.c
+++ b/controller/patch.c
@@ -314,6 +314,21 @@ patch_run(struct ovsdb_idl_txn *ovs_idl_txn,
 || smap_get(&port->external_ids, "ovn-l3gateway-port")
 || smap_get(&port->external_ids, "ovn-logical-patch-port")) {
 shash_add(&existing_ports, port->name, port);
+/* Also add peer ports to the list. */
+for (size_t j = 0; j < port->n_interfaces; j++) {
+struct ovsrec_interface *p_iface = port->interfaces[j];
+if (strcmp(p_iface->type, "patch")) {
+continue;
+}
+const char *peer_name = smap_get(&p_iface->options, "peer");
+if (peer_name) {
+const struct ovsrec_port *peer_port =
+get_port(ovsrec_port_by_name, peer_name);
+if (peer_port) {
+shash_add(&existing_ports, peer_port->name, peer_port);
+}
+}
+}
 }
 }
 
@@ -336,23 +351,6 @@ patch_run(struct ovsdb_idl_txn *ovs_idl_txn,
  * ovn-controller.  Otherwise it may cause unncessary dataplane
  * interruption during restart/upgrade. */
 if (!daemon_started_recently()) {
-/* delete peer patch port first */
-for (size_t i = 0; i < port->n_interfaces; i++) {
-struct ovsrec_interface *iface = port->interfaces[i];
-if (strcmp(iface->type, "patch")) {
-continue;
-}
-const char *iface_peer = smap_get(&iface->options, "peer");
-if (iface_peer) {
-const struct ovsrec_port *peer_port =
-get_port(ovsrec_port_by_name, iface_peer);
-if (peer_port) {
-remove_port(bridge_table, peer_port);
-}
-}
-}
-
-/* now delete integration bridge patch port */
 remove_port(bridge_table, port);
 }
 }
diff --git a/tests/ovn.at b/tests/ovn.at
index 4d0c7ad53..4b71e4916 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -36454,6 +36454,115 @@ OVN_CLEANUP([hv1])
 AT_CLEANUP
 ])
 
+OVN_FOR_EACH_NORTHD([
+AT_SETUP([ovn-controller update network_name option for localnet port])
+ovn_start
+net_add n1
+
+sim_add hv1
+as hv1
+
+# Bridge Topology
+# Initial: br-int and br-phys-1 connected through ovn-localnet patch port.
+#
+# br-phys-1 -- br-int
+#
+# Final: br-int and br-phys-3 connected through ovn-localnet patch port.
+# Similarly br-int-2 and br-hys-2 connected through localnet patch port
+# but not owned by this controller.
+#
+# br-phys-1  br-int -- br-phys-3
+# br-int-2 -- br-phys-2
+
+# Create bridges
+check ovs-vsctl add-br br-int
+check ovs-vsctl add-br br-int-2
+check ovs-vsctl add-br br-phys-1
+check ovs-vsctl add-br br-phys-2
+check ovs-vsct

Re: [ovs-dev] [PATCH ovn v2] controller: Fix an issue wrt cleanup of stale patch port.

2024-05-14 Thread Priyankar Jain


On 10/05/24 10:06 pm, Numan Siddique wrote:

On Fri, May 3, 2024 at 11:00 AM Numan Siddique  wrote:

On Mon, Apr 22, 2024 at 2:49 AM Priyankar Jain
 wrote:

Issue:
Upon updating the network_name option of localnet port from one physical
bridge to another, ovn-controller fails to cleanup the peer localnet
patch port from the old bridge and ends up creating a duplicate peer
localnet patch port which fails in the following ovsdb transaction:

```
"Transaction causes multiple rows in \"Interface\" table to have
identical values
(patch-localnet_a7d47490-8a90-4c4d-8266-2915ad07c185-to-br-int)
```

Current workflow:
1. Keep a set of all existing localnet ports on br-int. Let us call this
set: existing_ports.
2. For each localnet port in SB:
2.1 Create a patch port on br-int. (if it already exists on br-int,
do not create but remove the entry from exisitng_ports set).
2.2 Create a peer patch port on br-phys-x. (if it already exists on the
bridge, do not create but remove the entry from exisitng_ports set).
3. Finally, cleanup all the ports and its peers from exisiting_ports.

With the above workflow, upon network_name change of localnet LSP, since
ports on br-int does not change, only peer port needs to be move from
br-phys-x to br-phys-y, the localnet port is removed from
exisiting_ports in #2.1 and its peer never becomes eligible for cleanup.

Fix:
Include both patch port on br-int and its peer port in the
exisiting_ports set as part of #1.
This make sures that the peer port is only removed from existing_ports
only if it is already present on the correct bridge. (#2.1/#2.2)
Otherwise, during the cleanup in #3 it will be considered.

Fixes: b600316f252a ("Don't delete patch ports that don't belong to br-int")
Signed-off-by: Priyankar Jain 

Acked-by: Numan Siddique 

Applied this patch to main and backported till branch-23.03.  It
doesn't apply cleanly for branch-22.12.

Thanks
Numan

Thanks for merging the changes Numan. Appreciate the help.
22.12 does not have the commit which introduced this regression:

b600316f252a ("Don't delete patch ports that don't belong to br-int")

So i think we are good.

Thanks,
Priyankar


Numan


---
  controller/patch.c |  32 +++--
  tests/ovn.at   | 109 +
  2 files changed, 124 insertions(+), 17 deletions(-)

diff --git a/controller/patch.c b/controller/patch.c
index c1cd5060d..4fed6e375 100644
--- a/controller/patch.c
+++ b/controller/patch.c
@@ -314,6 +314,21 @@ patch_run(struct ovsdb_idl_txn *ovs_idl_txn,
  || smap_get(&port->external_ids, "ovn-l3gateway-port")
  || smap_get(&port->external_ids, "ovn-logical-patch-port")) {
  shash_add(&existing_ports, port->name, port);
+/* Also add peer ports to the list. */
+for (size_t j = 0; j < port->n_interfaces; j++) {
+struct ovsrec_interface *p_iface = port->interfaces[j];
+if (strcmp(p_iface->type, "patch")) {
+continue;
+}
+const char *peer_name = smap_get(&p_iface->options, "peer");
+if (peer_name) {
+const struct ovsrec_port *peer_port =
+get_port(ovsrec_port_by_name, peer_name);
+if (peer_port) {
+shash_add(&existing_ports, peer_port->name, peer_port);
+}
+}
+}
  }
  }

@@ -336,23 +351,6 @@ patch_run(struct ovsdb_idl_txn *ovs_idl_txn,
   * ovn-controller.  Otherwise it may cause unncessary dataplane
   * interruption during restart/upgrade. */
  if (!daemon_started_recently()) {
-/* delete peer patch port first */
-for (size_t i = 0; i < port->n_interfaces; i++) {
-struct ovsrec_interface *iface = port->interfaces[i];
-if (strcmp(iface->type, "patch")) {
-continue;
-}
-const char *iface_peer = smap_get(&iface->options, "peer");
-if (iface_peer) {
-const struct ovsrec_port *peer_port =
-get_port(ovsrec_port_by_name, iface_peer);
-if (peer_port) {
-remove_port(bridge_table, peer_port);
-}
-}
-}
-
-/* now delete integration bridge patch port */
  remove_port(bridge_table, port);
  }
  }
diff --git a/tests/ovn.at b/tests/ovn.at
index 4d0c7ad53..4b71e4916 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -36454,6 +36454,115 @@ OVN_CLEANUP([hv1])
  AT_CLEANUP
  ])

+OVN_FOR_EACH_NORTHD([
+AT_SETUP([ovn-controller update network_name option fo