When a logical switch has stateful ACLs (allow-related) or load
balancers configured, all IP traffic is sent to conntrack in the
PRE_ACL and PRE_LB pipeline stages. Traffic from/to remote VTEPs
has no conntrack zone assigned, so conntrack lookups return
ct_state=+trk+inv, causing the traffic to be dropped.
Fix this by adding priority-110 flows that bypass conntrack for
EVPN remote VTEP traffic, identified by the from_evpn_vtep and
to_evpn_vtep predicates. These predicates check bit 31 of the
logical inport/outport registers, which is always set for EVPN
binding keys (OVN_MIN_EVPN_KEY = 1 << 31).
The EVPN skip in PRE_ACL is only added when stateful ACLs are
present (matching the existing pattern for localnet/router/switch
ports). The EVPN skip in PRE_LB is unconditional, unlike localnet
ports which are gated on !has_lb_vip -- remote VTEPs have no
conntrack zones so conntrack would always fail regardless of LB
configuration.
Fixes: 9081afcf8698 ("controller: Create physical flows based on EVPN
structures.")
Reported-at: https://redhat.atlassian.net/browse/FDP-3462
Suggested-by: Ales Musil <[email protected]>
Assisted-by: Claude, with model: claude-opus-4-6
Signed-off-by: Dumitru Ceara <[email protected]>
---
lib/logical-fields.c | 22 +++++++++++
lib/ovn-util.c | 2 +-
northd/northd.c | 37 ++++++++++++++++++
tests/multinode.at | 30 ++++++++++++++
tests/ovn-northd.at | 93 ++++++++++++++++++++++++++++++++++++++++++++
tests/ovn.at | 16 +++++++-
6 files changed, 197 insertions(+), 3 deletions(-)
diff --git a/lib/logical-fields.c b/lib/logical-fields.c
index 9b04762a17..4b8bcfdc6f 100644
--- a/lib/logical-fields.c
+++ b/lib/logical-fields.c
@@ -72,6 +72,28 @@ ovn_init_symtab(struct shash *symtab)
expr_symtab_add_string(symtab, "inport", MFF_LOG_INPORT, NULL);
expr_symtab_add_string(symtab, "outport", MFF_LOG_OUTPORT, NULL);
+ /* Also register the inport/outport backing registers as numeric fields
+ * so that predicates can reference specific bits (e.g., the EVPN key
+ * indicator at bit 31). */
+ char inport_reg[8], outport_reg[8];
+ snprintf(inport_reg, sizeof inport_reg, "reg%d",
+ MFF_LOG_INPORT - MFF_REG0);
+ expr_symtab_add_field(symtab, inport_reg, MFF_LOG_INPORT, NULL, false);
+ snprintf(outport_reg, sizeof outport_reg, "reg%d",
+ MFF_LOG_OUTPORT - MFF_REG0);
+ expr_symtab_add_field(symtab, outport_reg, MFF_LOG_OUTPORT, NULL, false);
+
+ /* EVPN binding keys have bit 31 set (OVN_MIN_EVPN_KEY = 1 << 31).
+ * Define predicates to identify traffic from/to remote VTEPs so that
+ * northd can skip conntrack without hard-coding register indices. */
+ char vtep_pred[64];
+ snprintf(vtep_pred, sizeof vtep_pred,
+ "%s == 0x80000000/0x80000000", inport_reg);
+ expr_symtab_add_predicate(symtab, "from_evpn_vtep", vtep_pred);
+ snprintf(vtep_pred, sizeof vtep_pred,
+ "%s == 0x80000000/0x80000000", outport_reg);
+ expr_symtab_add_predicate(symtab, "to_evpn_vtep", vtep_pred);
+
/* The port isn't reserved along the pipeline it's just defined as symbol
* to support matching on string and moving between string registers. */
expr_symtab_add_string(symtab, "remote_outport",
diff --git a/lib/ovn-util.c b/lib/ovn-util.c
index 65fdb3a59c..fb02825ac4 100644
--- a/lib/ovn-util.c
+++ b/lib/ovn-util.c
@@ -1027,7 +1027,7 @@ ip_address_and_port_from_lb_key(const char *key, char
**ip_address,
* NOTE: If OVN_NORTHD_PIPELINE_CSUM is updated make sure to double check
* whether an update of OVN_INTERNAL_MINOR_VER is required. */
#define OVN_NORTHD_PIPELINE_CSUM "3760014456 11249"
-#define OVN_INTERNAL_MINOR_VER 13
+#define OVN_INTERNAL_MINOR_VER 14
/* Returns the OVN version. The caller must free the returned value. */
char *
diff --git a/northd/northd.c b/northd/northd.c
index bc817073e2..0b52db6cf6 100644
--- a/northd/northd.c
+++ b/northd/northd.c
@@ -6418,6 +6418,31 @@ skip_port_from_conntrack(const struct ovn_datapath *od,
struct ovn_port *op,
free(egress_match);
}
+/* Skip conntrack for traffic from/to EVPN remote VTEPs.
+ * Remote VTEPs do not have conntrack zones assigned, so
+ * conntrack lookups would return +trk+inv and cause drops. */
+static void
+skip_evpn_from_conntrack(const struct ovn_datapath *od,
+ bool has_stateful_acl,
+ const struct ovn_stage *in_stage,
+ const struct ovn_stage *out_stage, uint16_t priority,
+ struct lflow_table *lflows,
+ struct lflow_ref *lflow_ref)
+{
+ if (!od->has_evpn_vni) {
+ return;
+ }
+
+ const char *egress_action = has_stateful_acl
+ ? "next;"
+ : "flags.pkt_sampled = 0; ct_clear; next;";
+
+ ovn_lflow_add(lflows, od, in_stage, priority,
+ "from_evpn_vtep", "next;", lflow_ref);
+ ovn_lflow_add(lflows, od, out_stage, priority,
+ "to_evpn_vtep", egress_action, lflow_ref);
+}
+
static void
build_stateless_filter(const struct ovn_datapath *od,
const struct nbrec_acl *acl,
@@ -6520,6 +6545,10 @@ build_ls_stateful_rec_pre_acls(
lflow_ref);
}
+ skip_evpn_from_conntrack(od, true,
+ S_SWITCH_IN_PRE_ACL, S_SWITCH_OUT_PRE_ACL,
+ 110, lflows, lflow_ref);
+
/* stateless filters always take precedence over stateful ACLs. */
build_stateless_filters(od, ls_port_groups, lflows, lflow_ref);
@@ -6751,6 +6780,14 @@ build_ls_stateful_rec_pre_lb(const struct
ls_stateful_record *ls_stateful_rec,
}
}
+ /* EVPN remote VTEPs do not have conntrack zones, so their traffic
+ * must always skip conntrack regardless of whether LB VIPs are
+ * configured. This differs from localnet ports which DO have
+ * conntrack zones and can participate in load balancing. */
+ skip_evpn_from_conntrack(od, ls_stateful_rec->has_stateful_acl,
+ S_SWITCH_IN_PRE_LB, S_SWITCH_OUT_PRE_LB,
+ 110, lflows, lflow_ref);
+
/* 'REGBIT_CONNTRACK_NAT' is set to let the pre-stateful table send
* packet to conntrack for defragmentation and possibly for unNATting.
*
diff --git a/tests/multinode.at b/tests/multinode.at
index c2587b68ae..d07660797c 100644
--- a/tests/multinode.at
+++ b/tests/multinode.at
@@ -3829,6 +3829,36 @@ OVS_WAIT_UNTIL([m_as ovn-gw-1 ip netns exec
fabric_workload ping -6 -W 1 -c 1 10
OVS_WAIT_UNTIL([m_as ovn-gw-2 ip netns exec fabric_workload ping -W 1 -c 1
10.0.0.12])
OVS_WAIT_UNTIL([m_as ovn-gw-2 ip netns exec fabric_workload ping -6 -W 1 -c 1
10::12])
+AS_BOX([Check EVPN traffic with stateful ACLs])
+dnl Adding a stateful ACL should not break traffic from/to remote VTEPs.
+dnl Without the conntrack skip flows (from_evpn_vtep / to_evpn_vtep),
+dnl conntrack would return +trk+inv for VXLAN traffic and drop it.
+check multinode_nbctl --wait=hv \
+ -- acl-add ls from-lport 100 "ip" allow-related \
+ -- acl-add ls to-lport 100 "ip" allow-related
+
+dnl Verify fabric-to-workload pings still work with stateful ACL.
+OVS_WAIT_UNTIL([m_as ovn-gw-1 ip netns exec fabric_workload ping -W 1 -c 1
10.0.0.11])
+OVS_WAIT_UNTIL([m_as ovn-gw-1 ip netns exec fabric_workload ping -6 -W 1 -c 1
10::11])
+OVS_WAIT_UNTIL([m_as ovn-gw-2 ip netns exec fabric_workload ping -W 1 -c 1
10.0.0.12])
+OVS_WAIT_UNTIL([m_as ovn-gw-2 ip netns exec fabric_workload ping -6 -W 1 -c 1
10::12])
+
+dnl Also add a load balancer and verify pings still work.
+check multinode_nbctl --wait=hv \
+ -- lb-add lb1 10.0.0.100:80 10.0.0.11:80 \
+ -- ls-lb-add ls lb1
+
+OVS_WAIT_UNTIL([m_as ovn-gw-1 ip netns exec fabric_workload ping -W 1 -c 1
10.0.0.11])
+OVS_WAIT_UNTIL([m_as ovn-gw-1 ip netns exec fabric_workload ping -6 -W 1 -c 1
10::11])
+OVS_WAIT_UNTIL([m_as ovn-gw-2 ip netns exec fabric_workload ping -W 1 -c 1
10.0.0.12])
+OVS_WAIT_UNTIL([m_as ovn-gw-2 ip netns exec fabric_workload ping -6 -W 1 -c 1
10::12])
+
+dnl Cleanup ACL and LB.
+check multinode_nbctl --wait=hv \
+ -- acl-del ls \
+ -- ls-lb-del ls lb1 \
+ -- lb-del lb1
+
AS_BOX([Check type-2 MAC+IP EVPN route advertisements])
# Ping from the frr-ns to the fabric workload so that its IP is learned on
# the fabric EVPN peer (and advertised to OVN).
diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
index 796c30daf7..1d7bd6c288 100644
--- a/tests/ovn-northd.at
+++ b/tests/ovn-northd.at
@@ -19026,6 +19026,99 @@ OVN_CLEANUP_NORTHD
AT_CLEANUP
])
+OVN_FOR_EACH_NORTHD_NO_HV([
+AT_SETUP([LS EVPN conntrack skip with stateful ACLs and LBs])
+AT_KEYWORDS([dynamic-routing])
+ovn_start
+
+AS_BOX([EVPN switch, no ACLs or LBs])
+check ovn-nbctl --wait=sb \
+ -- ls-add ls-evpn \
+ -- set logical_switch ls-evpn other_config:dynamic-routing-vni=10 \
+ -- lsp-add ls-evpn lsp0 \
+ -- lsp-set-addresses lsp0 "00:00:00:00:00:01 10.0.0.1"
+
+ovn-sbctl dump-flows ls-evpn > lflows
+
+dnl No stateful ACL, so no EVPN skip flows in pre_acl.
+AT_CHECK([grep 'pre_acl' lflows | grep 'from_evpn_vtep'], [1])
+AT_CHECK([grep 'pre_acl' lflows | grep 'to_evpn_vtep'], [1])
+
+dnl pre_lb EVPN skip flows are always present for EVPN switches.
+AT_CHECK([grep 'pre_lb' lflows | grep 'from_evpn_vtep\|to_evpn_vtep' |
ovn_strip_lflows], [0], [dnl
+ table=??(ls_in_pre_lb ), priority=110 , match=(from_evpn_vtep),
action=(next;)
+ table=??(ls_out_pre_lb ), priority=110 , match=(to_evpn_vtep),
action=(flags.pkt_sampled = 0; ct_clear; next;)
+])
+
+AS_BOX([EVPN switch + stateful ACL])
+check ovn-nbctl --wait=sb acl-add ls-evpn from-lport 100 "ip" allow-related
+
+ovn-sbctl dump-flows ls-evpn > lflows
+
+dnl Stateful ACL present, so EVPN skip flows appear in pre_acl.
+AT_CHECK([grep 'pre_acl' lflows | grep 'from_evpn_vtep\|to_evpn_vtep' |
ovn_strip_lflows], [0], [dnl
+ table=??(ls_in_pre_acl ), priority=110 , match=(from_evpn_vtep),
action=(next;)
+ table=??(ls_out_pre_acl ), priority=110 , match=(to_evpn_vtep),
action=(next;)
+])
+
+dnl pre_lb EVPN skip flows with next; action (has_stateful_acl is true).
+AT_CHECK([grep 'pre_lb' lflows | grep 'from_evpn_vtep\|to_evpn_vtep' |
ovn_strip_lflows], [0], [dnl
+ table=??(ls_in_pre_lb ), priority=110 , match=(from_evpn_vtep),
action=(next;)
+ table=??(ls_out_pre_lb ), priority=110 , match=(to_evpn_vtep),
action=(next;)
+])
+
+AS_BOX([EVPN switch + LB only])
+check ovn-nbctl --wait=sb \
+ -- acl-del ls-evpn \
+ -- lb-add lb1 10.0.0.100:80 10.0.0.1:80 \
+ -- ls-lb-add ls-evpn lb1
+
+ovn-sbctl dump-flows ls-evpn > lflows
+
+dnl No stateful ACL, so no EVPN skip flows in pre_acl.
+AT_CHECK([grep 'pre_acl' lflows | grep 'from_evpn_vtep'], [1])
+AT_CHECK([grep 'pre_acl' lflows | grep 'to_evpn_vtep'], [1])
+
+dnl pre_lb EVPN skip flows with ct_clear egress (no stateful ACL).
+AT_CHECK([grep 'pre_lb' lflows | grep 'from_evpn_vtep\|to_evpn_vtep' |
ovn_strip_lflows], [0], [dnl
+ table=??(ls_in_pre_lb ), priority=110 , match=(from_evpn_vtep),
action=(next;)
+ table=??(ls_out_pre_lb ), priority=110 , match=(to_evpn_vtep),
action=(flags.pkt_sampled = 0; ct_clear; next;)
+])
+
+AS_BOX([EVPN switch + ACL + LB])
+check ovn-nbctl --wait=sb acl-add ls-evpn from-lport 100 "ip" allow-related
+
+ovn-sbctl dump-flows ls-evpn > lflows
+
+dnl Stateful ACL present again, so EVPN skip flows appear in pre_acl.
+AT_CHECK([grep 'pre_acl' lflows | grep 'from_evpn_vtep\|to_evpn_vtep' |
ovn_strip_lflows], [0], [dnl
+ table=??(ls_in_pre_acl ), priority=110 , match=(from_evpn_vtep),
action=(next;)
+ table=??(ls_out_pre_acl ), priority=110 , match=(to_evpn_vtep),
action=(next;)
+])
+
+dnl pre_lb egress action is next; because has_stateful_acl is true.
+AT_CHECK([grep 'pre_lb' lflows | grep 'from_evpn_vtep\|to_evpn_vtep' |
ovn_strip_lflows], [0], [dnl
+ table=??(ls_in_pre_lb ), priority=110 , match=(from_evpn_vtep),
action=(next;)
+ table=??(ls_out_pre_lb ), priority=110 , match=(to_evpn_vtep),
action=(next;)
+])
+
+AS_BOX([Non-EVPN switch + ACL])
+check ovn-nbctl --wait=sb \
+ -- ls-add ls-plain \
+ -- lsp-add ls-plain lsp1 \
+ -- lsp-set-addresses lsp1 "00:00:00:00:00:02 10.0.0.2" \
+ -- acl-add ls-plain from-lport 100 "ip" allow-related
+
+ovn-sbctl dump-flows ls-plain > lflows
+
+dnl Non-EVPN switch must not have any EVPN skip flows.
+AT_CHECK([grep 'from_evpn_vtep' lflows], [1])
+AT_CHECK([grep 'to_evpn_vtep' lflows], [1])
+
+OVN_CLEANUP_NORTHD
+AT_CLEANUP
+])
+
OVN_FOR_EACH_NORTHD_NO_HV([
AT_SETUP([Check network function])
ovn_start
diff --git a/tests/ovn.at b/tests/ovn.at
index cec3bb9a73..939dffc761 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -108,8 +108,11 @@ dnl
dnl When we add or remove registers this test needs to be updated, of course.
AT_SETUP([registers])
AT_CHECK([ovstest test-ovn dump-symtab | grep reg | sort], [0],
-[[reg0 = xxreg0[96..127]
+[[from_evpn_vtep = reg14 == 0x80000000/0x80000000
+reg0 = xxreg0[96..127]
reg1 = xxreg0[64..95]
+reg14 = NXM_NX_REG14
+reg15 = NXM_NX_REG15
reg2 = xxreg0[32..63]
reg3 = xxreg0[0..31]
reg4 = xxreg1[96..127]
@@ -118,6 +121,7 @@ reg6 = xxreg1[32..63]
reg7 = xxreg1[0..31]
reg8 = xreg4[32..63]
reg9 = xreg4[0..31]
+to_evpn_vtep = reg15 == 0x80000000/0x80000000
xreg0 = xxreg0[64..127]
xreg1 = xxreg0[0..63]
xreg2 = xxreg1[64..127]
@@ -128,6 +132,14 @@ xxreg1 = NXM_NX_XXREG1
]])
AT_CLEANUP
+dnl Check EVPN VTEP predicate definitions.
+AT_SETUP([EVPN VTEP fields])
+AT_CHECK([ovstest test-ovn dump-symtab | grep evpn_vtep | sort], [0],
+[[from_evpn_vtep = reg14 == 0x80000000/0x80000000
+to_evpn_vtep = reg15 == 0x80000000/0x80000000
+]])
+AT_CLEANUP
+
dnl Check that the OVN conntrack field definitions are correct.
AT_SETUP([conntrack fields])
AT_CHECK([ovstest test-ovn dump-symtab | grep ^ct | sort], [0],
@@ -2108,7 +2120,7 @@ reg0[[1..3]] = get_fdb(eth.src);
Cannot use 3-bit field reg0[[1..3]] where 32-bit field is required.
reg15 = get_fdb(eth.dst);
- Syntax error at `reg15' expecting field name.
+ encodes as set_field:0->reg15,resubmit(,OFTABLE_GET_FDB)
outport = get_fdb(ip4.dst);
Cannot use 32-bit field ip4.dst[[0..31]] where 48-bit field is required.
--
2.53.0
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev