Thanks
Numan
---
lib/logical-fields.c | 4 +
northd/ovn-northd.8.xml | 49 ++++++++++---
northd/ovn-northd.c | 123 +++++++++++++++++++++++++++----
ovn-architecture.7.xml | 7 +-
ovn-nb.ovsschema | 5 +-
ovn-nb.xml | 16 ++++
tests/ovn.at <http://ovn.at> | 151
++++++++++++++++++++++++++++++++++----
tests/system-ovn.at <http://system-ovn.at> | 144
++++++++++++++++++++++++++++++++++++
utilities/ovn-nbctl.8.xml | 31 ++++++--
utilities/ovn-nbctl.c | 18 ++++-
10 files changed, 496 insertions(+), 52 deletions(-)
diff --git a/lib/logical-fields.c b/lib/logical-fields.c
index fde53a47e..15342dded 100644
--- a/lib/logical-fields.c
+++ b/lib/logical-fields.c
@@ -130,6 +130,10 @@ ovn_init_symtab(struct shash *symtab)
WR_CT_COMMIT);
expr_symtab_add_subfield_scoped(symtab, "ct_label.blocked", NULL,
"ct_label[0]", WR_CT_COMMIT);
+ expr_symtab_add_subfield_scoped(symtab,
"ct_label.ecmp_reply_eth", NULL,
+ "ct_label[32..79]", WR_CT_COMMIT);
+ expr_symtab_add_subfield_scoped(symtab,
"ct_label.ecmp_reply_port", NULL,
+ "ct_label[80..95]", WR_CT_COMMIT);
expr_symtab_add_field(symtab, "ct_state", MFF_CT_STATE, NULL,
false);
diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index eb2514f15..cf251e02a 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -2120,15 +2120,31 @@ icmp6 {
<p>
This is to send packets to connection tracker for tracking and
defragmentation. It contains a priority-0 flow that simply
moves traffic
- to the next table. If load balancing rules with virtual IP
addresses
- (and ports) are configured in <code>OVN_Northbound</code>
database for a
- Gateway router, a priority-100 flow is added for each
configured virtual
- IP address <var>VIP</var>. For IPv4 <var>VIPs</var> the flow
matches
- <code>ip && ip4.dst == <var>VIP</var></code>. For IPv6
- <var>VIPs</var>, the flow matches <code>ip && ip6.dst ==
- <var>VIP</var></code>. The flow uses the action
<code>ct_next;</code>
- to send IP packets to the connection tracker for packet
de-fragmentation
- and tracking before sending it to the next table.
+ to the next table.
+ </p>
+
+ <p>
+ If load balancing rules with virtual IP addresses (and ports) are
+ configured in <code>OVN_Northbound</code> database for a
Gateway router,
+ a priority-100 flow is added for each configured virtual IP
address
+ <var>VIP</var>. For IPv4 <var>VIPs</var> the flow matches
<code>ip
+ && ip4.dst == <var>VIP</var></code>. For IPv6
<var>VIPs</var>,
+ the flow matches <code>ip && ip6.dst ==
<var>VIP</var></code>.
+ The flow uses the action <code>ct_next;</code> to send IP
packets to the
+ connection tracker for packet de-fragmentation and tracking
before
+ sending it to the next table.
+ </p>
+
+ <p>
+ If ECMP routes with symmetric reply are configured in the
+ <code>OVN_Northbound</code> database for a gateway router, a
priority-100
+ flow is added for each router port on which symmetric replies are
+ configured. The matching logic for these ports essentially
reverses the
+ configured logic of the ECMP route. So for instance, a route
with a
+ destination routing policy will instead match if the source
IP address
+ matches the static route's prefix. The flow uses the action
+ <code>ct_next</code> to send IP packets to the connection
tracker for
+ packet de-fragmentation and tracking before sending it to the
next table.
</p>
<h3>Ingress Table 5: UNSNAT</h3>
@@ -2489,7 +2505,15 @@ output;
table. This table, instead, is responsible for determine
the ECMP
group id and select a member id within the group based on
5-tuple
hashing. It stores group id in <code>reg8[0..15]</code> and
member id in
- <code>reg8[16..31]</code>.
+ <code>reg8[16..31]</code>. This step is skipped if the
traffic going
+ out the ECMP route is reply traffic, and the ECMP route was
configured
+ to use symmetric replies. Instead, the stored
<code>ct_label</code> value
+ is used to choose the destination. The least significant 48
bits of the
+ <code>ct_label</code> tell the destination MAC address to
which the
+ packet should be sent. The next 16 bits tell the logical
router port on
+ which the packet should be sent. These values in the
+ <code>ct_label</code> are set when the initial ingress traffic is
+ received over the ECMP route.
</p>
<p>
@@ -2639,6 +2663,11 @@ select(reg8[16..31], <var>MID1</var>,
<var>MID2</var>, ...);
address and <code>reg1</code> as the source protocol address).
</p>
+ <p>
+ This processing is skipped for reply traffic being sent out
of an ECMP
+ route if the route was configured to use symmetric replies.
+ </p>
+
<p>
This table contains the following logical flows:
</p>
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index d10e5ee5d..85f04ccde 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -172,16 +172,17 @@ enum ovn_stage {
PIPELINE_STAGE(ROUTER, IN, DEFRAG, 4,
"lr_in_defrag") \
PIPELINE_STAGE(ROUTER, IN, UNSNAT, 5,
"lr_in_unsnat") \
PIPELINE_STAGE(ROUTER, IN, DNAT, 6, "lr_in_dnat")
\
- PIPELINE_STAGE(ROUTER, IN, ND_RA_OPTIONS, 7,
"lr_in_nd_ra_options") \
- PIPELINE_STAGE(ROUTER, IN, ND_RA_RESPONSE, 8,
"lr_in_nd_ra_response") \
- PIPELINE_STAGE(ROUTER, IN, IP_ROUTING, 9,
"lr_in_ip_routing") \
- PIPELINE_STAGE(ROUTER, IN, IP_ROUTING_ECMP, 10,
"lr_in_ip_routing_ecmp") \
- PIPELINE_STAGE(ROUTER, IN, POLICY, 11,
"lr_in_policy") \
- PIPELINE_STAGE(ROUTER, IN, ARP_RESOLVE, 12,
"lr_in_arp_resolve") \
- PIPELINE_STAGE(ROUTER, IN, CHK_PKT_LEN , 13,
"lr_in_chk_pkt_len") \
- PIPELINE_STAGE(ROUTER, IN, LARGER_PKTS,
14,"lr_in_larger_pkts") \
- PIPELINE_STAGE(ROUTER, IN, GW_REDIRECT, 15,
"lr_in_gw_redirect") \
- PIPELINE_STAGE(ROUTER, IN, ARP_REQUEST, 16,
"lr_in_arp_request") \
+ PIPELINE_STAGE(ROUTER, IN, ECMP_STATEFUL, 7,
"lr_in_ecmp_stateful") \
+ PIPELINE_STAGE(ROUTER, IN, ND_RA_OPTIONS, 8,
"lr_in_nd_ra_options") \
+ PIPELINE_STAGE(ROUTER, IN, ND_RA_RESPONSE, 9,
"lr_in_nd_ra_response") \
+ PIPELINE_STAGE(ROUTER, IN, IP_ROUTING, 10,
"lr_in_ip_routing") \
+ PIPELINE_STAGE(ROUTER, IN, IP_ROUTING_ECMP, 11,
"lr_in_ip_routing_ecmp") \
+ PIPELINE_STAGE(ROUTER, IN, POLICY, 12,
"lr_in_policy") \
+ PIPELINE_STAGE(ROUTER, IN, ARP_RESOLVE, 13,
"lr_in_arp_resolve") \
+ PIPELINE_STAGE(ROUTER, IN, CHK_PKT_LEN , 14,
"lr_in_chk_pkt_len") \
+ PIPELINE_STAGE(ROUTER, IN, LARGER_PKTS,
15,"lr_in_larger_pkts") \
+ PIPELINE_STAGE(ROUTER, IN, GW_REDIRECT, 16,
"lr_in_gw_redirect") \
+ PIPELINE_STAGE(ROUTER, IN, ARP_REQUEST, 17,
"lr_in_arp_request") \
\
/* Logical router egress stages. */
\
PIPELINE_STAGE(ROUTER, OUT, UNDNAT, 0, "lr_out_undnat")
\
@@ -7312,6 +7313,7 @@ struct parsed_route {
bool is_src_route;
uint32_t hash;
const struct nbrec_logical_router_static_route *route;
+ bool ecmp_symmetric_reply;
};
static uint32_t
@@ -7373,6 +7375,8 @@ parsed_routes_add(struct ovs_list *routes,
"src-ip"));
pr->hash = route_hash(pr);
pr->route = route;
+ pr->ecmp_symmetric_reply = smap_get_bool(&route->options,
+
"ecmp_symmetric_reply", false);
ovs_list_insert(routes, &pr->list_node);
return pr;
}
@@ -7621,18 +7625,95 @@ find_static_route_outport(struct
ovn_datapath *od, struct hmap *ports,
return true;
}
+static void
+add_ecmp_symmetric_reply_flows(struct hmap *lflows,
+ struct ovn_datapath *od,
+ const char *port_ip,
+ struct ovn_port *out_port,
+ const struct parsed_route *route,
+ struct ds *route_match)
+{
+ const struct nbrec_logical_router_static_route *st_route =
route->route;
+ struct ds match = DS_EMPTY_INITIALIZER;
+ struct ds actions = DS_EMPTY_INITIALIZER;
+ struct ds ecmp_reply = DS_EMPTY_INITIALIZER;
+ char *cidr = normalize_v46_prefix(&route->prefix, route->plen);
+
+ /* If symmetric ECMP replies are enabled, then packets that
arrive over
+ * an ECMP route need to go through conntrack.
+ */
+ ds_put_format(&match, "inport == %s && ip%s.%s == %s",
+ out_port->json_key,
+ route->prefix.family == AF_INET ? "4" : "6",
+ route->is_src_route ? "dst" : "src",
+ cidr);
+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100,
+ ds_cstr(&match), "ct_next;",
+ &st_route->header_);
+
+ /* And packets that go out over an ECMP route need conntrack */
+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100,
+ ds_cstr(route_match), "ct_next;",
+ &st_route->header_);
+
+ /* Save src eth and inport in ct_label for packets that arrive over
+ * an ECMP route.
+ *
+ * NOTE: we purposely are not clearing match before this
+ * ds_put_cstr() call. The previous contents are needed.
+ */
+ ds_put_cstr(&match, " && (ct.new && !ct.est)");
+
+ ds_put_format(&actions, "ct_commit { ct_label.ecmp_reply_eth =
eth.src;"
+ " ct_label.ecmp_reply_port = %" PRId64 ";}; next;",
+ out_port->sb->tunnel_key);
+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
+ ds_cstr(&match), ds_cstr(&actions),
+ &st_route->header_);
+
+ /* Bypass ECMP selection if we already have ct_label information
+ * for where to route the packet.
+ */
+ ds_put_format(&ecmp_reply, "ct.rpl && ct_label.ecmp_reply_port
== %"
+ PRId64, out_port->sb->tunnel_key);
+ ds_clear(&match);
+ ds_put_format(&match, "%s && %s", ds_cstr(&ecmp_reply),
+ ds_cstr(route_match));
+ ds_clear(&actions);
+ ds_put_format(&actions, "ip.ttl--; flags.loopback = 1; "
+ "eth.src = %s; %sreg1 = %s; outport = %s; next;",
+ out_port->lrp_networks.ea_s,
+ route->prefix.family == AF_INET ? "" : "xx",
+ port_ip, out_port->json_key);
+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_IP_ROUTING, 100,
+ ds_cstr(&match), ds_cstr(&actions),
+ &st_route->header_);
+
+ /* Egress reply traffic for symmetric ECMP routes skips router
policies. */
+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_POLICY, 65535,
+ ds_cstr(&ecmp_reply), "next;",
+ &st_route->header_);
+
+ ds_clear(&actions);
+ ds_put_cstr(&actions, "eth.dst = ct_label.ecmp_reply_eth; next;");
+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ARP_RESOLVE,
+ 200, ds_cstr(&ecmp_reply),
+ ds_cstr(&actions), &st_route->header_);
+}
+
static void
build_ecmp_route_flow(struct hmap *lflows, struct ovn_datapath *od,
struct hmap *ports, struct ecmp_groups_node *eg)
{
bool is_ipv4 = (eg->prefix.family == AF_INET);
- struct ds match = DS_EMPTY_INITIALIZER;
uint16_t priority;
+ struct ecmp_route_list_node *er;
+ struct ds route_match = DS_EMPTY_INITIALIZER;
char *prefix_s = build_route_prefix_s(&eg->prefix, eg->plen);
build_route_match(NULL, prefix_s, eg->plen, eg->is_src_route,
is_ipv4,
- &match, &priority);
+ &route_match, &priority);
free(prefix_s);
struct ds actions = DS_EMPTY_INITIALIZER;
@@ -7640,7 +7721,6 @@ build_ecmp_route_flow(struct hmap *lflows,
struct ovn_datapath *od,
"; %s = select(", REG_ECMP_GROUP_ID, eg->id,
REG_ECMP_MEMBER_ID);
- struct ecmp_route_list_node *er;
bool is_first = true;
LIST_FOR_EACH (er, list_node, &eg->route_list) {
if (is_first) {
@@ -7654,11 +7734,12 @@ build_ecmp_route_flow(struct hmap *lflows,
struct ovn_datapath *od,
ds_put_cstr(&actions, ");");
ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, priority,
- ds_cstr(&match), ds_cstr(&actions));
+ ds_cstr(&route_match), ds_cstr(&actions));
/* Add per member flow */
+ struct ds match = DS_EMPTY_INITIALIZER;
+ struct sset visited_ports = SSET_INITIALIZER(&visited_ports);
LIST_FOR_EACH (er, list_node, &eg->route_list) {
-
const struct parsed_route *route_ = er->route;
const struct nbrec_logical_router_static_route *route =
route_->route;
/* Find the outgoing port. */
@@ -7668,6 +7749,15 @@ build_ecmp_route_flow(struct hmap *lflows,
struct ovn_datapath *od,
&out_port)) {
continue;
}
+ /* Symmetric ECMP reply is only usable on gateway routers.
+ * It is NOT usable on distributed routers with a gateway port.
+ */
+ if (smap_get(&od->nbr->options, "chassis") &&
+ route_->ecmp_symmetric_reply && sset_add(&visited_ports,
+ out_port->key)) {
+ add_ecmp_symmetric_reply_flows(lflows, od, lrp_addr_s,
out_port,
+ route_, &route_match);
+ }
ds_clear(&match);
ds_put_format(&match, REG_ECMP_GROUP_ID" == %"PRIu16" && "
REG_ECMP_MEMBER_ID" == %"PRIu16,
@@ -7688,7 +7778,9 @@ build_ecmp_route_flow(struct hmap *lflows,
struct ovn_datapath *od,
ds_cstr(&match), ds_cstr(&actions),
&route->header_);
}
+ sset_destroy(&visited_ports);
ds_destroy(&match);
+ ds_destroy(&route_match);
ds_destroy(&actions);
}
@@ -8972,6 +9064,7 @@ build_lrouter_flows(struct hmap *datapaths,
struct hmap *ports,
ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 0, "1", "next;");
ovn_lflow_add(lflows, od, S_ROUTER_OUT_UNDNAT, 0, "1",
"next;");
ovn_lflow_add(lflows, od, S_ROUTER_OUT_EGR_LOOP, 0, "1",
"next;");
+ ovn_lflow_add(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 0,
"1", "next;");
/* Send the IPv6 NS packets to next table. When ovn-controller
* generates IPv6 NS (for the action - nd_ns{}), the injected
diff --git a/ovn-architecture.7.xml b/ovn-architecture.7.xml
index 246cebc19..b1a462933 100644
--- a/ovn-architecture.7.xml
+++ b/ovn-architecture.7.xml
@@ -1210,11 +1210,12 @@
<dd>
Fields that denote the connection tracking zones for
routers. These
values only have local significance and are not meaningful
between
- chassis. OVN stores the zone information for DNATting in
Open vSwitch
+ chassis. OVN stores the zone information for north to south
traffic
+ (for DNATting or ECMP symmetric replies) in Open vSwitch
<!-- Keep the following in sync with MFF_LOG_DNAT_ZONE and
MFF_LOG_SNAT_ZONE in ovn/lib/logical-fields.h. -->
- extension register number 11 and zone information for SNATing in
- Open vSwitch extension register number 12.
+ extension register number 11 and zone information for south
to north
+ traffic (for SNATing) in Open vSwitch extension register
number 12.
</dd>
<dt>logical flow flags</dt>
diff --git a/ovn-nb.ovsschema b/ovn-nb.ovsschema
index da9af7157..16f7794f2 100644
--- a/ovn-nb.ovsschema
+++ b/ovn-nb.ovsschema
@@ -1,7 +1,7 @@
{
"name": "OVN_Northbound",
"version": "5.24.0",
- "cksum": "1092394564 25961",
+ "cksum": "679745602 26116",
"tables": {
"NB_Global": {
"columns": {
@@ -365,6 +365,9 @@
"min": 0, "max": 1}},
"nexthop": {"type": "string"},
"output_port": {"type": {"key": "string", "min":
0, "max": 1}},
+ "options": {
+ "type": {"key": "string", "value": "string",
+ "min": 0, "max": "unlimited"}},
"external_ids": {
"type": {"key": "string", "value": "string",
"min": 0, "max": "unlimited"}}},
diff --git a/ovn-nb.xml b/ovn-nb.xml
index db5908cd5..5e434d257 100644
--- a/ovn-nb.xml
+++ b/ovn-nb.xml
@@ -2481,6 +2481,22 @@
</column>
</group>
+ <group title="Common options">
+ <column name="options">
+ This column provides general key/value settings. The supported
+ options are described individually below.
+ </column>
+
+ <column name="options" key="ecmp_symmetric_reply">
+ It true, then new traffic that arrives over this route will
have
+ its reply traffic bypass ECMP route selection and will be
sent out
+ this route instead. Note that this option overrides any
rules set
+ in the <ref table="Logical_Router_policy" /> table. This option
+ only works on gateway routers (routers that have
+ <ref column="options" key="chassis" table="Logical_Router"
/> set).
+ </column>
+ </group>
+
</table>
<table name="Logical_Router_Policy" title="Logical router policies">
diff --git a/tests/ovn.at <http://ovn.at> b/tests/ovn.at <http://ovn.at>
index f8dde14c2..c1ab6b85f 100644
--- a/tests/ovn.at <http://ovn.at>
+++ b/tests/ovn.at <http://ovn.at>
@@ -195,6 +195,8 @@ ct.snat = ct_state[6]
ct.trk = ct_state[5]
ct_label = NXM_NX_CT_LABEL
ct_label.blocked = ct_label[0]
+ct_label.ecmp_reply_eth = ct_label[0..47]
+ct_label.ecmp_reply_port = ct_label[48..63]
ct_mark = NXM_NX_CT_MARK
ct_state = NXM_NX_CT_STATE
]])
@@ -16065,7 +16067,7 @@ ovn-sbctl dump-flows lr0 | grep
lr_in_arp_resolve | grep "reg0 == 10.0.0.10" \
# Since the sw0-vir is not claimed by any chassis, eth.dst should
be set to
# zero if the ip4.dst is the virtual ip in the router pipeline.
AT_CHECK([cat lflows.txt], [0], [dnl
- table=12(lr_in_arp_resolve ), priority=100 , match=(outport ==
"lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst =
00:00:00:00:00:00; next;)
+ table=13(lr_in_arp_resolve ), priority=100 , match=(outport ==
"lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst =
00:00:00:00:00:00; next;)
])
ip_to_hex() {
@@ -16116,7 +16118,7 @@ ovn-sbctl dump-flows lr0 | grep
lr_in_arp_resolve | grep "reg0 == 10.0.0.10" \
# There should be an arp resolve flow to resolve the virtual_ip
with the
# sw0-p1's MAC.
AT_CHECK([cat lflows.txt], [0], [dnl
- table=12(lr_in_arp_resolve ), priority=100 , match=(outport ==
"lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst =
50:54:00:00:00:03; next;)
+ table=13(lr_in_arp_resolve ), priority=100 , match=(outport ==
"lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst =
50:54:00:00:00:03; next;)
])
# Forcibly clear virtual_parent. ovn-controller should release the
binding
@@ -16157,7 +16159,7 @@ ovn-sbctl dump-flows lr0 | grep
lr_in_arp_resolve | grep "reg0 == 10.0.0.10" \
# There should be an arp resolve flow to resolve the virtual_ip
with the
# sw0-p2's MAC.
AT_CHECK([cat lflows.txt], [0], [dnl
- table=12(lr_in_arp_resolve ), priority=100 , match=(outport ==
"lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst =
50:54:00:00:00:05; next;)
+ table=13(lr_in_arp_resolve ), priority=100 , match=(outport ==
"lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst =
50:54:00:00:00:05; next;)
])
# send the garp from sw0-p2 (in hv2). hv2 should claim sw0-vir
@@ -16180,7 +16182,7 @@ ovn-sbctl dump-flows lr0 | grep
lr_in_arp_resolve | grep "reg0 == 10.0.0.10" \
# There should be an arp resolve flow to resolve the virtual_ip
with the
# sw0-p3's MAC.
AT_CHECK([cat lflows.txt], [0], [dnl
- table=12(lr_in_arp_resolve ), priority=100 , match=(outport ==
"lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst =
50:54:00:00:00:04; next;)
+ table=13(lr_in_arp_resolve ), priority=100 , match=(outport ==
"lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst =
50:54:00:00:00:04; next;)
])
# Now send arp reply from sw0-p1. hv1 should claim sw0-vir
@@ -16201,7 +16203,7 @@ ovn-sbctl dump-flows lr0 | grep
lr_in_arp_resolve | grep "reg0 == 10.0.0.10" \
> lflows.txt
AT_CHECK([cat lflows.txt], [0], [dnl
- table=12(lr_in_arp_resolve ), priority=100 , match=(outport ==
"lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst =
50:54:00:00:00:03; next;)
+ table=13(lr_in_arp_resolve ), priority=100 , match=(outport ==
"lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst =
50:54:00:00:00:03; next;)
])
# Delete hv1-vif1 port. hv1 should release sw0-vir
@@ -16219,7 +16221,7 @@ ovn-sbctl dump-flows lr0 | grep
lr_in_arp_resolve | grep "reg0 == 10.0.0.10" \
> lflows.txt
AT_CHECK([cat lflows.txt], [0], [dnl
- table=12(lr_in_arp_resolve ), priority=100 , match=(outport ==
"lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst =
00:00:00:00:00:00; next;)
+ table=13(lr_in_arp_resolve ), priority=100 , match=(outport ==
"lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst =
00:00:00:00:00:00; next;)
])
# Now send arp reply from sw0-p2. hv2 should claim sw0-vir
@@ -16240,7 +16242,7 @@ ovn-sbctl dump-flows lr0 | grep
lr_in_arp_resolve | grep "reg0 == 10.0.0.10" \
> lflows.txt
AT_CHECK([cat lflows.txt], [0], [dnl
- table=12(lr_in_arp_resolve ), priority=100 , match=(outport ==
"lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst =
50:54:00:00:00:04; next;)
+ table=13(lr_in_arp_resolve ), priority=100 , match=(outport ==
"lr0-sw0" && reg0 == 10.0.0.10), action=(eth.dst =
50:54:00:00:00:04; next;)
])
# Delete sw0-p2 logical port
@@ -20274,22 +20276,22 @@ ovn-nbctl set logical_router_policy $pol5
options:pkt_mark=5
ovn-nbctl --wait=hv sync
OVS_WAIT_UNTIL([
- test 1 -eq $(as hv1 ovs-ofctl dump-flows br-int table=19 | \
+ test 1 -eq $(as hv1 ovs-ofctl dump-flows br-int table=20 | \
grep "load:0x64->NXM_NX_PKT_MARK" -c)
])
OVS_WAIT_UNTIL([
- test 1 -eq $(as hv1 ovs-ofctl dump-flows br-int table=19 | \
+ test 1 -eq $(as hv1 ovs-ofctl dump-flows br-int table=20 | \
grep "load:0x3->NXM_NX_PKT_MARK" -c)
])
OVS_WAIT_UNTIL([
- test 1 -eq $(as hv1 ovs-ofctl dump-flows br-int table=19 | \
+ test 1 -eq $(as hv1 ovs-ofctl dump-flows br-int table=20 | \
grep "load:0x4->NXM_NX_PKT_MARK" -c)
])
OVS_WAIT_UNTIL([
- test 1 -eq $(as hv1 ovs-ofctl dump-flows br-int table=19 | \
+ test 1 -eq $(as hv1 ovs-ofctl dump-flows br-int table=20 | \
grep "load:0x5->NXM_NX_PKT_MARK" -c)
])
@@ -20380,12 +20382,12 @@ send_ipv4_pkt hv1 hv1-vif1 505400000003
00000000ff01 \
$(ip_to_hex 10 0 0 3) $(ip_to_hex 172 168 0 120)
OVS_WAIT_UNTIL([
- test 1 -eq $(as hv1 ovs-ofctl dump-flows br-int table=19 | \
+ test 1 -eq $(as hv1 ovs-ofctl dump-flows br-int table=20 | \
grep "load:0x2->NXM_NX_PKT_MARK" -c)
])
AT_CHECK([
- test 0 -eq $(as hv1 ovs-ofctl dump-flows br-int table=19 | \
+ test 0 -eq $(as hv1 ovs-ofctl dump-flows br-int table=20 | \
grep "load:0x64->NXM_NX_PKT_MARK" -c)
])
@@ -20741,3 +20743,126 @@ AT_CHECK([test "$hv2_offlows" =
"$hv2_offlows_mon"])
OVN_CLEANUP([hv1], [hv2])
AT_CLEANUP
+
+AT_SETUP([ovn -- Symmetric ECMP reply flows])
+ovn_start
+
+net_add n1
+sim_add hv1
+as hv1
+ovs-vsctl add-br br-phys
+ovn_attach n1 br-phys 192.168.0.1
+
+sim_add hv2
+as hv2
+ovs-vsctl add-br br-phys
+ovn_attach n1 br-phys 192.168.0.2
+
+# Logical network
+#
+# ls1 \
+# \
+# DR -- join -- GW -- ext
+# /
+# ls2 /
+#
+# ls1 and ls2 are internal switches connected to distributed router
+# DR. DR is then connected via a join switch to gateway router GW.
+# GW is then connected to external switch ext. In real life, this
+# would likely have a localnet port, but for the purposes of this test
+# it is unnecessary.
+
+ovn-nbctl create Logical_Router name=DR
+gw_uuid=$(ovn-nbctl create Logical_Router name=GW)
+
+ovn-nbctl ls-add ls1
+ovn-nbctl ls-add ls2
+ovn-nbctl ls-add join
+ovn-nbctl ls-add ext
+
+# Connect ls1 to DR
+ovn-nbctl lrp-add DR dr-ls1 00:00:01:01:02:03 10.0.0.1/24
<http://10.0.0.1/24>
+ovn-nbctl lsp-add ls1 ls1-dr -- set Logical_Switch_Port ls1-dr \
+ type=router options:router-port=dr-ls1
addresses='"00:00:01:01:02:03"'
+
+# Connect ls2 to DR
+ovn-nbctl lrp-add DR dr-ls2 00:00:01:01:02:04 10.0.0.2/24
<http://10.0.0.2/24>
+ovn-nbctl lsp-add ls2 ls2-dr -- set Logical_Switch_Port ls2-dr \
+ type=router options:router-port=dr-ls2
addresses='"00:00:01:01:02:04"'
+
+# Connect join to DR
+ovn-nbctl lrp-add DR dr-join 00:00:02:01:02:03 20.0.0.1/24
<http://20.0.0.1/24>
+ovn-nbctl lsp-add join join-dr -- set Logical_Switch_Port join-dr \
+ type=router options:router-port=dr-join
addresses='"00:00:02:01:02:03"'
+
+# Connect join to GW
+ovn-nbctl lrp-add GW gw-join 00:00:02:01:02:04 20.0.0.2/24
<http://20.0.0.2/24>
+ovn-nbctl lsp-add join join-gw -- set Logical_Switch_Port join-gw \
+ type=router options:router-port=gw-join
addresses='"00:00:02:01:02:04"'
+
+# Connect ext to GW
+ovn-nbctl lrp-add GW gw-ext 00:00:03:01:02:03 172.16.0.1/16
<http://172.16.0.1/16>
+ovn-nbctl lsp-add ext ext-gw -- set Logical_Switch_Port ext-gw \
+ type=router options:router-port=gw-ext
addresses='"00:00:03:01:02:03"'
+
+ovn-nbctl lr-route-add GW 10.0.0.0/24 <http://10.0.0.0/24> 20.0.0.1
+ovn-nbctl --policy="src-ip" lr-route-add DR 10.0.0.0/24
<http://10.0.0.0/24> 20.0.0.2
+
+# Now add some ECMP routes to the GW router.
+ovn-nbctl --ecmp-symmetric-reply --policy="src-ip" lr-route-add GW
10.0.0.0/24 <http://10.0.0.0/24> 172.16.0.2
+ovn-nbctl --ecmp-symmetric-reply --policy="src-ip" lr-route-add GW
10.0.0.0/24 <http://10.0.0.0/24> 172.16.0.3
+
+ovn-nbctl --wait=hv sync
+
+# Ensure ECMP symmetric reply flows are not present on any hypervisor.
+AT_CHECK([
+ test 0 -eq $(as hv1 ovs-ofctl dump-flows br-int table=15 | \
+ grep "priority=100" | \
+ grep
"ct(commit,zone=NXM_NX_REG11\\[[0..15\\]],exec(move:NXM_OF_ETH_SRC\\[[\\]]->NXM_NX_CT_LABEL\\[[32..79\\]],load:0x[[0-9]]->NXM_NX_CT_LABEL\\[[80..95\\]]))"
-c)
+])
+AT_CHECK([
+ test 0 -eq $(as hv1 ovs-ofctl dump-flows br-int table=21 | \
+ grep "priority=200" | \
+ grep
"actions=move:NXM_NX_CT_LABEL\\[[32..79\\]]->NXM_OF_ETH_DST\\[[\\]]" -c)
+])
+
+AT_CHECK([
+ test 0 -eq $(as hv2 ovs-ofctl dump-flows br-int table=15 | \
+ grep "priority=100" | \
+ grep
"ct(commit,zone=NXM_NX_REG11\\[[0..15\\]],exec(move:NXM_OF_ETH_SRC\\[[\\]]->NXM_NX_CT_LABEL\\[[32..79\\]],load:0x[[0-9]]->NXM_NX_CT_LABEL\\[[80..95\\]]))"
-c)
+])
+AT_CHECK([
+ test 0 -eq $(as hv2 ovs-ofctl dump-flows br-int table=21 | \
+ grep "priority=200" | \
+ grep
"actions=move:NXM_NX_CT_LABEL\\[[32..79\\]]->NXM_OF_ETH_DST\\[[\\]]" -c)
+])
+
+# Now make GW a gateway router on hv1
+ovn-nbctl set Logical_Router $gw_uuid options:chassis=hv1
+ovn-nbctl --wait=hv sync
+
+# And ensure that ECMP symmetric reply flows are present only on hv1
+AT_CHECK([
+ test 1 -eq $(as hv1 ovs-ofctl dump-flows br-int table=15 | \
+ grep "priority=100" | \
+ grep
"ct(commit,zone=NXM_NX_REG11\\[[0..15\\]],exec(move:NXM_OF_ETH_SRC\\[[\\]]->NXM_NX_CT_LABEL\\[[32..79\\]],load:0x[[0-9]]->NXM_NX_CT_LABEL\\[[80..95\\]]))"
-c)
+])
+AT_CHECK([
+ test 1 -eq $(as hv1 ovs-ofctl dump-flows br-int table=21 | \
+ grep "priority=200" | \
+ grep
"actions=move:NXM_NX_CT_LABEL\\[[32..79\\]]->NXM_OF_ETH_DST\\[[\\]]" -c)
+])
+
+AT_CHECK([
+ test 0 -eq $(as hv2 ovs-ofctl dump-flows br-int table=15 | \
+ grep "priority=100" | \
+ grep
"ct(commit,zone=NXM_NX_REG11\\[[0..15\\]],exec(move:NXM_OF_ETH_SRC\\[[\\]]->NXM_NX_CT_LABEL\\[[32..79\\]],load:0x[[0-9]]->NXM_NX_CT_LABEL\\[[80..95\\]]))"
-c)
+])
+AT_CHECK([
+ test 0 -eq $(as hv2 ovs-ofctl dump-flows br-int table=21 | \
+ grep "priority=200" | \
+ grep
"actions=move:NXM_NX_CT_LABEL\\[[32..79\\]]->NXM_OF_ETH_DST\\[[\\]]" -c)
+])
+
+OVN_CLEANUP([hv1], [hv2])
+AT_CLEANUP
diff --git a/tests/system-ovn.at <http://system-ovn.at>
b/tests/system-ovn.at <http://system-ovn.at>
index eddc530f9..e239b7394 100644
--- a/tests/system-ovn.at <http://system-ovn.at>
+++ b/tests/system-ovn.at <http://system-ovn.at>
@@ -4483,3 +4483,147 @@ OVS_TRAFFIC_VSWITCHD_STOP(["/failed to query
port patch-.*/d
/connection dropped.*/d"])
AT_CLEANUP
+
+AT_SETUP([ovn -- ECMP symmetric reply])
+AT_KEYWORDS([ecmp])
+
+CHECK_CONNTRACK()
+ovn_start
+
+OVS_TRAFFIC_VSWITCHD_START()
+ADD_BR([br-int])
+
+# Set external-ids in br-int needed for ovn-controller
+ovs-vsctl \
+ -- set Open_vSwitch . external-ids:system-id=hv1 \
+ -- set Open_vSwitch .
external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
+ -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
+ -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
+ -- set bridge br-int fail-mode=secure
other-config:disable-in-band=true
+
+# Start ovn-controller
+start_daemon ovn-controller
+
+# Logical network:
+# Alice is connected to gateway router R1. R1 is connected to two
"external"
+# routers, R2 and R3 via an "ext" switch.
+# Bob is connected to both R2 and R3. R1 contains two ECMP routes,
one through R2
+# and one through R3, to Bob.
+#
+# alice -- R1 -- ext ---- R2
+# | \
+# | bob
+# | /
+# + ----- R3
+#
+# For this test, Bob sends request traffic through R2 to Alice. We
want to ensure that
+# all response traffic from Alice is routed through R2 as well.
+
+ovn-nbctl create Logical_Router name=R1 options:chassis=hv1
+ovn-nbctl create Logical_Router name=R2
+ovn-nbctl create Logical_Router name=R3
+
+ovn-nbctl ls-add alice
+ovn-nbctl ls-add bob
+ovn-nbctl ls-add ext
+
+# connect alice to R1
+ovn-nbctl lrp-add R1 alice 00:00:01:01:02:03 10.0.0.1/24
<http://10.0.0.1/24>
+ovn-nbctl lsp-add alice rp-alice -- set Logical_Switch_Port rp-alice \
+ type=router options:router-port=alice
addresses='"00:00:01:01:02:03"'
+
+# connect bob to R2
+ovn-nbctl lrp-add R2 R2_bob 00:00:02:01:02:03 172.16.0.2/16
<http://172.16.0.2/16>
+ovn-nbctl lsp-add bob rp2-bob -- set Logical_Switch_Port rp2-bob \
+ type=router options:router-port=R2_bob
addresses='"00:00:02:01:02:03"'
+
+# connect bob to R3
+ovn-nbctl lrp-add R3 R3_bob 00:00:02:01:02:04 172.16.0.3/16
<http://172.16.0.3/16>
+ovn-nbctl lsp-add bob rp3-bob -- set Logical_Switch_Port rp3-bob \
+ type=router options:router-port=R3_bob
addresses='"00:00:02:01:02:04"'
+
+# Connect R1 to ext
+ovn-nbctl lrp-add R1 R1_ext 00:00:04:01:02:03 20.0.0.1/24
<http://20.0.0.1/24>
+ovn-nbctl lsp-add ext r1-ext -- set Logical_Switch_Port r1-ext \
+ type=router options:router-port=R1_ext
addresses='"00:00:04:01:02:03"'
+
+# Connect R2 to ext
+ovn-nbctl lrp-add R2 R2_ext 00:00:04:01:02:04 20.0.0.2/24
<http://20.0.0.2/24>
+ovn-nbctl lsp-add ext r2-ext -- set Logical_Switch_Port r2-ext \
+ type=router options:router-port=R2_ext
addresses='"00:00:04:01:02:04"'
+
+# Connect R3 to ext
+ovn-nbctl lrp-add R3 R3_ext 00:00:04:01:02:05 20.0.0.3/24
<http://20.0.0.3/24>
+ovn-nbctl lsp-add ext r3-ext -- set Logical_Switch_Port r3-ext \
+ type=router options:router-port=R3_ext
addresses='"00:00:04:01:02:05"'
+
+# Install ECMP routes for alice.
+ovn-nbctl --ecmp-symmetric-reply --policy="src-ip" lr-route-add R1
10.0.0.0/24 <http://10.0.0.0/24> 20.0.0.2
+ovn-nbctl --ecmp-symmetric-reply --policy="src-ip" lr-route-add R1
10.0.0.0/24 <http://10.0.0.0/24> 20.0.0.3
+
+# Static Routes
+ovn-nbctl lr-route-add R2 10.0.0.0/24 <http://10.0.0.0/24> 20.0.0.1
+ovn-nbctl lr-route-add R3 10.0.0.0/24 <http://10.0.0.0/24> 20.0.0.1
+
+# Logical port 'alice1' in switch 'alice'.
+ADD_NAMESPACES(alice1)
+ADD_VETH(alice1, alice1, br-int, "10.0.0.2/24
<http://10.0.0.2/24>", "f0:00:00:01:02:04", \
+ "10.0.0.1")
+ovn-nbctl lsp-add alice alice1 \
+-- lsp-set-addresses alice1 "f0:00:00:01:02:04 10.0.0.2"
+
+# Logical port 'bob1' in switch 'bob'.
+ADD_NAMESPACES(bob1)
+ADD_VETH(bob1, bob1, br-int, "172.16.0.1/16
<http://172.16.0.1/16>", "f0:00:00:01:02:06", \
+ "172.16.0.2")
+ovn-nbctl lsp-add bob bob1 \
+-- lsp-set-addresses bob1 "f0:00:00:01:02:06 172.16.0.1"
+
+# Ensure ovn-controller is caught up
+ovn-nbctl --wait=hv sync
+
+on_exit 'ovs-ofctl dump-flows br-int'
+
+# 'bob1' should be able to ping 'alice1' directly.
+NS_CHECK_EXEC([bob1], [ping -q -c 20 -i 0.3 -w 15 10.0.0.2 |
FORMAT_PING], \
+[0], [dnl
+20 packets transmitted, 20 received, 0% packet loss, time 0ms
+])
+
+# Ensure conntrack entry is present. We should not try to predict
+# the tunnel key for the output port, so we strip it from the labels
+# and just ensure that the known ethernet address is present.
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.1) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
+sed -e
's/labels=0x[[0-9a-f]]*00000401020400000000/labels=0x00000401020400000000/'],
[0], [dnl
+icmp,orig=(src=172.16.0.1,dst=10.0.0.2,id=<cleared>,type=8,code=0),reply=(src=10.0.0.2,dst=172.16.0.1,id=<cleared>,type=0,code=0),zone=<cleared>,labels=0x00000401020400000000
+])
+
+# Ensure datapaths show conntrack states as expected
+# Like with conntrack entries, we shouldn't try to predict
+# port binding tunnel keys. So omit them from expected labels.
+AT_CHECK([ovs-appctl dpctl/dump-flows | grep
'ct_state(+new-est-rpl+trk).*ct(.*label=0x.*00000401020400000000/0xffffffffffffffff00000000)'
-c], [0], [dnl
+1
+])
+AT_CHECK([ovs-appctl dpctl/dump-flows | grep
'ct_state(-new+est+rpl+trk).*ct_label(0x.*00000401020400000000/0xffffffffffffffff00000000)'
-c], [0], [dnl
+1
+])
+
+ovs-ofctl dump-flows br-int
+
+OVS_APP_EXIT_AND_WAIT([ovn-controller])
+
+as ovn-sb
+OVS_APP_EXIT_AND_WAIT([ovsdb-server])
+
+as ovn-nb
+OVS_APP_EXIT_AND_WAIT([ovsdb-server])
+
+as northd
+OVS_APP_EXIT_AND_WAIT([ovn-northd])
+
+as
+OVS_TRAFFIC_VSWITCHD_STOP(["/failed to query port patch-.*/d
+/connection dropped.*/d"])
+
+AT_CLEANUP
diff --git a/utilities/ovn-nbctl.8.xml b/utilities/ovn-nbctl.8.xml
index de86b70e6..18bf90e08 100644
--- a/utilities/ovn-nbctl.8.xml
+++ b/utilities/ovn-nbctl.8.xml
@@ -658,7 +658,8 @@
<dl>
<dt>[<code>--may-exist</code>]
[<code>--policy</code>=<var>POLICY</var>]
- [<code>--ecmp</code>] <code>lr-route-add</code>
<var>router</var>
+ [<code>--ecmp</code>] [<code>--ecmp-symmetric-reply</code>]
+ <code>lr-route-add</code> <var>router</var>
<var>prefix</var> <var>nexthop</var> [<var>port</var>]</dt>
<dd>
<p>
@@ -680,15 +681,31 @@
specified, the default is "dst-ip".
</p>
+ <p>
+ The <code>--ecmp</code> option allows for multiple routes
with the
+ same <var>prefix</var> <var>POLICY</var> but different
+ <var>nexthop</var> and <var>port</var> to be added.
+ </p>
+
+ <p>
+ The <code>--ecmp-symmetric-reply</code> option makes it
so that
+ traffic that arrives over an ECMP route will have its
reply traffic
+ sent out over that same route. Setting
+ <code>--ecmp-symmetric-reply</code> implies
<code>--ecmp</code> so
+ it is not necessary to set both.
+ </p>
+
<p>
It is an error if a route with <var>prefix</var> and
- <var>POLICY</var> already exists, unless
<code>--may-exist</code> or
- <code>--ecmp</code> is specified. If
<code>--may-exist</code> is
- specified but not <code>--ecmp</code>, the existed route
will be
- updated with the new nexthop and port. If
<code>--ecmp</code> is
+ <var>POLICY</var> already exists, unless
<code>--may-exist</code>,
+ <code>--ecmp</code>, or
<code>--ecmp-symmetric-reply</code> is
+ specified. If <code>--may-exist</code> is specified but not
+ <code>--ecmp</code> or
<code>--ecmp-symmetric-reply</code>, the
+ existed route will be updated with the new nexthop and
port. If
+ <code>--ecmp</code> or <code>--ecmp-symmetric-reply</code> is
specified, a new route will be added, regardless of the
existed
- route, which is useful when adding ECMP routes, i.e.
routes with same
- <var>POLICY</var> and <var>prefix</var> but different
+ route., which is useful when adding ECMP routes, i.e.
routes with
+ same <var>POLICY</var> and <var>prefix</var> but different
<var>nexthop</var> and <var>port</var>.
</p>
</dd>
diff --git a/utilities/ovn-nbctl.c b/utilities/ovn-nbctl.c
index 0079ad5a6..e6d8dbe63 100644
--- a/utilities/ovn-nbctl.c
+++ b/utilities/ovn-nbctl.c
@@ -687,7 +687,8 @@ Logical router port commands:\n\
('overlay' or 'bridged')\n\
\n\
Route commands:\n\
- [--policy=POLICY] [--ecmp] lr-route-add ROUTER PREFIX NEXTHOP
[PORT]\n\
+ [--policy=POLICY] [--ecmp] [--ecmp-symmetric-reply] lr-route-add
ROUTER \n\
+ PREFIX NEXTHOP [PORT]\n\
add a route to ROUTER\n\
[--policy=POLICY] lr-route-del ROUTER [PREFIX [NEXTHOP [PORT]]]\n\
remove routes from ROUTER\n\
@@ -3855,7 +3856,10 @@ nbctl_lr_route_add(struct ctl_context *ctx)
}
bool may_exist = shash_find(&ctx->options, "--may-exist") != NULL;
- bool ecmp = shash_find(&ctx->options, "--ecmp") != NULL;
+ bool ecmp_symmetric_reply = shash_find(&ctx->options,
+
"--ecmp-symmetric-reply") != NULL;
+ bool ecmp = shash_find(&ctx->options, "--ecmp") != NULL ||
+ ecmp_symmetric_reply;
if (!ecmp) {
for (int i = 0; i < lr->n_static_routes; i++) {
const struct nbrec_logical_router_static_route *route
@@ -3920,6 +3924,13 @@ nbctl_lr_route_add(struct ctl_context *ctx)
nbrec_logical_router_static_route_set_policy(route, policy);
}
+ if (ecmp_symmetric_reply) {
+ const struct smap options = SMAP_CONST1(&options,
+ "ecmp_symmetric_reply",
+ "true");
+ nbrec_logical_router_static_route_set_options(route, &options);
+ }
+
nbrec_logical_router_verify_static_routes(lr);
struct nbrec_logical_router_static_route **new_routes
= xmalloc(sizeof *new_routes * (lr->n_static_routes + 1));
@@ -6361,7 +6372,8 @@ static const struct ctl_command_syntax
nbctl_commands[] = {
/* logical router route commands. */
{ "lr-route-add", 3, 4, "ROUTER PREFIX NEXTHOP [PORT]", NULL,
- nbctl_lr_route_add, NULL, "--may-exist,--ecmp,--policy=", RW },
+ nbctl_lr_route_add, NULL,
"--may-exist,--ecmp,--ecmp-symmetric-reply,"
+ "--policy=", RW },
{ "lr-route-del", 1, 4, "ROUTER [PREFIX [NEXTHOP [PORT]]]", NULL,
nbctl_lr_route_del, NULL, "--if-exists,--policy=", RW },
{ "lr-route-list", 1, 1, "ROUTER", NULL, nbctl_lr_route_list,
NULL,
--
2.25.4
_______________________________________________
dev mailing list
d...@openvswitch.org <mailto:d...@openvswitch.org>
https://mail.openvswitch.org/mailman/listinfo/ovs-dev