IP multicast relay didn't work properly if traffic had to be forwarded across a distributed gateway port in the router pipeline. That is because the multicast_group used as the output logical port is expanded in the egress pipeline, well after 'lr_in_gw_redirect', the stage where unicast traffic would normally be forwarded to the chassis-redirect port.
In order to achieve the same behavior for IP multicast routed traffic we now store the chassis-redirect port binding in the multicast_group on which IP multicast is routed. On the remote hypervisor, to make sure traffic is delivered to the correct destination switch pipeline, we make sure that ovn-controller translates chassis-redirect ports from multicast groups to the logical patch ports they were created from. This patch also adds a test to simulate the ovn-kubernetes IP multicast use case (where this issue was first observed). Fixes: 5d1527b11e94 ("ovn-northd: Add IGMP Relay support") Reported-by: Alexander Constantinescu <acons...@redhat.com> Reported-at: https://bugzilla.redhat.com/2006306 Signed-off-by: Dumitru Ceara <dce...@redhat.com> --- controller/physical.c | 28 ++++- northd/lrouter.dl | 14 ++- northd/multicast.dl | 23 +++- northd/northd.c | 7 +- northd/ovn_northd.dl | 12 +- tests/ovn.at | 248 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 314 insertions(+), 18 deletions(-) diff --git a/controller/physical.c b/controller/physical.c index ffb9f9952..0cfb158c8 100644 --- a/controller/physical.c +++ b/controller/physical.c @@ -1373,7 +1373,8 @@ out: } static void -consider_mc_group(enum mf_field_id mff_ovn_geneve, +consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name, + enum mf_field_id mff_ovn_geneve, const struct simap *ct_zones, const struct hmap *local_datapaths, struct shash *local_bindings, @@ -1406,6 +1407,10 @@ consider_mc_group(enum mf_field_id mff_ovn_geneve, * instead. (If we put them in 'ofpacts', then the output * would happen on every hypervisor in the multicast group, * effectively duplicating the packet.) + * + * - For chassisredirect ports, add actions to 'ofpacts' to + * set the output port to be the router patch port for which + * the redirect port was added. 
*/ struct ofpbuf ofpacts; ofpbuf_init(&ofpacts, 0); @@ -1440,6 +1445,21 @@ consider_mc_group(enum mf_field_id mff_ovn_geneve, && port->chassis == chassis)) { put_load(port->tunnel_key, MFF_LOG_OUTPORT, 0, 32, &ofpacts); put_resubmit(OFTABLE_CHECK_LOOPBACK, &ofpacts); + } else if (!strcmp(port->type, "chassisredirect") + && port->chassis == chassis) { + const char *distributed_port = smap_get(&port->options, + "distributed-port"); + if (distributed_port) { + const struct sbrec_port_binding *distributed_binding + = lport_lookup_by_name(sbrec_port_binding_by_name, + distributed_port); + if (distributed_binding + && port->datapath == distributed_binding->datapath) { + put_load(distributed_binding->tunnel_key, MFF_LOG_OUTPORT, + 0, 32, &ofpacts); + put_resubmit(OFTABLE_CHECK_LOOPBACK, &ofpacts); + } + } } else if (port->chassis && !get_localnet_port( local_datapaths, mc->datapath->tunnel_key)) { /* Add remote chassis only when localnet port not exist, @@ -1574,7 +1594,8 @@ physical_handle_mc_group_changes(struct physical_ctx *p_ctx, if (!sbrec_multicast_group_is_new(mc)) { ofctrl_remove_flows(flow_table, &mc->header_.uuid); } - consider_mc_group(p_ctx->mff_ovn_geneve, p_ctx->ct_zones, + consider_mc_group(p_ctx->sbrec_port_binding_by_name, + p_ctx->mff_ovn_geneve, p_ctx->ct_zones, p_ctx->local_datapaths, p_ctx->local_bindings, p_ctx->patch_ofports, p_ctx->chassis, mc, @@ -1617,7 +1638,8 @@ physical_run(struct physical_ctx *p_ctx, /* Handle output to multicast groups, in tables 32 and 33. 
*/ const struct sbrec_multicast_group *mc; SBREC_MULTICAST_GROUP_TABLE_FOR_EACH (mc, p_ctx->mc_group_table) { - consider_mc_group(p_ctx->mff_ovn_geneve, p_ctx->ct_zones, + consider_mc_group(p_ctx->sbrec_port_binding_by_name, + p_ctx->mff_ovn_geneve, p_ctx->ct_zones, p_ctx->local_datapaths, p_ctx->local_bindings, p_ctx->patch_ofports, p_ctx->chassis, mc, p_ctx->chassis_tunnels, diff --git a/northd/lrouter.dl b/northd/lrouter.dl index 3029ba67d..0e4308eb5 100644 --- a/northd/lrouter.dl +++ b/northd/lrouter.dl @@ -145,8 +145,10 @@ Warning[message] :- * * A logical router can have multiple distributed gateway ports. */ relation DistributedGatewayPort(lrp: Intern<nb::Logical_Router_Port>, - lr_uuid: uuid) -DistributedGatewayPort(lrp, lr_uuid) :- + lr_uuid: uuid, cr_lrp_uuid: uuid) + +// lrp._uuid is already in use; generate a new UUID by hashing it. +DistributedGatewayPort(lrp, lr_uuid, hash128(lrp_uuid)) :- lr in nb::Logical_Router(._uuid = lr_uuid), LogicalRouterPort(lrp_uuid, lr._uuid), lrp in &nb::Logical_Router_Port(._uuid = lrp_uuid), @@ -237,10 +239,10 @@ DistributedGatewayPortHAChassisGroup(lrp, /* For each router port, tracks whether it's a redirect port of its router */ relation RouterPortIsRedirect(lrp: uuid, is_redirect: bool) -RouterPortIsRedirect(lrp, true) :- DistributedGatewayPort(&nb::Logical_Router_Port{._uuid = lrp}, _). +RouterPortIsRedirect(lrp, true) :- DistributedGatewayPort(&nb::Logical_Router_Port{._uuid = lrp}, _, _). RouterPortIsRedirect(lrp, false) :- &nb::Logical_Router_Port(._uuid = lrp), - not DistributedGatewayPort(&nb::Logical_Router_Port{._uuid = lrp}, _). + not DistributedGatewayPort(&nb::Logical_Router_Port{._uuid = lrp}, _, _). 
/* * LogicalRouterDGWPorts maps from each logical router UUID @@ -249,12 +251,12 @@ relation LogicalRouterDGWPorts( lr_uuid: uuid, l3dgw_ports: Vec<Intern<nb::Logical_Router_Port>>) LogicalRouterDGWPorts(lr_uuid, l3dgw_ports) :- - DistributedGatewayPort(lrp, lr_uuid), + DistributedGatewayPort(lrp, lr_uuid, _), var l3dgw_ports = lrp.group_by(lr_uuid).to_vec(). LogicalRouterDGWPorts(lr_uuid, vec_empty()) :- lr in nb::Logical_Router(), var lr_uuid = lr._uuid, - not DistributedGatewayPort(_, lr_uuid). + not DistributedGatewayPort(_, lr_uuid, _). typedef ExceptionalExtIps = AllowedExtIps{ips: Intern<nb::Address_Set>} | ExemptedExtIps{ips: Intern<nb::Address_Set>} diff --git a/northd/multicast.dl b/northd/multicast.dl index 074caf654..56bfa0c63 100644 --- a/northd/multicast.dl +++ b/northd/multicast.dl @@ -236,7 +236,28 @@ IgmpRouterGroupPort(address, rtr_port.router, rtr_port.lrp._uuid) :- }, var flood_port = FlatMap(sw_flood_ports), &SwitchPort(.lsp = &nb::Logical_Switch_Port{._uuid = flood_port}, - .peer = Some{rtr_port}). + .peer = Some{rtr_port}), + RouterPortIsRedirect(rtr_port.lrp._uuid, false). + +/* Store the chassis redirect port uuid for redirect ports, otherwise traffic + * will not be tunneled properly. This will be translated back to the patch + * port on the remote hypervisor. + */ +IgmpRouterGroupPort(address, rtr_port.router, cr_lrp_uuid) :- + SwitchMcastFloodRelayPorts(switch, sw_flood_ports), + IgmpSwitchMulticastGroup(address, switch, _), + /* For IPv6 only relay routable multicast groups + * (RFC 4291 2.7). + */ + match (ipv6_parse(address.ival())) { + Some{ipv6} -> ipv6.is_routable_multicast(), + None -> true + }, + var flood_port = FlatMap(sw_flood_ports), + &SwitchPort(.lsp = &nb::Logical_Switch_Port{._uuid = flood_port}, + .peer = Some{rtr_port}), + RouterPortIsRedirect(rtr_port.lrp._uuid, true), + DistributedGatewayPort(rtr_port.lrp, _, cr_lrp_uuid). 
/* Aggregated IGMP group for routers: merges all IgmpRouterGroupPort for * a given address-router tuple from all connected switches. diff --git a/northd/northd.c b/northd/northd.c index d1b87891c..e021e146b 100644 --- a/northd/northd.c +++ b/northd/northd.c @@ -14169,7 +14169,12 @@ build_mcast_groups(struct northd_context *ctx, address, igmp_group->mcgroup.name); struct ovn_port **router_igmp_ports = xmalloc(sizeof *router_igmp_ports); - router_igmp_ports[0] = router_port; + /* Store the chassis redirect port otherwise traffic will not + * be tunneled properly. + */ + router_igmp_ports[0] = router_port->cr_port + ? router_port->cr_port + : router_port; ovn_igmp_group_add_entry(igmp_group_rtr, router_igmp_ports, 1); } } diff --git a/northd/ovn_northd.dl b/northd/ovn_northd.dl index 0202af5dc..22913f05a 100644 --- a/northd/ovn_northd.dl +++ b/northd/ovn_northd.dl @@ -464,9 +464,7 @@ function has_distributed_nat(nats: Vec<NAT>): bool { */ /* Create derived ports */ -OutProxy_Port_Binding(// lrp._uuid is already in use; generate a new UUID by - // hashing it. 
- ._uuid = hash128(lrp._uuid), +OutProxy_Port_Binding(._uuid = cr_lrp_uuid, .logical_port = chassis_redirect_name(lrp.name).intern(), .__type = i"chassisredirect", .gateway_chassis = set_empty(), @@ -478,7 +476,7 @@ OutProxy_Port_Binding(// lrp._uuid is already in use; generate a new UUID by .mac = set_singleton(i"${lrp.mac} ${lrp.networks.map(ival).to_vec().join(\" \")}"), .nat_addresses = set_empty(), .external_ids = lrp.external_ids) :- - DistributedGatewayPort(lrp, lr_uuid), + DistributedGatewayPort(lrp, lr_uuid, cr_lrp_uuid), DistributedGatewayPortHAChassisGroup(lrp, hacg_uuid), var redirect_type = match (lrp.options.get(i"redirect-type")) { Some{var value} -> [i"redirect-type" -> value], @@ -549,7 +547,7 @@ RefChassis(lr_uuid, chassis_uuid) :- HAChassis(.hacg_uuid = hacg_uuid, .hac_uuid = hac_uuid), var hacg_size = hac_uuid.group_by(hacg_uuid).to_set().size(), hacg_size > 1, - DistributedGatewayPort(lrp, lr_uuid), + DistributedGatewayPort(lrp, lr_uuid, _), ConnectedLogicalRouter[(lr_uuid, set_uuid)], ConnectedLogicalRouter[(lr2_uuid, set_uuid)], FirstHopLogicalRouter(lr2_uuid, ls_uuid), @@ -577,7 +575,7 @@ relation HAChassisGroupRefChassisSet(hacg_uuid: uuid, chassis_uuids: Set<uuid>) HAChassisGroupRefChassisSet(hacg_uuid, chassis_uuids) :- DistributedGatewayPortHAChassisGroup(lrp, hacg_uuid), - DistributedGatewayPort(lrp, lr_uuid), + DistributedGatewayPort(lrp, lr_uuid, _), RefChassisSet(lr_uuid, chassis_uuids), var chassis_uuids = chassis_uuids.group_by(hacg_uuid).union(). @@ -8198,7 +8196,7 @@ for (&Router(._uuid = lr_uuid)) * packet did not match any higher priority redirect * rule, then the traffic is redirected to the central * instance of the l3dgw_port. 
*/ - for (DistributedGatewayPort(lrp, lr_uuid)) { + for (DistributedGatewayPort(lrp, lr_uuid, _)) { Flow(.logical_datapath = lr_uuid, .stage = s_ROUTER_IN_GW_REDIRECT(), .priority = 50, diff --git a/tests/ovn.at b/tests/ovn.at index d76f4ba86..172b5c713 100644 --- a/tests/ovn.at +++ b/tests/ovn.at @@ -19247,6 +19247,254 @@ OVN_CLEANUP([hv1], [hv2], [hv3]) AT_CLEANUP ]) +OVN_FOR_EACH_NORTHD([ +AT_SETUP([IGMP relay - distributed gateway port]) +AT_KEYWORDS([snoop relay]) + +ovn_start + +# Logical network: +# Two logical switches (sw1-sw2) connected to a logical router (rtr). +# sw1: +# - subnet 10.0.0.0/8 +# - 1 ports bound on hv1 (sw1-p1) +# sw2: +# - subnet 20.0.0.0/8 +# - 1 port bound on hv2 (sw2-p2) +# - IGMP Querier from 20.0.0.254 + +reset_pcap_file() { + local iface=$1 + local pcap_file=$2 + check ovs-vsctl -- set Interface $iface options:tx_pcap=dummy-tx.pcap \ +options:rxq_pcap=dummy-rx.pcap + rm -f ${pcap_file}*.pcap + check ovs-vsctl -- set Interface $iface options:tx_pcap=${pcap_file}-tx.pcap \ +options:rxq_pcap=${pcap_file}-rx.pcap +} + +# +# send_igmp_v3_report INPORT HV ETH_SRC IP_SRC IP_CSUM GROUP REC_TYPE +# IGMP_CSUM OUTFILE +# +# This shell function causes an IGMPv3 report to be received on INPORT of HV. +# The packet's content has Ethernet destination 01:00:5E:00:00:22 and source +# ETH_SRC (exactly 12 hex digits). Ethernet type is set to IP. +# GROUP is the IP multicast group to be joined/to leave (based on REC_TYPE). +# REC_TYPE == 04: join GROUP +# REC_TYPE == 03: leave GROUP +# The packet hexdump is also stored in OUTFILE. 
+# +send_igmp_v3_report() { + local inport=$1 hv=$2 eth_src=$3 ip_src=$4 ip_chksum=$5 group=$6 + local rec_type=$7 igmp_chksum=$8 outfile=$9 + + local eth_dst=01005e000016 + local ip_dst=$(ip_to_hex 224 0 0 22) + local ip_ttl=01 + local ip_ra_opt=94040000 + + local igmp_type=2200 + local num_rec=00000001 + local aux_dlen=00 + local num_src=0000 + + local eth=${eth_dst}${eth_src}0800 + local ip=46c0002800004000${ip_ttl}02${ip_chksum}${ip_src}${ip_dst}${ip_ra_opt} + local igmp=${igmp_type}${igmp_chksum}${num_rec}${rec_type}${aux_dlen}${num_src}${group} + local packet=${eth}${ip}${igmp} + + echo ${packet} >> ${outfile} + check as $hv ovs-appctl netdev-dummy/receive ${inport} ${packet} +} + +# +# store_igmp_v3_query ETH_SRC IP_SRC IP_CSUM OUTFILE +# +# This shell function builds an IGMPv3 general query from ETH_SRC and IP_SRC +# and stores the hexdump of the packet in OUTFILE. +# +store_igmp_v3_query() { + local eth_src=$1 ip_src=$2 ip_chksum=$3 outfile=$4 + + local eth_dst=01005e000001 + local ip_dst=$(ip_to_hex 224 0 0 1) + local ip_ttl=01 + local igmp_type=11 + local max_resp=0a + local igmp_chksum=eeeb + local addr=00000000 + + local eth=${eth_dst}${eth_src}0800 + local ip=4500002000004000${ip_ttl}02${ip_chksum}${ip_src}${ip_dst} + local igmp=${igmp_type}${max_resp}${igmp_chksum}${addr}000a0000 + local packet=${eth}${ip}${igmp} + + echo ${packet} >> ${outfile} +} + +# +# send_ip_multicast_pkt INPORT HV ETH_SRC ETH_DST IP_SRC IP_DST IP_LEN TTL +# IP_CHKSUM IP_PROTO DATA +# +# This shell function causes an IP multicast packet to be received on INPORT +# of HV. +# The hexdump of the packet is stored in OUTFILE. 
+# +send_ip_multicast_pkt() { + local inport=$1 hv=$2 eth_src=$3 eth_dst=$4 + local ip_src=$5 ip_dst=$6 ip_len=$7 ip_ttl=$8 ip_chksum=$9 proto=${10} + local data=${11} + + local eth=${eth_dst}${eth_src}0800 + local ip=450000${ip_len}95f14000${ip_ttl}${proto}${ip_chksum}${ip_src}${ip_dst} + local packet=${eth}${ip}${data} + + check as $hv ovs-appctl netdev-dummy/receive ${inport} ${packet} +} + +# +# store_ip_multicast_pkt ETH_SRC ETH_DST IP_SRC IP_DST IP_LEN TTL +# IP_CHKSUM IP_PROTO DATA OUTFILE +# +# This shell function builds an IP multicast packet and stores the hexdump of +# the packet in OUTFILE. +# +store_ip_multicast_pkt() { + local eth_src=$1 eth_dst=$2 + local ip_src=$3 ip_dst=$4 ip_len=$5 ip_ttl=$6 ip_chksum=$7 proto=$8 + local data=$9 outfile=${10} + + local eth=${eth_dst}${eth_src}0800 + local ip=450000${ip_len}95f14000${ip_ttl}${proto}${ip_chksum}${ip_src}${ip_dst} + local packet=${eth}${ip}${data} + + echo ${packet} >> ${outfile} +} + +check ovn-nbctl ls-add sw1 \ + -- lsp-add sw1 sw1-p1 + +check ovn-nbctl ls-add sw2 \ + -- lsp-add sw2 sw2-p2 + +check ovn-nbctl lr-add rtr \ + -- lrp-add rtr rtr-sw1 00:00:00:00:01:00 10.0.0.254/24 \ + -- lrp-add rtr rtr-sw2 00:00:00:00:02:00 20.0.0.254/24 + +ovn-nbctl lsp-add sw1 sw1-rtr \ + -- lsp-set-type sw1-rtr router \ + -- lsp-set-addresses sw1-rtr 00:00:00:00:01:00 \ + -- lsp-set-options sw1-rtr router-port=rtr-sw1 +check ovn-nbctl lsp-add sw2 sw2-rtr \ + -- lsp-set-type sw2-rtr router \ + -- lsp-set-addresses sw2-rtr 00:00:00:00:02:00 \ + -- lsp-set-options sw2-rtr router-port=rtr-sw2 +check ovn-nbctl lrp-set-gateway-chassis rtr-sw1 hv1 10 +check ovn-nbctl lrp-set-gateway-chassis rtr-sw2 hv2 10 + +net_add n1 +sim_add hv1 +as hv1 +check ovs-vsctl add-br br-phys +ovn_attach n1 br-phys 192.168.0.1 +check ovs-vsctl -- add-port br-int hv1-vif1 -- \ + set interface hv1-vif1 external-ids:iface-id=sw1-p1 \ + options:tx_pcap=hv1/vif1-tx.pcap \ + options:rxq_pcap=hv1/vif1-rx.pcap + +sim_add hv2 +as hv2 +check ovs-vsctl 
add-br br-phys +ovn_attach n1 br-phys 192.168.0.2 +check ovs-vsctl -- add-port br-int hv2-vif2 -- \ + set interface hv2-vif2 external-ids:iface-id=sw2-p2 \ + options:tx_pcap=hv2/vif2-tx.pcap \ + options:rxq_pcap=hv2/vif2-rx.pcap + +wait_for_ports_up + +AS_BOX([Wait until cr-rtr-sw1 is claimed by hv1]) +hv1_chassis=$(fetch_column Chassis _uuid name=hv1) +AS_BOX([check that the chassis redirect port has been claimed by the hv1 chassis]) +wait_row_count Port_Binding 1 logical_port=cr-rtr-sw1 chassis=$hv1_chassis + +AS_BOX([Wait until cr-rtr-sw2 is claimed by hv2]) +hv2_chassis=$(fetch_column Chassis _uuid name=hv2) +AS_BOX([check that the chassis redirect port has been claimed by the hv2 chassis]) +wait_row_count Port_Binding 1 logical_port=cr-rtr-sw2 chassis=$hv2_chassis + +check ovn-nbctl --wait=hv sync + +AT_CAPTURE_FILE([exp]) +AT_CAPTURE_FILE([rcv]) +check_packets() { + > exp + > rcv + if test "$1" = --uniq; then + sort="sort -u"; shift + else + sort=sort + fi + for tuple in "$@"; do + set $tuple; pcap=$1 type=$2 + echo "--- $pcap" | tee -a exp >> rcv + $sort "$type" >> exp + $PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" $pcap | $sort >> rcv + echo | tee -a exp >> rcv + done + + $at_diff exp rcv >/dev/null +} + +OVN_POPULATE_ARP + +# Enable IGMP snooping on sw1 and sw2 and relay on rtr. +check ovn-nbctl --wait=hv set Logical_Switch sw1 \ + other_config:mcast_querier="false" \ + other_config:mcast_snoop="true" +check ovn-nbctl --wait=hv set Logical_Switch sw2 \ + other_config:mcast_querier="false" \ + other_config:mcast_snoop="true" +check ovn-nbctl set logical_router rtr \ + options:mcast_relay="true" +check ovn-nbctl --wait=hv sync + +# Inject IGMP Join for 239.0.1.68 on sw2-p2. +send_igmp_v3_report hv2-vif2 hv2 \ + 000000000001 $(ip_to_hex 10 0 0 1) f9f8 \ + $(ip_to_hex 239 0 1 68) 04 e9b9 \ + /dev/null + +# Check that the IGMP Group is learned by all switches. 
+wait_row_count IGMP_Group 1 address=239.0.1.68 +check ovn-nbctl --wait=hv sync + +# Send traffic from sw1 and make sure it is relayed by rtr. +> expected_routed +> expected_empty + +as hv1 reset_pcap_file hv1-vif1 hv1/vif1 +as hv2 reset_pcap_file hv2-vif2 hv2/vif2 + +send_ip_multicast_pkt hv1-vif1 hv1 \ + 000000000001 01005e000144 \ + $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \ + e518e518000a3b3a0000 +store_ip_multicast_pkt \ + 000000000200 01005e000144 \ + $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 1f cb70 11 \ + e518e518000a3b3a0000 expected_routed + +OVS_WAIT_UNTIL( + [check_packets 'hv1/vif1-tx.pcap expected_empty' \ + 'hv2/vif2-tx.pcap expected_routed'], + [$at_diff -F'^---' exp rcv]) + +OVN_CLEANUP([hv1], [hv2]) +AT_CLEANUP +]) + OVN_FOR_EACH_NORTHD([ AT_SETUP([MLD snoop/querier/relay]) AT_KEYWORDS([snoop query querier relay]) -- 2.27.0 _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev