Commit [1] added handling for E/W ICMPv4/v6 "fragmentation needed" packets generated for overlay tunneled traffic. This was required because the GENEVE kernel module generates ICMP "need frag" packets when a tunneled packet exceeds the path MTU, and such packets were previously dropped due to metadata needing to be swapped.
However, it did not cover the case where similar ICMP packets arrive from VTEP tunnels. Such packets are not subject to the same metadata handling constraints, since they are VXLAN-encapsulated and are already delivered directly to the destination VIF MAC address. As a result, they do not match the added for such packets rules and are dropped. Exclude packets coming from VTEP tunnels from this special handling. [1] https://github.com/ovn-org/ovn/commit/221476a01f2670cf4eb78cd9353e709cb8a16329 Fixes: 221476a01f26 ("ovn: Add tunnel PMTUD support.") Acked-by: Lorenzo Bianconi <[email protected]> Signed-off-by: Alexandra Rukomoinikova <[email protected]> --- v1 --> v2: added ACK by rename add_tunnel_ingress_icmp_need_frag_flow func to add_tunnel_ingress_pmtud_flows fixed Lorenzo's comments --- controller/encaps.c | 29 ++++++++++++------ controller/local_data.c | 2 ++ controller/local_data.h | 1 + controller/physical.c | 59 +++++++++++++++++++++++++----------- tests/ovn-controller-vtep.at | 4 +++ 5 files changed, 69 insertions(+), 26 deletions(-) diff --git a/controller/encaps.c b/controller/encaps.c index 61f41bf3a..0f3dd5aed 100644 --- a/controller/encaps.c +++ b/controller/encaps.c @@ -44,6 +44,7 @@ encaps_register_ovs_idl(struct ovsdb_idl *ovs_idl) ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_name); ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_type); ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_options); + ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_other_config); } /* Enough context to create a new tunnel, using tunnel_add(). */ @@ -201,12 +202,14 @@ out: } static void -tunnel_add(struct tunnel_ctx *tc, const struct sbrec_sb_global *sbg, - const char *new_chassis_id, const struct sbrec_encap *encap, - const char *local_ip, +tunnel_add(struct tunnel_ctx *tc, + const struct sbrec_sb_global *sbg, + const struct sbrec_chassis *chassis_rec, + const struct sbrec_encap *encap, const char *local_ip, const struct ovsrec_open_vswitch_table *ovs_table) { struct smap options = SMAP_INITIALIZER(&options); + struct smap other_config = SMAP_INITIALIZER(&other_config); smap_add(&options, "remote_ip", encap->ip); smap_add(&options, "local_ip", local_ip); smap_add(&options, "key", "flow"); @@ -214,6 +217,8 @@ tunnel_add(struct tunnel_ctx *tc, const struct sbrec_sb_global *sbg, const char *csum = smap_get(&encap->options, "csum"); char *tunnel_entry_id = NULL; char *tunnel_entry_id_old = NULL; + bool is_vtep_tunnel = smap_get_bool(&chassis_rec->other_config, + "is-vtep", false); /* * Since a chassis may have multiple encap-ip, we can't just add the @@ -221,9 +226,9 @@ tunnel_add(struct tunnel_ctx *tc, const struct sbrec_sb_global *sbg, * combination of the chassis_name and the remote and local encap-ips to * identify a specific tunnel to the remote chassis. */ - tunnel_entry_id = encaps_tunnel_id_create(new_chassis_id, encap->ip, + tunnel_entry_id = encaps_tunnel_id_create(chassis_rec->name, encap->ip, local_ip); - tunnel_entry_id_old = encaps_tunnel_id_create_legacy(new_chassis_id, + tunnel_entry_id_old = encaps_tunnel_id_create_legacy(chassis_rec->name, encap->ip); if (csum && (!strcmp(csum, "true") || !strcmp(csum, "false"))) { smap_add(&options, "csum", csum); @@ -258,7 +263,7 @@ tunnel_add(struct tunnel_ctx *tc, const struct sbrec_sb_global *sbg, /* Add auth info if ipsec is enabled. */ if (sbg->ipsec) { - smap_add(&options, "remote_name", new_chassis_id); + smap_add(&options, "remote_name", chassis_rec->name); /* Force NAT-T traversal via configuration */ /* Two ipsec backends are supported: libreswan and strongswan */ @@ -276,6 +281,10 @@ tunnel_add(struct tunnel_ctx *tc, const struct sbrec_sb_global *sbg, } } + if (is_vtep_tunnel) { + smap_add(&other_config, "is-vtep", "true"); + } + /* If there's an existing tunnel record that does not need any change, * keep it. Otherwise, create a new record (if there was an existing * record, the new record will supplant it and encaps_run() will delete @@ -312,10 +321,10 @@ tunnel_add(struct tunnel_ctx *tc, const struct sbrec_sb_global *sbg, * its name, otherwise generate a new, unique name. */ char *port_name = (tunnel ? xstrdup(tunnel->port->name) - : tunnel_create_name(tc, new_chassis_id)); + : tunnel_create_name(tc, chassis_rec->name)); if (!port_name) { VLOG_WARN("Unable to allocate unique name for '%s' tunnel", - new_chassis_id); + chassis_rec->name); goto exit; } @@ -323,6 +332,7 @@ tunnel_add(struct tunnel_ctx *tc, const struct sbrec_sb_global *sbg, ovsrec_interface_set_name(iface, port_name); ovsrec_interface_set_type(iface, encap->type); ovsrec_interface_set_options(iface, &options); + ovsrec_interface_set_other_config(iface, &other_config); struct ovsrec_port *port = ovsrec_port_insert(tc->ovs_txn); ovsrec_port_set_name(port, port_name); @@ -338,6 +348,7 @@ exit: free(tunnel_entry_id); free(tunnel_entry_id_old); smap_destroy(&options); + smap_destroy(&other_config); } static bool @@ -403,7 +414,7 @@ chassis_tunnel_add(const struct sbrec_chassis *chassis_rec, } VLOG_DBG("tunnel_add: '%s', local ip: %s", chassis_rec->name, this_chassis->encaps[j]->ip); - tunnel_add(tc, sbg, chassis_rec->name, chassis_rec->encaps[i], + tunnel_add(tc, sbg, chassis_rec, chassis_rec->encaps[i], this_chassis->encaps[j]->ip, ovs_table); tuncnt++; } diff --git a/controller/local_data.c b/controller/local_data.c index dda746d73..5f1a52771 100644 --- a/controller/local_data.c +++ b/controller/local_data.c @@ -532,6 +532,8 @@ local_nonvif_data_run(const struct ovsrec_bridge *br_int, tun->ofport = u16_to_ofp(ofport); tun->type = tunnel_type; tun->is_ipv6 = ip ? addr_is_ipv6(ip) : false; + tun->is_vtep_tunnel = smap_get_bool(&iface_rec->other_config, + "is-vtep", false); free(hash_id); free(ip); diff --git a/controller/local_data.h b/controller/local_data.h index 948c1a935..fdb81288c 100644 --- a/controller/local_data.h +++ b/controller/local_data.h @@ -146,6 +146,7 @@ struct chassis_tunnel { ofp_port_t ofport; enum chassis_tunnel_type type; bool is_ipv6; + bool is_vtep_tunnel; }; /* Flow-based tunnel that consolidates multiple endpoints into a single diff --git a/controller/physical.c b/controller/physical.c index 228f3d171..4f1748c6e 100644 --- a/controller/physical.c +++ b/controller/physical.c @@ -351,30 +351,35 @@ put_flow_based_remote_port_redirect_overlay( } } +/* Add handling for E/W ICMPv4/v6 packets when tunneled packets exceed + * path MTU. + * If packet needs to be tunneled to another node and the physical + * interface used for tunneling has a lower MTU than the packet size, + * or if there is a route exception with a smaller MTU, kernel + * generates an ICMP "Fragmentation Needed" message, but the package + * metadata didn't change. Such packets might have been dropped due + * to required metadata modifications for returned packet. + * + * Mark these packets with MLF_RX_FROM_TUNNEL_BIT for further + * processing. Packets received from a VTEP tunnel should be passed + * through, and errors handled via the normal processing path, since + * port metadata is not carried in VTEP packets in VNI. + */ static void -add_tunnel_ingress_flows(const struct chassis_tunnel *tun, - enum mf_field_id mff_ovn_geneve, - struct ovn_desired_flow_table *flow_table, - struct ofpbuf *ofpacts) +add_tunnel_ingress_pmtud_flows(const struct chassis_tunnel *tun, + struct ofpbuf *ofpacts, + struct ovn_desired_flow_table *flow_table) { - /* Main ingress flow (priority 100) */ - struct match match = MATCH_CATCHALL_INITIALIZER; - match_set_in_port(&match, tun->ofport); - - ofpbuf_clear(ofpacts); - put_decapsulation(mff_ovn_geneve, tun, ofpacts); - put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts); + if (tun->is_vtep_tunnel) { + return; + } - ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 100, 0, &match, - ofpacts, hc_uuid); + struct match match = MATCH_CATCHALL_INITIALIZER; /* Set allow rx from tunnel bit */ put_load(1, MFF_LOG_FLAGS, MLF_RX_FROM_TUNNEL_BIT, 1, ofpacts); put_resubmit(OFTABLE_CT_ZONE_LOOKUP, ofpacts); - /* Add specific flows for E/W ICMPv{4,6} packets if tunnelled packets - * do not fit path MTU. */ - /* IPv4 ICMP flow (priority 120) */ match_init_catchall(&match); match_set_in_port(&match, tun->ofport); @@ -398,6 +403,26 @@ add_tunnel_ingress_flows(const struct chassis_tunnel *tun, ofpacts, hc_uuid); } +static void +add_tunnel_ingress_flows(const struct chassis_tunnel *tun, + enum mf_field_id mff_ovn_geneve, + struct ovn_desired_flow_table *flow_table, + struct ofpbuf *ofpacts) +{ + /* Main ingress flow (priority 100) */ + struct match match = MATCH_CATCHALL_INITIALIZER; + match_set_in_port(&match, tun->ofport); + + ofpbuf_clear(ofpacts); + put_decapsulation(mff_ovn_geneve, tun, ofpacts); + put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts); + + ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 100, 0, &match, + ofpacts, hc_uuid); + + add_tunnel_ingress_pmtud_flows(tun, ofpacts, flow_table); +} + static void put_stack(enum mf_field_id field, struct ofpact_stack *stack) { @@ -3943,7 +3968,7 @@ physical_run(struct physical_ctx *p_ctx, struct chassis_tunnel *tun; HMAP_FOR_EACH (tun, hmap_node, p_ctx->chassis_tunnels) { add_tunnel_ingress_flows(tun, p_ctx->mff_ovn_geneve, flow_table, - &ofpacts); + &ofpacts); } /* Process packets that arrive from flow-based tunnels. */ diff --git a/tests/ovn-controller-vtep.at b/tests/ovn-controller-vtep.at index 961324bd2..caf53e291 100644 --- a/tests/ovn-controller-vtep.at +++ b/tests/ovn-controller-vtep.at @@ -775,6 +775,10 @@ AT_CHECK([ovs-ofctl dump-flows br-int table=OFTABLE_PHY_TO_LOG | grep 'priority= priority=110,tun_id=0x<>,in_port=<> actions=move:NXM_NX_TUN_ID[[0..23]]->OXM_OF_METADATA[[0..23]],load:0x<>->NXM_NX_REG14[[0..14]],load:0x<>->NXM_NX_REG10[[1]],resubmit(,OFTABLE_LOG_INGRESS_PIPELINE) ]) +# Skip processing ICMP "packet too big" errors in this table if the packet came from a VTEP tunnel. +AT_CHECK([ovs-ofctl dump-flows br-int table=OFTABLE_PHY_TO_LOG | \ + grep -E 'icmp_type=3,icmp_code=4|icmp_type=2,icmp_code=0'], [1], []) + OVN_CONTROLLER_VTEP_STOP([], vtep1) OVN_CLEANUP([hv1]) AT_CLEANUP -- 2.48.1 _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
