On 4/20/26 12:54 PM, Alexandra Rukomoinikova via dev wrote:
> Commit [1] added handling for E/W ICMPv4/v6 "fragmentation needed"
> packets generated for overlay tunneled traffic. This was required
> because the GENEVE kernel module generates ICMP "need frag" packets
> when a tunneled packet exceeds the path MTU, and such packets were
> previously dropped due to metadata needing to be swapped.
> 
> However, it did not cover the case where similar ICMP packets
> arrive from VTEP tunnels. Such packets are not subject to the same
> metadata handling constraints, since they are VXLAN-encapsulated and
> are already delivered directly to the destination VIF MAC address.
> 
> As a result, they do not match the added for such packets rules and are
> dropped. Exclude packets coming from VTEP tunnels from this special handling.
> 
> [1] 
> https://github.com/ovn-org/ovn/commit/221476a01f2670cf4eb78cd9353e709cb8a16329
> Fixes: 221476a01f26 ("ovn: Add tunnel PMTUD support.")
> Acked-by: Lorenzo Bianconi <[email protected]>
> Signed-off-by: Alexandra Rukomoinikova <[email protected]>
> ---

Hi Alexandra, Lorenzo,

Thanks for the patch and review!

>  v1 --> v2: added ACK by
>             rename add_tunnel_ingress_icmp_need_frag_flow func to 
> add_tunnel_ingress_pmtud_flows
>             fixed Lorenzo's comments
> ---
>  controller/encaps.c          | 29 ++++++++++++------
>  controller/local_data.c      |  2 ++
>  controller/local_data.h      |  1 +
>  controller/physical.c        | 59 +++++++++++++++++++++++++-----------
>  tests/ovn-controller-vtep.at |  4 +++
>  5 files changed, 69 insertions(+), 26 deletions(-)
> 
> diff --git a/controller/encaps.c b/controller/encaps.c
> index 61f41bf3a..0f3dd5aed 100644
> --- a/controller/encaps.c
> +++ b/controller/encaps.c
> @@ -44,6 +44,7 @@ encaps_register_ovs_idl(struct ovsdb_idl *ovs_idl)
>      ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_name);
>      ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_type);
>      ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_options);
> +    ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_other_config);
>  }
>  
>  /* Enough context to create a new tunnel, using tunnel_add(). */
> @@ -201,12 +202,14 @@ out:
>  }
>  
>  static void
> -tunnel_add(struct tunnel_ctx *tc, const struct sbrec_sb_global *sbg,
> -           const char *new_chassis_id, const struct sbrec_encap *encap,
> -           const char *local_ip,
> +tunnel_add(struct tunnel_ctx *tc,
> +           const struct sbrec_sb_global *sbg,
> +           const struct sbrec_chassis *chassis_rec,
> +           const struct sbrec_encap *encap, const char *local_ip,
>             const struct ovsrec_open_vswitch_table *ovs_table)
>  {
>      struct smap options = SMAP_INITIALIZER(&options);
> +    struct smap other_config = SMAP_INITIALIZER(&other_config);
>      smap_add(&options, "remote_ip", encap->ip);
>      smap_add(&options, "local_ip", local_ip);
>      smap_add(&options, "key", "flow");
> @@ -214,6 +217,8 @@ tunnel_add(struct tunnel_ctx *tc, const struct 
> sbrec_sb_global *sbg,
>      const char *csum = smap_get(&encap->options, "csum");
>      char *tunnel_entry_id = NULL;
>      char *tunnel_entry_id_old = NULL;
> +    bool is_vtep_tunnel = smap_get_bool(&chassis_rec->other_config,
> +                                        "is-vtep", false);

This is duplicated in chassis.c (as a static function), should we unify
all the "is-vtep" helpers?

>  
>      /*
>       * Since a chassis may have multiple encap-ip, we can't just add the
> @@ -221,9 +226,9 @@ tunnel_add(struct tunnel_ctx *tc, const struct 
> sbrec_sb_global *sbg,
>       * combination of the chassis_name and the remote and local encap-ips to
>       * identify a specific tunnel to the remote chassis.
>       */
> -    tunnel_entry_id = encaps_tunnel_id_create(new_chassis_id, encap->ip,
> +    tunnel_entry_id = encaps_tunnel_id_create(chassis_rec->name, encap->ip,
>                                                local_ip);
> -    tunnel_entry_id_old = encaps_tunnel_id_create_legacy(new_chassis_id,
> +    tunnel_entry_id_old = encaps_tunnel_id_create_legacy(chassis_rec->name,
>                                                           encap->ip);
>      if (csum && (!strcmp(csum, "true") || !strcmp(csum, "false"))) {
>          smap_add(&options, "csum", csum);
> @@ -258,7 +263,7 @@ tunnel_add(struct tunnel_ctx *tc, const struct 
> sbrec_sb_global *sbg,
>  
>      /* Add auth info if ipsec is enabled. */
>      if (sbg->ipsec) {
> -        smap_add(&options, "remote_name", new_chassis_id);
> +        smap_add(&options, "remote_name", chassis_rec->name);
>  
>          /* Force NAT-T traversal via configuration */
>          /* Two ipsec backends are supported: libreswan and strongswan */
> @@ -276,6 +281,10 @@ tunnel_add(struct tunnel_ctx *tc, const struct 
> sbrec_sb_global *sbg,
>          }
>      }
>  
> +    if (is_vtep_tunnel) {
> +        smap_add(&other_config, "is-vtep", "true");
> +    }
> +
>      /* If there's an existing tunnel record that does not need any change,
>       * keep it.  Otherwise, create a new record (if there was an existing
>       * record, the new record will supplant it and encaps_run() will delete
> @@ -312,10 +321,10 @@ tunnel_add(struct tunnel_ctx *tc, const struct 
> sbrec_sb_global *sbg,
>       * its name, otherwise generate a new, unique name. */
>      char *port_name = (tunnel
>                         ? xstrdup(tunnel->port->name)
> -                       : tunnel_create_name(tc, new_chassis_id));
> +                       : tunnel_create_name(tc, chassis_rec->name));
>      if (!port_name) {
>          VLOG_WARN("Unable to allocate unique name for '%s' tunnel",
> -                  new_chassis_id);
> +                  chassis_rec->name);
>          goto exit;
>      }
>  
> @@ -323,6 +332,7 @@ tunnel_add(struct tunnel_ctx *tc, const struct 
> sbrec_sb_global *sbg,
>      ovsrec_interface_set_name(iface, port_name);
>      ovsrec_interface_set_type(iface, encap->type);
>      ovsrec_interface_set_options(iface, &options);
> +    ovsrec_interface_set_other_config(iface, &other_config);
>  
>      struct ovsrec_port *port = ovsrec_port_insert(tc->ovs_txn);
>      ovsrec_port_set_name(port, port_name);
> @@ -338,6 +348,7 @@ exit:
>      free(tunnel_entry_id);
>      free(tunnel_entry_id_old);
>      smap_destroy(&options);
> +    smap_destroy(&other_config);
>  }
>  
>  static bool
> @@ -403,7 +414,7 @@ chassis_tunnel_add(const struct sbrec_chassis 
> *chassis_rec,
>              }
>              VLOG_DBG("tunnel_add: '%s', local ip: %s", chassis_rec->name,
>                       this_chassis->encaps[j]->ip);
> -            tunnel_add(tc, sbg, chassis_rec->name, chassis_rec->encaps[i],
> +            tunnel_add(tc, sbg, chassis_rec, chassis_rec->encaps[i],
>                         this_chassis->encaps[j]->ip, ovs_table);
>              tuncnt++;
>          }
> diff --git a/controller/local_data.c b/controller/local_data.c
> index dda746d73..5f1a52771 100644
> --- a/controller/local_data.c
> +++ b/controller/local_data.c
> @@ -532,6 +532,8 @@ local_nonvif_data_run(const struct ovsrec_bridge *br_int,
>                  tun->ofport = u16_to_ofp(ofport);
>                  tun->type = tunnel_type;
>                  tun->is_ipv6 = ip ? addr_is_ipv6(ip) : false;
> +                tun->is_vtep_tunnel = smap_get_bool(&iface_rec->other_config,
> +                                                    "is-vtep", false);

Should we have a small public helper in encaps.[ch] (where the tunnel
interface is actually created and this other_config value is set)?

>  
>                  free(hash_id);
>                  free(ip);
> diff --git a/controller/local_data.h b/controller/local_data.h
> index 948c1a935..fdb81288c 100644
> --- a/controller/local_data.h
> +++ b/controller/local_data.h
> @@ -146,6 +146,7 @@ struct chassis_tunnel {
>      ofp_port_t ofport;
>      enum chassis_tunnel_type type;
>      bool is_ipv6;
> +    bool is_vtep_tunnel;

I think the terminology we agreed upon back when Ihar (in CC) added
VXLAN support [0] (for ovn-chassis-to-ovn-chassis communication) was
"ramp switch", should this be called "is_ramp_tunnel"?

[0] https://github.com/ovn-org/ovn/commit/b07f1bc

>  };
>  
>  /* Flow-based tunnel that consolidates multiple endpoints into a single
> diff --git a/controller/physical.c b/controller/physical.c
> index 228f3d171..4f1748c6e 100644
> --- a/controller/physical.c
> +++ b/controller/physical.c
> @@ -351,30 +351,35 @@ put_flow_based_remote_port_redirect_overlay(
>      }
>  }
>  
> +/* Add handling for E/W ICMPv4/v6 packets when tunneled packets exceed
> + * path MTU.
> + * If packet needs to be tunneled to another node and the physical
> + * interface used for tunneling has a lower MTU than the packet size,
> + * or if there is a route exception with a smaller MTU, kernel
> + * generates an ICMP "Fragmentation Needed" message, but the package

Nit: s/package/packet/

> + * metadata didn't change. Such packets might have been dropped due
> + * to required metadata modifications for returned packet.
> + *
> + * Mark these packets with MLF_RX_FROM_TUNNEL_BIT for further
> + * processing. Packets received from a VTEP tunnel should be passed

Here too, probably "RAMP" is more appropriate.

> + * through, and errors handled via the normal processing path, since
> + * port metadata is not carried in VTEP packets in VNI.
> + */
>  static void
> -add_tunnel_ingress_flows(const struct chassis_tunnel *tun,
> -                         enum mf_field_id mff_ovn_geneve,
> -                         struct ovn_desired_flow_table *flow_table,
> -                         struct ofpbuf *ofpacts)
> +add_tunnel_ingress_pmtud_flows(const struct chassis_tunnel *tun,
> +                               struct ofpbuf *ofpacts,
> +                               struct ovn_desired_flow_table *flow_table)
>  {
> -    /* Main ingress flow (priority 100) */
> -    struct match match = MATCH_CATCHALL_INITIALIZER;
> -    match_set_in_port(&match, tun->ofport);
> -
> -    ofpbuf_clear(ofpacts);
> -    put_decapsulation(mff_ovn_geneve, tun, ofpacts);
> -    put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts);
> +    if (tun->is_vtep_tunnel) {
> +        return;
> +    }
>  
> -    ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 100, 0, &match,
> -                    ofpacts, hc_uuid);
> +    struct match match = MATCH_CATCHALL_INITIALIZER;
>  
>      /* Set allow rx from tunnel bit */
>      put_load(1, MFF_LOG_FLAGS, MLF_RX_FROM_TUNNEL_BIT, 1, ofpacts);
>      put_resubmit(OFTABLE_CT_ZONE_LOOKUP, ofpacts);
>  
> -    /* Add specific flows for E/W ICMPv{4,6} packets if tunnelled packets
> -     * do not fit path MTU. */
> -
>      /* IPv4 ICMP flow (priority 120) */
>      match_init_catchall(&match);
>      match_set_in_port(&match, tun->ofport);
> @@ -398,6 +403,26 @@ add_tunnel_ingress_flows(const struct chassis_tunnel 
> *tun,
>                      ofpacts, hc_uuid);
>  }
>  
> +static void
> +add_tunnel_ingress_flows(const struct chassis_tunnel *tun,
> +                         enum mf_field_id mff_ovn_geneve,
> +                         struct ovn_desired_flow_table *flow_table,
> +                         struct ofpbuf *ofpacts)
> +{
> +    /* Main ingress flow (priority 100) */
> +    struct match match = MATCH_CATCHALL_INITIALIZER;
> +    match_set_in_port(&match, tun->ofport);
> +
> +    ofpbuf_clear(ofpacts);
> +    put_decapsulation(mff_ovn_geneve, tun, ofpacts);
> +    put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts);
> +
> +    ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 100, 0, &match,
> +                    ofpacts, hc_uuid);
> +
> +    add_tunnel_ingress_pmtud_flows(tun, ofpacts, flow_table);
> +}
> +
>  static void
>  put_stack(enum mf_field_id field, struct ofpact_stack *stack)
>  {
> @@ -3943,7 +3968,7 @@ physical_run(struct physical_ctx *p_ctx,
>      struct chassis_tunnel *tun;
>      HMAP_FOR_EACH (tun, hmap_node, p_ctx->chassis_tunnels) {
>          add_tunnel_ingress_flows(tun, p_ctx->mff_ovn_geneve, flow_table,
> -                                &ofpacts);
> +                                 &ofpacts);

Nit: unrelated change.

>      }
>  
>      /* Process packets that arrive from flow-based tunnels. */
> diff --git a/tests/ovn-controller-vtep.at b/tests/ovn-controller-vtep.at
> index 961324bd2..caf53e291 100644
> --- a/tests/ovn-controller-vtep.at
> +++ b/tests/ovn-controller-vtep.at
> @@ -775,6 +775,10 @@ AT_CHECK([ovs-ofctl dump-flows br-int 
> table=OFTABLE_PHY_TO_LOG | grep 'priority=
>  priority=110,tun_id=0x<>,in_port=<> 
> actions=move:NXM_NX_TUN_ID[[0..23]]->OXM_OF_METADATA[[0..23]],load:0x<>->NXM_NX_REG14[[0..14]],load:0x<>->NXM_NX_REG10[[1]],resubmit(,OFTABLE_LOG_INGRESS_PIPELINE)
>  ])
>  
> +# Skip processing ICMP "packet too big" errors in this table if the packet 
> came from a VTEP tunnel.
> +AT_CHECK([ovs-ofctl dump-flows br-int table=OFTABLE_PHY_TO_LOG | \
> +          grep -E 'icmp_type=3,icmp_code=4|icmp_type=2,icmp_code=0'], [1], 
> [])
> +
>  OVN_CONTROLLER_VTEP_STOP([], vtep1)
>  OVN_CLEANUP([hv1])
>  AT_CLEANUP

Regards,
Dumitru

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to