On 23.04.2026 13:33, Dumitru Ceara wrote:
> On 4/20/26 12:54 PM, Alexandra Rukomoinikova via dev wrote:
>> Commit [1] added handling for E/W ICMPv4/v6 "fragmentation needed"
>> packets generated for overlay tunneled traffic. This was required
>> because the GENEVE kernel module generates ICMP "need frag" packets
>> when a tunneled packet exceeds the path MTU, and such packets were
>> previously dropped due to metadata needing to be swapped.
>>
>> However, it did not cover the case where similar ICMP packets
>> arrive from VTEP tunnels. Such packets are not subject to the same
>> metadata handling constraints, since they are VXLAN-encapsulated and
>> are already delivered directly to the destination VIF MAC address.
>>
>> As a result, they do not match the added for such packets rules and are
>> dropped. Exclude packets coming from VTEP tunnels from this special handling.
>>
>> [1] 
>> https://github.com/ovn-org/ovn/commit/221476a01f2670cf4eb78cd9353e709cb8a16329
>> Fixes: 221476a01f26 ("ovn: Add tunnel PMTUD support.")
>> Acked-by: Lorenzo Bianconi <[email protected]>
>> Signed-off-by: Alexandra Rukomoinikova <[email protected]>
>> ---
> Hi Alexandra, Lorenzo,
>
> Thanks for the patch and review!


Hi Dumitru! Thanks for review, i will answer below


>
>>   v1 --> v2: added ACK by
>>              rename add_tunnel_ingress_icmp_need_frag_flow func to 
>> add_tunnel_ingress_pmtud_flows
>>              fixed Lorenzo's comments
>> ---
>>   controller/encaps.c          | 29 ++++++++++++------
>>   controller/local_data.c      |  2 ++
>>   controller/local_data.h      |  1 +
>>   controller/physical.c        | 59 +++++++++++++++++++++++++-----------
>>   tests/ovn-controller-vtep.at |  4 +++
>>   5 files changed, 69 insertions(+), 26 deletions(-)
>>
>> diff --git a/controller/encaps.c b/controller/encaps.c
>> index 61f41bf3a..0f3dd5aed 100644
>> --- a/controller/encaps.c
>> +++ b/controller/encaps.c
>> @@ -44,6 +44,7 @@ encaps_register_ovs_idl(struct ovsdb_idl *ovs_idl)
>>       ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_name);
>>       ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_type);
>>       ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_options);
>> +    ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_other_config);
>>   }
>>   
>>   /* Enough context to create a new tunnel, using tunnel_add(). */
>> @@ -201,12 +202,14 @@ out:
>>   }
>>   
>>   static void
>> -tunnel_add(struct tunnel_ctx *tc, const struct sbrec_sb_global *sbg,
>> -           const char *new_chassis_id, const struct sbrec_encap *encap,
>> -           const char *local_ip,
>> +tunnel_add(struct tunnel_ctx *tc,
>> +           const struct sbrec_sb_global *sbg,
>> +           const struct sbrec_chassis *chassis_rec,
>> +           const struct sbrec_encap *encap, const char *local_ip,
>>              const struct ovsrec_open_vswitch_table *ovs_table)
>>   {
>>       struct smap options = SMAP_INITIALIZER(&options);
>> +    struct smap other_config = SMAP_INITIALIZER(&other_config);
>>       smap_add(&options, "remote_ip", encap->ip);
>>       smap_add(&options, "local_ip", local_ip);
>>       smap_add(&options, "key", "flow");
>> @@ -214,6 +217,8 @@ tunnel_add(struct tunnel_ctx *tc, const struct 
>> sbrec_sb_global *sbg,
>>       const char *csum = smap_get(&encap->options, "csum");
>>       char *tunnel_entry_id = NULL;
>>       char *tunnel_entry_id_old = NULL;
>> +    bool is_vtep_tunnel = smap_get_bool(&chassis_rec->other_config,
>> +                                        "is-vtep", false);
> This is duplicated in chassis.c (as a static function), should we unify
> all the "is-vtep" helpers?


Yes, thanks, I will add


>
>>   
>>       /*
>>        * Since a chassis may have multiple encap-ip, we can't just add the
>> @@ -221,9 +226,9 @@ tunnel_add(struct tunnel_ctx *tc, const struct 
>> sbrec_sb_global *sbg,
>>        * combination of the chassis_name and the remote and local encap-ips 
>> to
>>        * identify a specific tunnel to the remote chassis.
>>        */
>> -    tunnel_entry_id = encaps_tunnel_id_create(new_chassis_id, encap->ip,
>> +    tunnel_entry_id = encaps_tunnel_id_create(chassis_rec->name, encap->ip,
>>                                                 local_ip);
>> -    tunnel_entry_id_old = encaps_tunnel_id_create_legacy(new_chassis_id,
>> +    tunnel_entry_id_old = encaps_tunnel_id_create_legacy(chassis_rec->name,
>>                                                            encap->ip);
>>       if (csum && (!strcmp(csum, "true") || !strcmp(csum, "false"))) {
>>           smap_add(&options, "csum", csum);
>> @@ -258,7 +263,7 @@ tunnel_add(struct tunnel_ctx *tc, const struct 
>> sbrec_sb_global *sbg,
>>   
>>       /* Add auth info if ipsec is enabled. */
>>       if (sbg->ipsec) {
>> -        smap_add(&options, "remote_name", new_chassis_id);
>> +        smap_add(&options, "remote_name", chassis_rec->name);
>>   
>>           /* Force NAT-T traversal via configuration */
>>           /* Two ipsec backends are supported: libreswan and strongswan */
>> @@ -276,6 +281,10 @@ tunnel_add(struct tunnel_ctx *tc, const struct 
>> sbrec_sb_global *sbg,
>>           }
>>       }
>>   
>> +    if (is_vtep_tunnel) {
>> +        smap_add(&other_config, "is-vtep", "true");
>> +    }
>> +
>>       /* If there's an existing tunnel record that does not need any change,
>>        * keep it.  Otherwise, create a new record (if there was an existing
>>        * record, the new record will supplant it and encaps_run() will delete
>> @@ -312,10 +321,10 @@ tunnel_add(struct tunnel_ctx *tc, const struct 
>> sbrec_sb_global *sbg,
>>        * its name, otherwise generate a new, unique name. */
>>       char *port_name = (tunnel
>>                          ? xstrdup(tunnel->port->name)
>> -                       : tunnel_create_name(tc, new_chassis_id));
>> +                       : tunnel_create_name(tc, chassis_rec->name));
>>       if (!port_name) {
>>           VLOG_WARN("Unable to allocate unique name for '%s' tunnel",
>> -                  new_chassis_id);
>> +                  chassis_rec->name);
>>           goto exit;
>>       }
>>   
>> @@ -323,6 +332,7 @@ tunnel_add(struct tunnel_ctx *tc, const struct 
>> sbrec_sb_global *sbg,
>>       ovsrec_interface_set_name(iface, port_name);
>>       ovsrec_interface_set_type(iface, encap->type);
>>       ovsrec_interface_set_options(iface, &options);
>> +    ovsrec_interface_set_other_config(iface, &other_config);
>>   
>>       struct ovsrec_port *port = ovsrec_port_insert(tc->ovs_txn);
>>       ovsrec_port_set_name(port, port_name);
>> @@ -338,6 +348,7 @@ exit:
>>       free(tunnel_entry_id);
>>       free(tunnel_entry_id_old);
>>       smap_destroy(&options);
>> +    smap_destroy(&other_config);
>>   }
>>   
>>   static bool
>> @@ -403,7 +414,7 @@ chassis_tunnel_add(const struct sbrec_chassis 
>> *chassis_rec,
>>               }
>>               VLOG_DBG("tunnel_add: '%s', local ip: %s", chassis_rec->name,
>>                        this_chassis->encaps[j]->ip);
>> -            tunnel_add(tc, sbg, chassis_rec->name, chassis_rec->encaps[i],
>> +            tunnel_add(tc, sbg, chassis_rec, chassis_rec->encaps[i],
>>                          this_chassis->encaps[j]->ip, ovs_table);
>>               tuncnt++;
>>           }
>> diff --git a/controller/local_data.c b/controller/local_data.c
>> index dda746d73..5f1a52771 100644
>> --- a/controller/local_data.c
>> +++ b/controller/local_data.c
>> @@ -532,6 +532,8 @@ local_nonvif_data_run(const struct ovsrec_bridge *br_int,
>>                   tun->ofport = u16_to_ofp(ofport);
>>                   tun->type = tunnel_type;
>>                   tun->is_ipv6 = ip ? addr_is_ipv6(ip) : false;
>> +                tun->is_vtep_tunnel = 
>> smap_get_bool(&iface_rec->other_config,
>> +                                                    "is-vtep", false);
> Should we have a small public helper in encaps.[ch] (where the tunnel
> interface is actually created and this other_config value is set)?
>
>>   
>>                   free(hash_id);
>>                   free(ip);
>> diff --git a/controller/local_data.h b/controller/local_data.h
>> index 948c1a935..fdb81288c 100644
>> --- a/controller/local_data.h
>> +++ b/controller/local_data.h
>> @@ -146,6 +146,7 @@ struct chassis_tunnel {
>>       ofp_port_t ofport;
>>       enum chassis_tunnel_type type;
>>       bool is_ipv6;
>> +    bool is_vtep_tunnel;
> I think the terminology we agreed upon back when Ihar (in CC) added
> VXLAN support [0] (for ovn-chassis-to-ovn-chassis communication) was
> "ramp switch", should this be called "is_ramp_tunnel"?
>
> [0] https://github.com/ovn-org/ovn/commit/b07f1bc

Yes, thanks! i'll fix it. i'll also correct statement in commit message 
about vxlan encap,

I'll make it clearer, now it seems like any vlxan encapsulation has no 
problems with metadata swapping while icmp frag needed packet processing


>
>>   };
>>   
>>   /* Flow-based tunnel that consolidates multiple endpoints into a single
>> diff --git a/controller/physical.c b/controller/physical.c
>> index 228f3d171..4f1748c6e 100644
>> --- a/controller/physical.c
>> +++ b/controller/physical.c
>> @@ -351,30 +351,35 @@ put_flow_based_remote_port_redirect_overlay(
>>       }
>>   }
>>   
>> +/* Add handling for E/W ICMPv4/v6 packets when tunneled packets exceed
>> + * path MTU.
>> + * If packet needs to be tunneled to another node and the physical
>> + * interface used for tunneling has a lower MTU than the packet size,
>> + * or if there is a route exception with a smaller MTU, kernel
>> + * generates an ICMP "Fragmentation Needed" message, but the package
> Nit: s/package/packet/
>
>> + * metadata didn't change. Such packets might have been dropped due
>> + * to required metadata modifications for returned packet.
>> + *
>> + * Mark these packets with MLF_RX_FROM_TUNNEL_BIT for further
>> + * processing. Packets received from a VTEP tunnel should be passed
> Here too, probably "RAMP" is more appropriate.
>
>> + * through, and errors handled via the normal processing path, since
>> + * port metadata is not carried in VTEP packets in VNI.
>> + */
>>   static void
>> -add_tunnel_ingress_flows(const struct chassis_tunnel *tun,
>> -                         enum mf_field_id mff_ovn_geneve,
>> -                         struct ovn_desired_flow_table *flow_table,
>> -                         struct ofpbuf *ofpacts)
>> +add_tunnel_ingress_pmtud_flows(const struct chassis_tunnel *tun,
>> +                               struct ofpbuf *ofpacts,
>> +                               struct ovn_desired_flow_table *flow_table)
>>   {
>> -    /* Main ingress flow (priority 100) */
>> -    struct match match = MATCH_CATCHALL_INITIALIZER;
>> -    match_set_in_port(&match, tun->ofport);
>> -
>> -    ofpbuf_clear(ofpacts);
>> -    put_decapsulation(mff_ovn_geneve, tun, ofpacts);
>> -    put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts);
>> +    if (tun->is_vtep_tunnel) {
>> +        return;
>> +    }
>>   
>> -    ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 100, 0, &match,
>> -                    ofpacts, hc_uuid);
>> +    struct match match = MATCH_CATCHALL_INITIALIZER;
>>   
>>       /* Set allow rx from tunnel bit */
>>       put_load(1, MFF_LOG_FLAGS, MLF_RX_FROM_TUNNEL_BIT, 1, ofpacts);
>>       put_resubmit(OFTABLE_CT_ZONE_LOOKUP, ofpacts);
>>   
>> -    /* Add specific flows for E/W ICMPv{4,6} packets if tunnelled packets
>> -     * do not fit path MTU. */
>> -
>>       /* IPv4 ICMP flow (priority 120) */
>>       match_init_catchall(&match);
>>       match_set_in_port(&match, tun->ofport);
>> @@ -398,6 +403,26 @@ add_tunnel_ingress_flows(const struct chassis_tunnel 
>> *tun,
>>                       ofpacts, hc_uuid);
>>   }
>>   
>> +static void
>> +add_tunnel_ingress_flows(const struct chassis_tunnel *tun,
>> +                         enum mf_field_id mff_ovn_geneve,
>> +                         struct ovn_desired_flow_table *flow_table,
>> +                         struct ofpbuf *ofpacts)
>> +{
>> +    /* Main ingress flow (priority 100) */
>> +    struct match match = MATCH_CATCHALL_INITIALIZER;
>> +    match_set_in_port(&match, tun->ofport);
>> +
>> +    ofpbuf_clear(ofpacts);
>> +    put_decapsulation(mff_ovn_geneve, tun, ofpacts);
>> +    put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts);
>> +
>> +    ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 100, 0, &match,
>> +                    ofpacts, hc_uuid);
>> +
>> +    add_tunnel_ingress_pmtud_flows(tun, ofpacts, flow_table);
>> +}
>> +
>>   static void
>>   put_stack(enum mf_field_id field, struct ofpact_stack *stack)
>>   {
>> @@ -3943,7 +3968,7 @@ physical_run(struct physical_ctx *p_ctx,
>>       struct chassis_tunnel *tun;
>>       HMAP_FOR_EACH (tun, hmap_node, p_ctx->chassis_tunnels) {
>>           add_tunnel_ingress_flows(tun, p_ctx->mff_ovn_geneve, flow_table,
>> -                                &ofpacts);
>> +                                 &ofpacts);
> Nit: unrelated change.


I can drop it if you prefer, but I tend to fix small formatting 
inconsistencies when I’m already touching nearby code.

Thanks!

>>       }
>>   
>>       /* Process packets that arrive from flow-based tunnels. */
>> diff --git a/tests/ovn-controller-vtep.at b/tests/ovn-controller-vtep.at
>> index 961324bd2..caf53e291 100644
>> --- a/tests/ovn-controller-vtep.at
>> +++ b/tests/ovn-controller-vtep.at
>> @@ -775,6 +775,10 @@ AT_CHECK([ovs-ofctl dump-flows br-int 
>> table=OFTABLE_PHY_TO_LOG | grep 'priority=
>>   priority=110,tun_id=0x<>,in_port=<> 
>> actions=move:NXM_NX_TUN_ID[[0..23]]->OXM_OF_METADATA[[0..23]],load:0x<>->NXM_NX_REG14[[0..14]],load:0x<>->NXM_NX_REG10[[1]],resubmit(,OFTABLE_LOG_INGRESS_PIPELINE)
>>   ])
>>   
>> +# Skip processing ICMP "packet too big" errors in this table if the packet 
>> came from a VTEP tunnel.
>> +AT_CHECK([ovs-ofctl dump-flows br-int table=OFTABLE_PHY_TO_LOG | \
>> +          grep -E 'icmp_type=3,icmp_code=4|icmp_type=2,icmp_code=0'], [1], 
>> [])
>> +
>>   OVN_CONTROLLER_VTEP_STOP([], vtep1)
>>   OVN_CLEANUP([hv1])
>>   AT_CLEANUP
> Regards,
> Dumitru
>

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to