Commit [1] added handling for E/W ICMPv4/v6 "fragmentation needed"
packets generated for overlay tunneled traffic. This was required
because the kernel generates an ICMP "need frag" packet when a tunneled
packet exceeds the path MTU, and such packets were previously
dropped because their metadata needed to be swapped.

However, it did not cover the case where similar ICMP packets arrive
from RAMP tunnels. Such packets are not subject to the same metadata
handling constraints, since VXLAN encapsulation in this case does not
encode the port in the VNI field and packets are delivered directly
to the destination VIF MAC address.

As a result, they do not match the rules added for such packets and are
dropped. Exclude packets coming from RAMP (VTEP) tunnels from this special
handling.

[1] 
https://github.com/ovn-org/ovn/commit/221476a01f2670cf4eb78cd9353e709cb8a16329
Fixes: 221476a01f26 ("ovn: Add tunnel PMTUD support.")
Signed-off-by: Alexandra Rukomoinikova <[email protected]>
 ---
  v2 --> v3: removed ACK since I changed the code
             added helper function for all is-vtep lookups
             renamed is_vtep to is_ramp
  v1 --> v2: added ACK by
             renamed add_tunnel_ingress_icmp_need_frag_flow func to
             add_tunnel_ingress_pmtud_flows
             fixed Lorenzo's comments
---
 controller/encaps.c          | 29 +++++++++-----
 controller/encaps.h          |  7 ++++
 controller/local_data.c      |  1 +
 controller/local_data.h      |  1 +
 controller/physical.c        | 77 ++++++++++++++++++++++--------------
 tests/ovn-controller-vtep.at |  4 ++
 6 files changed, 80 insertions(+), 39 deletions(-)

diff --git a/controller/encaps.c b/controller/encaps.c
index 61f41bf3a..919eea432 100644
--- a/controller/encaps.c
+++ b/controller/encaps.c
@@ -25,7 +25,6 @@
 #include "lib/ovn-sb-idl.h"
 #include "lib/ovsdb-idl.h"
 #include "ovn-controller.h"
-#include "smap.h"
 
 VLOG_DEFINE_THIS_MODULE(encaps);
 
@@ -44,6 +43,7 @@ encaps_register_ovs_idl(struct ovsdb_idl *ovs_idl)
     ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_name);
     ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_type);
     ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_options);
+    ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_other_config);
 }
 
 /* Enough context to create a new tunnel, using tunnel_add(). */
@@ -201,12 +201,14 @@ out:
 }
 
 static void
-tunnel_add(struct tunnel_ctx *tc, const struct sbrec_sb_global *sbg,
-           const char *new_chassis_id, const struct sbrec_encap *encap,
-           const char *local_ip,
+tunnel_add(struct tunnel_ctx *tc,
+           const struct sbrec_sb_global *sbg,
+           const struct sbrec_chassis *chassis_rec,
+           const struct sbrec_encap *encap, const char *local_ip,
            const struct ovsrec_open_vswitch_table *ovs_table)
 {
     struct smap options = SMAP_INITIALIZER(&options);
+    struct smap other_config = SMAP_INITIALIZER(&other_config);
     smap_add(&options, "remote_ip", encap->ip);
     smap_add(&options, "local_ip", local_ip);
     smap_add(&options, "key", "flow");
@@ -221,9 +223,9 @@ tunnel_add(struct tunnel_ctx *tc, const struct 
sbrec_sb_global *sbg,
      * combination of the chassis_name and the remote and local encap-ips to
      * identify a specific tunnel to the remote chassis.
      */
-    tunnel_entry_id = encaps_tunnel_id_create(new_chassis_id, encap->ip,
+    tunnel_entry_id = encaps_tunnel_id_create(chassis_rec->name, encap->ip,
                                               local_ip);
-    tunnel_entry_id_old = encaps_tunnel_id_create_legacy(new_chassis_id,
+    tunnel_entry_id_old = encaps_tunnel_id_create_legacy(chassis_rec->name,
                                                          encap->ip);
     if (csum && (!strcmp(csum, "true") || !strcmp(csum, "false"))) {
         smap_add(&options, "csum", csum);
@@ -258,7 +260,7 @@ tunnel_add(struct tunnel_ctx *tc, const struct 
sbrec_sb_global *sbg,
 
     /* Add auth info if ipsec is enabled. */
     if (sbg->ipsec) {
-        smap_add(&options, "remote_name", new_chassis_id);
+        smap_add(&options, "remote_name", chassis_rec->name);
 
         /* Force NAT-T traversal via configuration */
         /* Two ipsec backends are supported: libreswan and strongswan */
@@ -276,6 +278,11 @@ tunnel_add(struct tunnel_ctx *tc, const struct 
sbrec_sb_global *sbg,
         }
     }
 
+    if (is_ramp_tunnel(&chassis_rec->other_config)) {
+        /* Propagate the RAMP switch flag from chassis to interface. */
+        smap_add(&other_config, "is-vtep", "true");
+    }
+
     /* If there's an existing tunnel record that does not need any change,
      * keep it.  Otherwise, create a new record (if there was an existing
      * record, the new record will supplant it and encaps_run() will delete
@@ -312,10 +319,10 @@ tunnel_add(struct tunnel_ctx *tc, const struct 
sbrec_sb_global *sbg,
      * its name, otherwise generate a new, unique name. */
     char *port_name = (tunnel
                        ? xstrdup(tunnel->port->name)
-                       : tunnel_create_name(tc, new_chassis_id));
+                       : tunnel_create_name(tc, chassis_rec->name));
     if (!port_name) {
         VLOG_WARN("Unable to allocate unique name for '%s' tunnel",
-                  new_chassis_id);
+                  chassis_rec->name);
         goto exit;
     }
 
@@ -323,6 +330,7 @@ tunnel_add(struct tunnel_ctx *tc, const struct 
sbrec_sb_global *sbg,
     ovsrec_interface_set_name(iface, port_name);
     ovsrec_interface_set_type(iface, encap->type);
     ovsrec_interface_set_options(iface, &options);
+    ovsrec_interface_set_other_config(iface, &other_config);
 
     struct ovsrec_port *port = ovsrec_port_insert(tc->ovs_txn);
     ovsrec_port_set_name(port, port_name);
@@ -338,6 +346,7 @@ exit:
     free(tunnel_entry_id);
     free(tunnel_entry_id_old);
     smap_destroy(&options);
+    smap_destroy(&other_config);
 }
 
 static bool
@@ -403,7 +412,7 @@ chassis_tunnel_add(const struct sbrec_chassis *chassis_rec,
             }
             VLOG_DBG("tunnel_add: '%s', local ip: %s", chassis_rec->name,
                      this_chassis->encaps[j]->ip);
-            tunnel_add(tc, sbg, chassis_rec->name, chassis_rec->encaps[i],
+            tunnel_add(tc, sbg, chassis_rec, chassis_rec->encaps[i],
                        this_chassis->encaps[j]->ip, ovs_table);
             tuncnt++;
         }
diff --git a/controller/encaps.h b/controller/encaps.h
index fa5dc17e5..0257d08c1 100644
--- a/controller/encaps.h
+++ b/controller/encaps.h
@@ -17,6 +17,7 @@
 #define OVN_ENCAPS_H 1
 
 #include <stdbool.h>
+#include "smap.h"
 
 /*
  * Given there could be multiple tunnels with different IPs to the same
@@ -68,4 +69,10 @@ bool  encaps_tunnel_id_match(const char *tunnel_id, const 
char *chassis_id,
 
 void encaps_destroy(void);
 
+static inline bool
+is_ramp_tunnel(const struct smap *other_config)
+{
+    return smap_get_bool(other_config, "is-vtep", false);
+}
+
 #endif /* controller/encaps.h */
diff --git a/controller/local_data.c b/controller/local_data.c
index dda746d73..af6c75b40 100644
--- a/controller/local_data.c
+++ b/controller/local_data.c
@@ -532,6 +532,7 @@ local_nonvif_data_run(const struct ovsrec_bridge *br_int,
                 tun->ofport = u16_to_ofp(ofport);
                 tun->type = tunnel_type;
                 tun->is_ipv6 = ip ? addr_is_ipv6(ip) : false;
+                tun->is_ramp_tunnel = is_ramp_tunnel(&iface_rec->other_config);
 
                 free(hash_id);
                 free(ip);
diff --git a/controller/local_data.h b/controller/local_data.h
index 948c1a935..cbb8899eb 100644
--- a/controller/local_data.h
+++ b/controller/local_data.h
@@ -146,6 +146,7 @@ struct chassis_tunnel {
     ofp_port_t ofport;
     enum chassis_tunnel_type type;
     bool is_ipv6;
+    bool is_ramp_tunnel;
 };
 
 /* Flow-based tunnel that consolidates multiple endpoints into a single
diff --git a/controller/physical.c b/controller/physical.c
index 228f3d171..30ebeb1b6 100644
--- a/controller/physical.c
+++ b/controller/physical.c
@@ -351,30 +351,35 @@ put_flow_based_remote_port_redirect_overlay(
     }
 }
 
+/* Add handling for E/W ICMPv4/v6 packets when tunneled packets exceed
+ * the path MTU.
+ * If a packet needs to be tunneled to another node and the physical
+ * interface used for tunneling has a lower MTU than the packet size,
+ * or if there is a route exception with a smaller MTU, the kernel
+ * generates an ICMP "Fragmentation Needed" message, but the packet
+ * metadata is not changed. Such packets might be dropped because the
+ * returned packet requires metadata modifications.
+ *
+ * Mark these packets with MLF_RX_FROM_TUNNEL_BIT for further
+ * processing. Packets received from a RAMP tunnel should be passed
+ * through, and errors handled via the normal processing path, since
+ * port metadata is not carried in the VNI of RAMP packets.
+ */
 static void
-add_tunnel_ingress_flows(const struct chassis_tunnel *tun,
-                         enum mf_field_id mff_ovn_geneve,
-                         struct ovn_desired_flow_table *flow_table,
-                         struct ofpbuf *ofpacts)
+add_tunnel_ingress_pmtud_flows(const struct chassis_tunnel *tun,
+                               struct ofpbuf *ofpacts,
+                               struct ovn_desired_flow_table *flow_table)
 {
-    /* Main ingress flow (priority 100) */
-    struct match match = MATCH_CATCHALL_INITIALIZER;
-    match_set_in_port(&match, tun->ofport);
-
-    ofpbuf_clear(ofpacts);
-    put_decapsulation(mff_ovn_geneve, tun, ofpacts);
-    put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts);
+    if (tun->is_ramp_tunnel) {
+        return;
+    }
 
-    ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 100, 0, &match,
-                    ofpacts, hc_uuid);
+    struct match match = MATCH_CATCHALL_INITIALIZER;
 
     /* Set allow rx from tunnel bit */
     put_load(1, MFF_LOG_FLAGS, MLF_RX_FROM_TUNNEL_BIT, 1, ofpacts);
     put_resubmit(OFTABLE_CT_ZONE_LOOKUP, ofpacts);
 
-    /* Add specific flows for E/W ICMPv{4,6} packets if tunnelled packets
-     * do not fit path MTU. */
-
     /* IPv4 ICMP flow (priority 120) */
     match_init_catchall(&match);
     match_set_in_port(&match, tun->ofport);
@@ -398,6 +403,26 @@ add_tunnel_ingress_flows(const struct chassis_tunnel *tun,
                     ofpacts, hc_uuid);
 }
 
+static void
+add_tunnel_ingress_flows(const struct chassis_tunnel *tun,
+                         enum mf_field_id mff_ovn_geneve,
+                         struct ovn_desired_flow_table *flow_table,
+                         struct ofpbuf *ofpacts)
+{
+    /* Main ingress flow (priority 100) */
+    struct match match = MATCH_CATCHALL_INITIALIZER;
+    match_set_in_port(&match, tun->ofport);
+
+    ofpbuf_clear(ofpacts);
+    put_decapsulation(mff_ovn_geneve, tun, ofpacts);
+    put_resubmit(OFTABLE_LOCAL_OUTPUT, ofpacts);
+
+    ofctrl_add_flow(flow_table, OFTABLE_PHY_TO_LOG, 100, 0, &match,
+                    ofpacts, hc_uuid);
+
+    add_tunnel_ingress_pmtud_flows(tun, ofpacts, flow_table);
+}
+
 static void
 put_stack(enum mf_field_id field, struct ofpact_stack *stack)
 {
@@ -2827,12 +2852,6 @@ fanout_to_chassis_port_based(enum mf_field_id 
mff_ovn_geneve,
     }
 }
 
-static bool
-chassis_is_vtep(const struct sbrec_chassis *chassis)
-{
-    return smap_get_bool(&chassis->other_config, "is-vtep", false);
-}
-
 static void
 local_output_pb(int64_t tunnel_key, struct ofpbuf *ofpacts)
 {
@@ -3011,19 +3030,19 @@ consider_mc_group(const struct physical_ctx *ctx,
              * otherwise multicast will reach remote ports through localnet
              * port. */
             if (port->chassis) {
-                if (chassis_is_vtep(port->chassis)) {
+                if (is_ramp_tunnel(&port->chassis->other_config)) {
                     sset_add(&vtep_chassis, port->chassis->name);
                 } else {
                     sset_add(&remote_chassis, port->chassis->name);
                 }
             }
             for (size_t j = 0; j < port->n_additional_chassis; j++) {
-                if (chassis_is_vtep(port->additional_chassis[j])) {
-                    sset_add(&vtep_chassis,
-                             port->additional_chassis[j]->name);
+                struct sbrec_chassis *additional_chassis =
+                    port->additional_chassis[j];
+                if (is_ramp_tunnel(&additional_chassis->other_config)) {
+                    sset_add(&vtep_chassis, additional_chassis->name);
                 } else {
-                    sset_add(&remote_chassis,
-                             port->additional_chassis[j]->name);
+                    sset_add(&remote_chassis, additional_chassis->name);
                 }
             }
         }
@@ -3943,7 +3962,7 @@ physical_run(struct physical_ctx *p_ctx,
     struct chassis_tunnel *tun;
     HMAP_FOR_EACH (tun, hmap_node, p_ctx->chassis_tunnels) {
         add_tunnel_ingress_flows(tun, p_ctx->mff_ovn_geneve, flow_table,
-                                &ofpacts);
+                                 &ofpacts);
     }
 
     /* Process packets that arrive from flow-based tunnels. */
diff --git a/tests/ovn-controller-vtep.at b/tests/ovn-controller-vtep.at
index 961324bd2..caf53e291 100644
--- a/tests/ovn-controller-vtep.at
+++ b/tests/ovn-controller-vtep.at
@@ -775,6 +775,10 @@ AT_CHECK([ovs-ofctl dump-flows br-int 
table=OFTABLE_PHY_TO_LOG | grep 'priority=
 priority=110,tun_id=0x<>,in_port=<> 
actions=move:NXM_NX_TUN_ID[[0..23]]->OXM_OF_METADATA[[0..23]],load:0x<>->NXM_NX_REG14[[0..14]],load:0x<>->NXM_NX_REG10[[1]],resubmit(,OFTABLE_LOG_INGRESS_PIPELINE)
 ])
 
+# ICMP "packet too big" errors from a VTEP tunnel must not be handled in
+# this table: check that no such flows are installed.
+AT_CHECK([ovs-ofctl dump-flows br-int table=OFTABLE_PHY_TO_LOG | \
+          grep -E 'icmp_type=3,icmp_code=4|icmp_type=2,icmp_code=0'], [1], [])
+
 OVN_CONTROLLER_VTEP_STOP([], vtep1)
 OVN_CLEANUP([hv1])
 AT_CLEANUP
-- 
2.48.1

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to