On 5/3/23 22:13, Ihar Hrachyshka wrote:
> When a multichassis port belongs to a switch with a localnet port,
> packets originating or directed to the multichassis port are NOT sent
> thorugh the localnet port. Instead, tunneling is enforced in-cluster to
> guarantee delivery of all packets to all chassis of the port.
> 
> This behavior has an unfortunate side effect, where - because of
> additional tunnel header added to each packet - the effective MTU of the
> path for multichassis ports changes from what's set as mtu_request. This
> effectively makes OVN to black hole all packets for the port that use
> full capacity of the interface MTU. This breaks usual TCP / UDP
> services, among other things (SSH, iperf sessions etc.)
> 
> This patch adds flows so that
> - (in table 38) detect too-big packets (table 38), and then
> - (in table 39) icmp fragmentation needed / too big errors are sent
>   back to offending port.
> 
> Once the error is received, the sender is expected to adjust the route
> MTU accordingly, sending the next packets with the new path MTU. After a
> multichassis port is re-assigned to a single chassis, the effective path
> MTU is restored to "usual". Peers will eventually see their "learned"
> path MTU cache expire, which will make them switch back to the "usual"
> MTU.
> 
> Among other scenarios, this patch helps to maintain existing services
> working during live migration of a VM, if multichassis ports are used.
> (E.g. in OpenStack Nueutron.)

I'm not necessarily rejecting this change.  I just wanted to bring up an
alternative approach (I'm not sure if it's possible to implement it though):

The CMS (e.g., Neutron) probably knows before hand the reduced MTU value
it should impose on the localnet port.  That means it might be possible
to just implement all this through logical flows that use
'check_pkt_larger()'.

In any case, the code in this patch looks OK to me overall.  I have a
few minor comments below.

> 
> Fixes: 7084cf437421 ("Always funnel multichassis port traffic through 
> tunnels")
> 
> Signed-off-by: Ihar Hrachyshka <ihrac...@redhat.com>
> ---
>  NEWS                        |   6 +
>  controller/ovn-controller.c |   3 +
>  controller/physical.c       | 293 +++++++++++++++++++++++++++++++++++-
>  controller/physical.h       |   1 +
>  lib/ovn-util.h              |  11 ++
>  tests/ovn.at                | 263 ++++++++++++++++++++++++++++++++
>  6 files changed, 572 insertions(+), 5 deletions(-)
> 
> diff --git a/NEWS b/NEWS
> index 60467581a..9d5eef268 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -14,6 +14,12 @@ Post v23.03.0
>      existing behaviour of flooding these arp requests to all attached Ports.
>    - Always allow IPv6 Router Discovery, Neighbor Discovery, and Multicast
>      Listener Discovery protocols, regardless of ACLs defined.
> +  - Send ICMP Fragmentation Needed packets back to offending ports when
> +    communicating with multichassis ports using frames that don't fit 
> through a
> +    tunnel. This is done only for logical switches that are attached to a
> +    physical network via a localnet port, in which case multichassis ports 
> may
> +    have an effective MTU different from regular ports and hence may need 
> this
> +    mechanism to maintain connectivity with other peers in the network.
>  
>  OVN v23.03.0 - 03 Mar 2023
>  --------------------------
> diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
> index c094cb74d..9359925fa 100644
> --- a/controller/ovn-controller.c
> +++ b/controller/ovn-controller.c
> @@ -4083,6 +4083,9 @@ static void init_physical_ctx(struct engine_node *node,
>      p_ctx->patch_ofports = &non_vif_data->patch_ofports;
>      p_ctx->chassis_tunnels = &non_vif_data->chassis_tunnels;
>  
> +    struct controller_engine_ctx *ctrl_ctx = 
> engine_get_context()->client_ctx;
> +    p_ctx->if_mgr = ctrl_ctx->if_mgr;
> +
>      pflow_output_get_debug(node, &p_ctx->debug);
>  }
>  
> diff --git a/controller/physical.c b/controller/physical.c
> index 1b0482e3b..a2a25d067 100644
> --- a/controller/physical.c
> +++ b/controller/physical.c
> @@ -41,6 +41,7 @@
>  #include "lib/ovn-sb-idl.h"
>  #include "lib/ovn-util.h"
>  #include "ovn/actions.h"
> +#include "if-status.h"
>  #include "physical.h"
>  #include "pinctrl.h"
>  #include "openvswitch/shash.h"
> @@ -91,6 +92,7 @@ physical_register_ovs_idl(struct ovsdb_idl *ovs_idl)
>  
>      ovsdb_idl_add_table(ovs_idl, &ovsrec_table_interface);
>      ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_name);
> +    ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_mtu);
>      ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_ofport);
>      ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_external_ids);
>  }
> @@ -1104,6 +1106,273 @@ setup_activation_strategy(const struct 
> sbrec_port_binding *binding,
>      }
>  }
>  
> +static size_t
> +encode_start_controller_op(enum action_opcode opcode, bool pause,
> +                           uint32_t meter_id, struct ofpbuf *ofpacts)
> +{
> +    size_t ofs = ofpacts->size;
> +
> +    struct ofpact_controller *oc = ofpact_put_CONTROLLER(ofpacts);
> +    oc->max_len = UINT16_MAX;
> +    oc->reason = OFPR_ACTION;
> +    oc->pause = pause;
> +    if (!ovs_feature_is_supported(OVS_DP_METER_SUPPORT)) {
> +        meter_id = NX_CTLR_NO_METER;
> +    }
> +    oc->meter_id = meter_id;
> +
> +    struct action_header ah = { .opcode = htonl(opcode) };
> +    ofpbuf_put(ofpacts, &ah, sizeof ah);
> +
> +    return ofs;
> +}
> +
> +static void
> +encode_finish_controller_op(size_t ofs, struct ofpbuf *ofpacts)
> +{
> +    struct ofpact_controller *oc = ofpbuf_at_assert(ofpacts, ofs, sizeof 
> *oc);
> +    ofpacts->header = oc;
> +    oc->userdata_len = ofpacts->size - (ofs + sizeof *oc);
> +    ofpact_finish_CONTROLLER(ofpacts, &oc);
> +}
> +

I think Mark already pointed out in v1 that these should just be exposed
as public functions in actions.h.

> +/*
> + * Insert a flow to determine if an IP packet is too big for the 
> corresponding
> + * egress interface.
> + */
> +static void
> +determine_if_pkt_too_big(struct ovn_desired_flow_table *flow_table,
> +                         const struct sbrec_port_binding *binding,
> +                         const struct sbrec_port_binding *mcp,
> +                         uint16_t mtu, bool is_ipv6, int direction)
> +{
> +    struct ofpbuf ofpacts;
> +    ofpbuf_init(&ofpacts, 0);
> +
> +    /* Store packet too large flag in reg9[1]. */
> +    struct match match;
> +    match_init_catchall(&match);
> +    match_set_dl_type(&match, htons(is_ipv6 ? ETH_TYPE_IPV6 : ETH_TYPE_IP));
> +    match_set_metadata(&match, htonll(binding->datapath->tunnel_key));
> +    match_set_reg(&match, direction - MFF_REG0, mcp->tunnel_key);
> +
> +    /* reg9[1] is REGBIT_PKT_LARGER as defined by northd */

I think I'd add a comment to northd.c too.  Maybe where we define
REGBIT_PKT_LARGER.

> +    struct ofpact_check_pkt_larger *pkt_larger =
> +        ofpact_put_CHECK_PKT_LARGER(&ofpacts);
> +    pkt_larger->pkt_len = mtu;
> +    pkt_larger->dst.field = mf_from_id(MFF_REG9);
> +    pkt_larger->dst.ofs = 1;
> +
> +    put_resubmit(OFTABLE_OUTPUT_LARGE_PKT_PROCESS, &ofpacts);
> +    ofctrl_add_flow(flow_table, OFTABLE_OUTPUT_LARGE_PKT_DETECT, 100,
> +                    binding->header_.uuid.parts[0], &match, &ofpacts,
> +                    &binding->header_.uuid);
> +    ofpbuf_uninit(&ofpacts);
> +}
> +
> +/*
> + * Insert a flow to reply with ICMP error for IP packets that are too big for
> + * the corresponding egress interface.
> + */
> +/*
> + * NOTE(ihrachys) This reimplements icmp_error as found in
> + * build_icmperr_pkt_big_flows. We may look into reusing the existing OVN
> + * action for this flow in the future.
> + */
> +static void
> +reply_imcp_error_if_pkt_too_big(struct ovn_desired_flow_table *flow_table,
> +                                const struct sbrec_port_binding *binding,
> +                                const struct sbrec_port_binding *mcp,
> +                                uint16_t mtu, bool is_ipv6, int direction)
> +{

Note: this function together with determine_if_pkt_too_big() would be
implemented as logical flows if we could go for the alternative approach
I suggested in the beginning of the email.

> +    struct match match;
> +    match_init_catchall(&match);
> +    match_set_dl_type(&match, htons(is_ipv6 ? ETH_TYPE_IPV6 : ETH_TYPE_IP));
> +    match_set_metadata(&match, htonll(binding->datapath->tunnel_key));
> +    match_set_reg(&match, direction - MFF_REG0, mcp->tunnel_key);
> +    match_set_reg_masked(&match, MFF_REG9 - MFF_REG0, 1 << 1, 1 << 1);
> +
> +    /* Return ICMP error with a part of the original IP packet included. */
> +    struct ofpbuf ofpacts;
> +    ofpbuf_init(&ofpacts, 0);
> +    size_t oc_offset = encode_start_controller_op(
> +        ACTION_OPCODE_ICMP, true, NX_CTLR_NO_METER, &ofpacts);
> +
> +    struct ofpbuf inner_ofpacts;
> +    ofpbuf_init(&inner_ofpacts, 0);
> +
> +    /* The error packet is no longer too large, set REGBIT_PKT_LARGER = 0 */
> +    /* reg9[1] is REGBIT_PKT_LARGER as defined by northd */
> +    ovs_be32 value = htonl(0);
> +    ovs_be32 mask = htonl(1 << 1);
> +    ofpact_put_set_field(
> +        &inner_ofpacts, mf_from_id(MFF_REG9), &value, &mask);
> +
> +    /* The new error packet is delivered locally */
> +    /* REGBIT_EGRESS_LOOPBACK = 1 */
> +    value = htonl(1 << MLF_ALLOW_LOOPBACK_BIT);
> +    mask = htonl(1 << MLF_ALLOW_LOOPBACK_BIT);
> +    ofpact_put_set_field(
> +        &inner_ofpacts, mf_from_id(MFF_LOG_FLAGS), &value, &mask);
> +
> +    /* eth.src <-> eth.dst */
> +    put_stack(MFF_ETH_DST, ofpact_put_STACK_PUSH(&inner_ofpacts));
> +    put_stack(MFF_ETH_SRC, ofpact_put_STACK_PUSH(&inner_ofpacts));
> +    put_stack(MFF_ETH_DST, ofpact_put_STACK_POP(&inner_ofpacts));
> +    put_stack(MFF_ETH_SRC, ofpact_put_STACK_POP(&inner_ofpacts));
> +
> +    /* ip.src <-> ip.dst */
> +    put_stack(is_ipv6 ? MFF_IPV6_DST : MFF_IPV4_DST,
> +        ofpact_put_STACK_PUSH(&inner_ofpacts));
> +    put_stack(is_ipv6 ? MFF_IPV6_SRC : MFF_IPV4_SRC,
> +        ofpact_put_STACK_PUSH(&inner_ofpacts));
> +    put_stack(is_ipv6 ? MFF_IPV6_DST : MFF_IPV4_DST,
> +        ofpact_put_STACK_POP(&inner_ofpacts));
> +    put_stack(is_ipv6 ? MFF_IPV6_SRC : MFF_IPV4_SRC,
> +        ofpact_put_STACK_POP(&inner_ofpacts));
> +
> +    /* ip.ttl = 255 */
> +    struct ofpact_ip_ttl *ip_ttl = ofpact_put_SET_IP_TTL(&inner_ofpacts);
> +    ip_ttl->ttl = 255;
> +
> +    uint16_t frag_mtu = mtu - ETHERNET_OVERHEAD;
> +    size_t frag_mtu_oc_offset;
> +    if (is_ipv6) {
> +        /* icmp6.type = 2 (Packet Too Big) */
> +        /* icmp6.code = 0 */
> +        uint8_t icmp_type = 2;
> +        uint8_t icmp_code = 0;
> +        ofpact_put_set_field(
> +            &inner_ofpacts, mf_from_id(MFF_ICMPV6_TYPE), &icmp_type, NULL);
> +        ofpact_put_set_field(
> +            &inner_ofpacts, mf_from_id(MFF_ICMPV6_CODE), &icmp_code, NULL);
> +
> +        /* icmp6.frag_mtu */
> +        frag_mtu_oc_offset = encode_start_controller_op(
> +            ACTION_OPCODE_PUT_ICMP6_FRAG_MTU, true, NX_CTLR_NO_METER,
> +            &inner_ofpacts);
> +        ovs_be32 frag_mtu_ovs = htonl(frag_mtu);
> +        ofpbuf_put(&inner_ofpacts, &frag_mtu_ovs, sizeof(frag_mtu_ovs));
> +    } else {
> +        /* icmp4.type = 3 (Destination Unreachable) */
> +        /* icmp4.code = 4 (Fragmentation Needed) */
> +        uint8_t icmp_type = 3;
> +        uint8_t icmp_code = 4;
> +        ofpact_put_set_field(
> +            &inner_ofpacts, mf_from_id(MFF_ICMPV4_TYPE), &icmp_type, NULL);
> +        ofpact_put_set_field(
> +            &inner_ofpacts, mf_from_id(MFF_ICMPV4_CODE), &icmp_code, NULL);
> +
> +        /* icmp4.frag_mtu = */
> +        frag_mtu_oc_offset = encode_start_controller_op(
> +            ACTION_OPCODE_PUT_ICMP4_FRAG_MTU, true, NX_CTLR_NO_METER,
> +            &inner_ofpacts);
> +        ovs_be16 frag_mtu_ovs = htons(frag_mtu);
> +        ofpbuf_put(&inner_ofpacts, &frag_mtu_ovs, sizeof(frag_mtu_ovs));
> +    }
> +    encode_finish_controller_op(frag_mtu_oc_offset, &inner_ofpacts);
> +
> +    /* Finally, submit the ICMP error back to the ingress pipeline */
> +    put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, &inner_ofpacts);
> +
> +    /* Attach nested actions to ICMP error controller handler */
> +    ofpacts_put_openflow_actions(inner_ofpacts.data, inner_ofpacts.size,
> +                                 &ofpacts, OFP15_VERSION);
> +
> +    /* Finalize the ICMP error controller handler */
> +    encode_finish_controller_op(oc_offset, &ofpacts);
> +
> +    ofctrl_add_flow(flow_table, OFTABLE_OUTPUT_LARGE_PKT_PROCESS, 100,
> +                    binding->header_.uuid.parts[0], &match, &ofpacts,
> +                    &binding->header_.uuid);
> +
> +    ofpbuf_uninit(&inner_ofpacts);
> +    ofpbuf_uninit(&ofpacts);
> +}
> +
> +static uint16_t
> +get_tunnel_overhead(struct chassis_tunnel const *tun)
> +{
> +    uint16_t overhead = 0;
> +    enum chassis_tunnel_type type = tun->type;
> +    if (type == GENEVE) {
> +        overhead += GENEVE_TUNNEL_OVERHEAD;
> +    } else if (type == STT) {
> +        overhead += STT_TUNNEL_OVERHEAD;
> +    } else if (type == VXLAN) {
> +        overhead += VXLAN_TUNNEL_OVERHEAD;
> +    } else {
> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
> +        VLOG_WARN_RL(&rl, "Unknown tunnel type %d, can't determine overhead "
> +                          "size for Path MTU Discovery", type);
> +        return 0;
> +    }
> +    overhead += tun->is_ipv6? IPV6_HEADER_LEN : IPV4_HEADER_LEN;
> +    return overhead;
> +}
> +
> +static uint16_t
> +get_effective_mtu(const struct sbrec_port_binding *mcp,
> +                  struct ovs_list *remote_tunnels,
> +                  const struct if_status_mgr *if_mgr)
> +{
> +    /* Use interface MTU as a base for calculation */
> +    uint16_t iface_mtu = if_status_mgr_iface_get_mtu(if_mgr,
> +                                                     mcp->logical_port);
> +    if (!iface_mtu) {
> +        return 0;
> +    }
> +
> +    /* Iterate over all peer tunnels and find the biggest tunnel overhead */
> +    uint16_t overhead = 0;
> +    struct tunnel *tun;
> +    LIST_FOR_EACH (tun, list_node, remote_tunnels) {
> +        uint16_t tunnel_overhead = get_tunnel_overhead(tun->tun);
> +        if (tunnel_overhead > overhead) {
> +            overhead = tunnel_overhead;
> +        }

Nit:

overhead = MAX(overhead, get_tunnel_overhead(tun->tun));

Thanks,
Dumitru

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to