On 9/7/23 15:59, Ales Musil wrote: > On Fri, Sep 1, 2023 at 12:56 PM Dumitru Ceara <dce...@redhat.com> wrote: > >> Commit 506f7d4bcfbc ("northd: rely on new actions for ecmp-symmetric >> routing") relied on commit_ecmp_nh() to learn the mapping between a >> traffic session's 5-tuple and the MAC of the next-hop to be used for >> that session. This logical action translates to OVS learn() action. >> >> While in theory this is the most correct way of tracking the mapping >> between ECMP symmetric reply sessions and their next hop's MAC address, >> it also creates additional load in ovs-vswitchd (to learn the new flows) >> and introduces latency and affects traffic throughput. >> >> An alternative is to instead re-commit the 5-tuple (along with the >> next-hop MAC and port information) to conntrack every time traffic >> received from the next-hop side is forwarded by the OVN router. >> >> Testing shows that in a scenario with 4 next-hops and ECMP symmetric >> replies enabled with traffic running for 600 seconds latency, throughput >> and ovs-vswitchd CPU usage are significantly better with this change: >> >> - Before: >> - ovs-vswitchd ~1200% CPU (distributed across 17 revalidator threads) >> - Sent: 638.72MiB, 1.06MiB/s >> - Recv: 7.17GiB, 12.24MiB/s >> >> - After: >> - ovs-vswitchd ~7% CPU (distributed across 17 revalidator threads) >> - Sent: 892.69MiB, 1.49MiB/s >> - Recv: 8.63GiB, 14.72MiB/s >> >> The only downside of not using learn() flows is that OVN cannot >> determine on its own when a next-hop goes away (without using BFD). >> This scenario however can probably be handled by the CMS which has more >> knowledge about the rest of the network (outside OVN), e.g., >> ovn-kubernetes currently flushes conntrack entries created for ECMP >> symmetric reply sessions when it detects that the next-hop went away. >> >> If a next-hops changes MAC address that's handled by OVN gracefully and >> the conntrack entry corresponding to that session gets updated >> accordingly. >> >> NOTE: we don't remove the logical actions' implementation as >> ovn-controller needs to be able to translate logical flows generated by >> older versions of ovn-northd. >> >> Fixes: 506f7d4bcfbc ("northd: rely on new actions for ecmp-symmetric >> routing") >> Signed-off-by: Dumitru Ceara <dce...@redhat.com> >> --- >> northd/northd.c | 52 ++++++++++++++++++--------------------------- >> tests/ovn-northd.at | 20 ++++++----------- >> 2 files changed, 27 insertions(+), 45 deletions(-) >> >> diff --git a/northd/northd.c b/northd/northd.c >> index 459aaafb1c..e67d34cd28 100644 >> --- a/northd/northd.c >> +++ b/northd/northd.c >> @@ -262,7 +262,6 @@ enum ovn_stage { >> #define REGBIT_LOOKUP_NEIGHBOR_RESULT "reg9[2]" >> #define REGBIT_LOOKUP_NEIGHBOR_IP_RESULT "reg9[3]" >> #define REGBIT_DST_NAT_IP_LOCAL "reg9[4]" >> -#define REGBIT_KNOWN_ECMP_NH "reg9[5]" >> #define REGBIT_KNOWN_LB_SESSION "reg9[6]" >> >> /* Register to store the eth address associated to a router port for >> packets >> @@ -370,8 +369,7 @@ enum ovn_stage { >> * | | EGRESS_LOOPBACK/ | G | UNUSED | >> * | R9 | PKT_LARGER/ | 4 | | >> * | | LOOKUP_NEIGHBOR_RESULT/ | | | >> - * | | SKIP_LOOKUP_NEIGHBOR/ | | | >> - * | | KNOWN_ECMP_NH} | | | >> + * | | SKIP_LOOKUP_NEIGHBOR} | | | >> * | | | | | >> * | | REG_ORIG_TP_DPORT_ROUTER | | | >> * | | | | | >> @@ -10846,15 +10844,13 @@ add_ecmp_symmetric_reply_flows(struct hmap >> *lflows, >> cidr); >> free(cidr); >> ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100, >> - ds_cstr(&base_match), >> - REGBIT_KNOWN_ECMP_NH" = chk_ecmp_nh_mac(); ct_next;", >> - &st_route->header_); >> + ds_cstr(&base_match), "ct_next;", >> + &st_route->header_); >> >> /* And packets that go out over an ECMP route need conntrack */ >> ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100, >> - ds_cstr(route_match), >> - REGBIT_KNOWN_ECMP_NH" = chk_ecmp_nh(); ct_next;", >> - &st_route->header_); >> + ds_cstr(route_match), "ct_next;", >> + &st_route->header_); >> >> /* Save src eth and inport in ct_label for packets that arrive over >> * an ECMP route. >> @@ -10867,9 +10863,8 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows, >> ds_put_format(&actions, >> "ct_commit { ct_label.ecmp_reply_eth = eth.src; " >> " %s = %" PRId64 ";}; " >> - "commit_ecmp_nh(ipv6 = %s, proto = tcp); next;", >> - ct_ecmp_reply_port_match, out_port->sb->tunnel_key, >> - IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true"); >> + "next;", >> + ct_ecmp_reply_port_match, out_port->sb->tunnel_key); >> ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100, >> ds_cstr(&match), ds_cstr(&actions), >> &st_route->header_); >> @@ -10880,9 +10875,8 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows, >> ds_put_format(&actions, >> "ct_commit { ct_label.ecmp_reply_eth = eth.src; " >> " %s = %" PRId64 ";}; " >> - "commit_ecmp_nh(ipv6 = %s, proto = udp); next;", >> - ct_ecmp_reply_port_match, out_port->sb->tunnel_key, >> - IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true"); >> + "next;", >> + ct_ecmp_reply_port_match, out_port->sb->tunnel_key); >> ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100, >> ds_cstr(&match), ds_cstr(&actions), >> &st_route->header_); >> @@ -10893,53 +10887,49 @@ add_ecmp_symmetric_reply_flows(struct hmap >> *lflows, >> ds_put_format(&actions, >> "ct_commit { ct_label.ecmp_reply_eth = eth.src; " >> " %s = %" PRId64 ";}; " >> - "commit_ecmp_nh(ipv6 = %s, proto = sctp); next;", >> - ct_ecmp_reply_port_match, out_port->sb->tunnel_key, >> - IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true"); >> + "next;", >> + ct_ecmp_reply_port_match, out_port->sb->tunnel_key); >> ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100, >> ds_cstr(&match), ds_cstr(&actions), >> &st_route->header_); >> >> ds_clear(&match); >> ds_put_format(&match, >> - "%s && (!ct.rpl && ct.est) && tcp && "REGBIT_KNOWN_ECMP_NH" >> == 0", >> + "%s && (!ct.rpl && ct.est) && tcp", >> ds_cstr(&base_match)); >> ds_clear(&actions); >> ds_put_format(&actions, >> "ct_commit { ct_label.ecmp_reply_eth = eth.src; " >> " %s = %" PRId64 ";}; " >> - "commit_ecmp_nh(ipv6 = %s, proto = tcp); next;", >> - ct_ecmp_reply_port_match, out_port->sb->tunnel_key, >> - IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true"); >> + "next;", >> + ct_ecmp_reply_port_match, out_port->sb->tunnel_key); >> ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100, >> ds_cstr(&match), ds_cstr(&actions), >> &st_route->header_); >> >> ds_clear(&match); >> ds_put_format(&match, >> - "%s && (!ct.rpl && ct.est) && udp && "REGBIT_KNOWN_ECMP_NH" >> == 0", >> + "%s && (!ct.rpl && ct.est) && udp", >> ds_cstr(&base_match)); >> ds_clear(&actions); >> ds_put_format(&actions, >> "ct_commit { ct_label.ecmp_reply_eth = eth.src; " >> " %s = %" PRId64 ";}; " >> - "commit_ecmp_nh(ipv6 = %s, proto = udp); next;", >> - ct_ecmp_reply_port_match, out_port->sb->tunnel_key, >> - IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true"); >> + "next;", >> + ct_ecmp_reply_port_match, out_port->sb->tunnel_key); >> ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100, >> ds_cstr(&match), ds_cstr(&actions), >> &st_route->header_); >> ds_clear(&match); >> ds_put_format(&match, >> - "%s && (!ct.rpl && ct.est) && sctp && "REGBIT_KNOWN_ECMP_NH" >> == 0", >> + "%s && (!ct.rpl && ct.est) && sctp", >> ds_cstr(&base_match)); >> ds_clear(&actions); >> ds_put_format(&actions, >> "ct_commit { ct_label.ecmp_reply_eth = eth.src; " >> " %s = %" PRId64 ";}; " >> - "commit_ecmp_nh(ipv6 = %s, proto = sctp); next;", >> - ct_ecmp_reply_port_match, out_port->sb->tunnel_key, >> - IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true"); >> + "next;", >> + ct_ecmp_reply_port_match, out_port->sb->tunnel_key); >> ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100, >> ds_cstr(&match), ds_cstr(&actions), >> &st_route->header_); >> @@ -10948,7 +10938,7 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows, >> * for where to route the packet. >> */ >> ds_put_format(&ecmp_reply, >> - "ct.rpl && "REGBIT_KNOWN_ECMP_NH" == 1 && %s == >> %"PRId64, >> + "ct.rpl && %s == %"PRId64, >> ct_ecmp_reply_port_match, out_port->sb->tunnel_key); >> ds_clear(&match); >> ds_put_format(&match, "%s && %s", ds_cstr(&ecmp_reply), >> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at >> index 3ef92bb3ff..2e1a85c9ac 100644 >> --- a/tests/ovn-northd.at >> +++ b/tests/ovn-northd.at >> @@ -6264,24 +6264,16 @@ AT_CHECK([grep -e "lr_in_ip_routing_ecmp" lr0flows >> | sed 's/192\.168\.0\..0/192. >> table=??(lr_in_ip_routing_ecmp), priority=150 , match=(reg8[[0..15]] >> == 0), action=(next;) >> ]) >> >> -AT_CHECK([grep -e "lr_in_ecmp_stateful".*commit_ecmp_nh lr0flows | sed >> 's/table=../table=??/' | sort], [0], [dnl >> - table=??(lr_in_ecmp_stateful), priority=100 , match=(inport == >> "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && sctp && >> reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src; >> ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = sctp); >> next;) >> - table=??(lr_in_ecmp_stateful), priority=100 , match=(inport == >> "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && tcp && >> reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src; >> ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = tcp); >> next;) >> - table=??(lr_in_ecmp_stateful), priority=100 , match=(inport == >> "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && udp && >> reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src; >> ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = udp); >> next;) >> - table=??(lr_in_ecmp_stateful), priority=100 , match=(inport == >> "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && sctp), >> action=(ct_commit { ct_label.ecmp_reply_eth = eth.src; >> ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = sctp); >> next;) >> - table=??(lr_in_ecmp_stateful), priority=100 , match=(inport == >> "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && tcp), >> action=(ct_commit { ct_label.ecmp_reply_eth = eth.src; >> ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = tcp); >> next;) >> - table=??(lr_in_ecmp_stateful), priority=100 , match=(inport == >> "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && udp), >> action=(ct_commit { ct_label.ecmp_reply_eth = eth.src; >> ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = udp); >> next;) >> -]) >> - >> -AT_CHECK([grep -e "lr_in_defrag".*chk_ecmp_nh* lr0flows | sed >> 's/table=../table=??/' | sort], [0], [dnl >> - table=??(lr_in_defrag ), priority=100 , match=(inport == >> "lr0-public" && ip4.src == 1.0.0.1), action=(reg9[[5]] = chk_ecmp_nh_mac(); >> ct_next;) >> - table=??(lr_in_defrag ), priority=100 , match=(reg7 == 0 && >> ip4.dst == 1.0.0.1/32), action=(reg9[[5]] = chk_ecmp_nh(); ct_next;) >> +AT_CHECK([grep -e "lr_in_defrag" lr0flows | sed 's/table=../table=??/' | >> sort], [0], [dnl >> + table=??(lr_in_defrag ), priority=0 , match=(1), action=(next;) >> + table=??(lr_in_defrag ), priority=100 , match=(inport == >> "lr0-public" && ip4.src == 1.0.0.1), action=(ct_next;) >> + table=??(lr_in_defrag ), priority=100 , match=(reg7 == 0 && >> ip4.dst == 1.0.0.1/32), action=(ct_next;) >> ]) >> >> dnl The chassis was created with other_config:ct-no-masked-label=false, >> the flows >> dnl should be using ct_label.ecmp_reply_port. >> AT_CHECK([grep -e "lr_in_arp_resolve.*ecmp" lr0flows | sed >> 's/table=../table=??/'], [0], [dnl >> - table=??(lr_in_arp_resolve ), priority=200 , match=(ct.rpl && >> reg9[[5]] == 1 && ct_label.ecmp_reply_port == 1), action=(push(xxreg1); >> xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;) >> + table=??(lr_in_arp_resolve ), priority=200 , match=(ct.rpl && >> ct_label.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label; >> eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;) >> ]) >> >> dnl Simulate an ovn-controller upgrade to a version that supports >> @@ -6291,7 +6283,7 @@ check ovn-sbctl set chassis ch1 >> other_config:ct-no-masked-label=true >> check ovn-nbctl --wait=sb sync >> ovn-sbctl dump-flows lr0 > lr0flows >> AT_CHECK([grep -e "lr_in_arp_resolve.*ecmp" lr0flows | sed >> 's/table=../table=??/'], [0], [dnl >> - table=??(lr_in_arp_resolve ), priority=200 , match=(ct.rpl && >> reg9[[5]] == 1 && ct_mark.ecmp_reply_port == 1), action=(push(xxreg1); >> xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;) >> + table=??(lr_in_arp_resolve ), priority=200 , match=(ct.rpl && >> ct_mark.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label; >> eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;) >> ]) >> >> # add ecmp route with wrong nexthop >> -- >> 2.39.3 >> >> _______________________________________________ >> dev mailing list >> d...@openvswitch.org >> https://mail.openvswitch.org/mailman/listinfo/ovs-dev >> >> > Looks good to me, thanks. > > Reviewed-by: Ales Musil <amu...@redhat.com> >
Thanks, Ales! I applied this to main and backported it to all branches down to 22.03. Regards, Dumitru _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev