Previously the kernel did not provide a netlink interface to flush/list only conntrack entries matching a specific zone. With [1] it is now possible to flush and list conntrack entries filtered by zone. Older kernels not yet supporting this feature will ignore the filter. For the list request that means just returning all entries (which we can then filter in userspace as before). For the flush request that means deleting all conntrack entries.
These significantly improves the performance of flushing conntrack zones when the conntrack table is large. Since flushing a conntrack zone is normally triggered via an openflow command it blocks the main ovs thread and thereby also blocks new flows from being applied. Using this new feature we can reduce the flushing time for zones by around 93%. In combination with OVN the creation of a Logical_Router (which causes the flushing of a ct zone) could block other operations, e.g. the failover of Logical_Routers (as they cause new flows to be created). This is visible from a user perspective as a ovn-controller that is idle (as it waits for vswitchd) and vswitchd reporting: "blocked 1000 ms waiting for main to quiesce" (potentially with ever increasing times). The following performance tests where run in a qemu vm with 500.000 conntrack entries distributed evenly over 500 ct zones using `ovstest test-netlink-conntrack flush zone=<zoneid>`. With this patch and kernel v6.8-rc2: ----------------------------------------------------------------------------------------------------------------------------------------------------- Min (s) Median (s) 90%ile (s) 99%ile (s) Max (s) Mean (s) Total (s) Count ----------------------------------------------------------------------------------------------------------------------------------------------------- flush zone with 1000 entries 0.260 0.319 0.335 0.348 0.362 0.320 80.02 250 flush zone with no entry 0.228 0.298 0.325 0.340 0.348 0.296 73.93 250 ----------------------------------------------------------------------------------------------------------------------------------------------------- With this patch and kernel v6.7.1: ----------------------------------------------------------------------------------------------------------------------------------------------------- Min (s) Median (s) 90%ile (s) 99%ile (s) Max (s) Mean (s) Total (s) Count ----------------------------------------------------------------------------------------------------------------------------------------------------- flush zone with 1000 entries 3.946 4.237 4.367 4.495 4.543 4.236 1058.992 250 flush zone with no entry 3.462 4.460 4.662 4.931 5.390 4.430 1107.479 250 ----------------------------------------------------------------------------------------------------------------------------------------------------- Without this patch and kernel v6.8-rc2: ----------------------------------------------------------------------------------------------------------------------------------------------------- Min (s) Median (s) 90%ile (s) 99%ile (s) Max (s) Mean (s) Total (s) Count ----------------------------------------------------------------------------------------------------------------------------------------------------- flush zone with 1000 entries 3.497 4.349 4.522 4.773 5.054 4.331 1082.802 250 flush zone with no entry 3.212 4.010 4.572 6.003 6.396 4.071 1017.838 250 ----------------------------------------------------------------------------------------------------------------------------------------------------- [1]: https://github.com/torvalds/linux/commit/eff3c558bb7e61c41b53e4c8130e514a5a4df9ba Co-Authored-By: Luca Czesla <luca.czesla@mail.schwarz> Signed-off-by: Luca Czesla <luca.czesla@mail.schwarz> Co-Authored-By: Max Lamprecht <max.lamprecht@mail.schwarz> Signed-off-by: Max Lamprecht <max.lamprecht@mail.schwarz> Signed-off-by: Felix Huettner <felix.huettner@mail.schwarz> --- v1->v2: - fixed wrong signed-off-by lib/netlink-conntrack.c | 57 +++++++++++++++++++++++++++++++++++++++-- tests/system-traffic.at | 8 ++++++ 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/lib/netlink-conntrack.c b/lib/netlink-conntrack.c index 492bfcffb..1b050894d 100644 --- a/lib/netlink-conntrack.c +++ b/lib/netlink-conntrack.c @@ -25,6 +25,7 @@ #include <linux/netfilter/nf_conntrack_tcp.h> #include <linux/netfilter/nf_conntrack_ftp.h> #include <linux/netfilter/nf_conntrack_sctp.h> +#include <sys/utsname.h> #include "byte-order.h" #include "compiler.h" @@ -141,6 +142,9 @@ nl_ct_dump_start(struct nl_ct_dump_state **statep, const uint16_t *zone, nl_msg_put_nfgenmsg(&state->buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET, NLM_F_REQUEST); + if (zone) { + nl_msg_put_be16(&state->buf, CTA_ZONE, htons(*zone)); + } nl_dump_start(&state->dump, NETLINK_NETFILTER, &state->buf); ofpbuf_clear(&state->buf); @@ -283,23 +287,72 @@ nl_ct_flush_zone(uint16_t flush_zone) return err; } #else + +static bool +netlink_flush_supports_zone(void) +{ + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; + static bool supported = false; + + if (ovsthread_once_start(&once)) { + struct utsname utsname; + int major, minor; + + if (uname(&utsname) == -1) { + VLOG_WARN("uname failed (%s)", ovs_strerror(errno)); + } else if (!ovs_scan(utsname.release, "%d.%d", &major, &minor)) { + VLOG_WARN("uname reported bad OS release (%s)", utsname.release); + } else if (major < 6 || (major == 6 && minor < 8)) { + VLOG_INFO("disabling conntrack flush by zone in Linux kernel %s", + utsname.release); + } else { + supported = true; + } + ovsthread_once_done(&once); + } + return supported; +} + int nl_ct_flush_zone(uint16_t flush_zone) { - /* Apparently, there's no netlink interface to flush a specific zone. + /* In older kernels, there was no netlink interface to flush a specific + * conntrack zone. * This code dumps every connection, checks the zone and eventually * delete the entry. + * In newer kernels there is the option to specifiy a zone for filtering + * during dumps. Older kernels ignore this option. We set it here in the + * hope we only get relevant entries back, but fall back to filtering here + * to keep compatibility. * - * This is race-prone, but it is better than using shell scripts. */ + * This is race-prone, but it is better than using shell scripts. + * + * Additionally newer kernels also support flushing a zone without listing + * it first. */ struct nl_dump dump; struct ofpbuf buf, reply, delete; + int err; + + if (netlink_flush_supports_zone()) { + ofpbuf_init(&buf, NL_DUMP_BUFSIZE); + + nl_msg_put_nfgenmsg(&buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK, + IPCTNL_MSG_CT_DELETE, NLM_F_REQUEST); + nl_msg_put_be16(&buf, CTA_ZONE, htons(flush_zone)); + + err = nl_transact(NETLINK_NETFILTER, &buf, NULL); + ofpbuf_uninit(&buf); + + return err; + } ofpbuf_init(&buf, NL_DUMP_BUFSIZE); ofpbuf_init(&delete, NL_DUMP_BUFSIZE); nl_msg_put_nfgenmsg(&buf, 0, AF_UNSPEC, NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET, NLM_F_REQUEST); + nl_msg_put_be16(&buf, CTA_ZONE, htons(flush_zone)); nl_dump_start(&dump, NETLINK_NETFILTER, &buf); ofpbuf_clear(&buf); diff --git a/tests/system-traffic.at b/tests/system-traffic.at index f363a778c..869728a1d 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -3214,6 +3214,14 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.4)], [0], [dnl tcp,orig=(src=10.1.1.3,dst=10.1.1.4,sport=<cleared>,dport=<cleared>),reply=(src=10.1.1.4,dst=10.1.1.3,sport=<cleared>,dport=<cleared>),zone=2,protoinfo=(state=<cleared>) ]) +dnl flushing one zone should leave the others intact +AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=2]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack zone=1 | FORMAT_CT(10.1.1.2)], [0], [dnl +tcp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=<cleared>,dport=<cleared>),reply=(src=10.1.1.2,dst=10.1.1.1,sport=<cleared>,dport=<cleared>),zone=1,protoinfo=(state=<cleared>) +]) +AT_CHECK([ovs-appctl dpctl/dump-conntrack zone=2 | FORMAT_CT(10.1.1.4)], [0], [dnl +]) + OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP base-commit: 1be7f896af85f2777f8147df207612339b4480c0 -- 2.43.0 _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev