Hello Ales, overall the approach is ok, the only concern is that, unless I'm missing something, in case of many connections, the exact match deletion could potentially take a while, whereas in the previous case the cost was basically a lookup (constant time) and of course the remaining deletion operation.
It would be nice to avoid the extra cost when the whole 5-tuple is specified. WDYT? Ales Musil <amu...@redhat.com> writes: > Curreently in order to flush conntrack you would need to > specify full 5-tuple. Add support for partial match > it still has some limitations however it is capable of flushing > all that match specified field e.g. source ip address. > > Reported-at: https://bugzilla.redhat.com/2120546 > Signed-off-by: Ales Musil <amu...@redhat.com> > --- > NEWS | 2 + > lib/ct-dpif.c | 178 +++++++++++++++++++++++++++++++--------- > lib/dpctl.man | 3 +- > tests/system-traffic.at | 84 ++++++++++++++++++- > 4 files changed, 226 insertions(+), 41 deletions(-) > > diff --git a/NEWS b/NEWS > index ff77ee404..81909812e 100644 > --- a/NEWS > +++ b/NEWS > @@ -23,6 +23,8 @@ Post-v3.0.0 > bug and CVE fixes addressed since its release. > If a user wishes to benefit from these fixes it is recommended to use > DPDK 21.11.2. > + - ovs-dpctl and related ovs-appctl commands: > + * "flush-conntrack" is capable of handling partial 5-tuple. > > > v3.0.0 - 15 Aug 2022 > diff --git a/lib/ct-dpif.c b/lib/ct-dpif.c > index cfc2315e3..57995f5e5 100644 > --- a/lib/ct-dpif.c > +++ b/lib/ct-dpif.c > @@ -18,6 +18,8 @@ > #include "dpif-provider.h" > > #include <errno.h> > +#include <sys/types.h> > +#include <netinet/in.h> > > #include "ct-dpif.h" > #include "openvswitch/ofp-parse.h" > @@ -109,7 +111,113 @@ ct_dpif_dump_done(struct ct_dpif_dump_state *dump) > ? dpif->dpif_class->ct_dump_done(dpif, dump) > : EOPNOTSUPP); > } > - was this intentional? Just checking > + > +static inline bool > +ct_dpif_inet_addr_cmp_partial(const union ct_dpif_inet_addr *partial, > + const union ct_dpif_inet_addr *addr) > +{ > + if (!ipv6_is_zero(&partial->in6) && > + !ipv6_addr_equals(&partial->in6, &addr->in6)) { > + return false; > + } > + return true; > +} > + > +/* Compares the non-zero members if they match. This is usefull for clearing > + * up all conntracks specified by a partial tuple. */ > +static inline bool > +ct_dpif_tuple_cmp_partial(const struct ct_dpif_tuple *partial, > + const struct ct_dpif_tuple *tuple) > +{ > + /* There is no point in continuing if both do not use the same eth type. > */ > + if (partial->l3_type != tuple->l3_type) { > + return false; > + } > + > + if (partial->ip_proto && partial->ip_proto != tuple->ip_proto) { > + return false; > + } > + > + if (!ct_dpif_inet_addr_cmp_partial(&partial->src, &tuple->src)) { > + return false; > + } > + > + if (!ct_dpif_inet_addr_cmp_partial(&partial->dst, &tuple->dst)) { > + return false; > + } > + > + if (partial->ip_proto == IPPROTO_TCP || partial->ip_proto == > IPPROTO_UDP) { > + > + if (partial->src_port && partial->src_port != tuple->src_port) { > + return false; > + } > + > + if (partial->dst_port && partial->dst_port != tuple->dst_port) { > + return false; > + } > + } else if (partial->ip_proto == IPPROTO_ICMP || > + partial->ip_proto == IPPROTO_ICMPV6) { > + > + if (partial->icmp_id != tuple->icmp_id) { > + return false; > + } > + > + if (partial->icmp_type != tuple->icmp_type) { > + return false; > + } > + > + if (partial->icmp_code != tuple->icmp_code) { > + return false; > + } > + } > + > + return true; > +} > + > +static int > +ct_dpif_flush_tuple(struct dpif *dpif, const uint16_t *zone, > + const struct ct_dpif_tuple *tuple) { > + struct ct_dpif_dump_state *dump; > + struct ct_dpif_entry cte; > + int error; > + int tot_bkts; > + > + if (!dpif->dpif_class->ct_flush) { > + return EOPNOTSUPP; > + } > + > + if (VLOG_IS_DBG_ENABLED()) { > + struct ds ds = DS_EMPTY_INITIALIZER; > + ct_dpif_format_tuple(&ds, tuple); > + VLOG_DBG("%s: ct_flush: %s in zone %d", dpif_name(dpif), > ds_cstr(&ds), > + zone ? *zone : 0); > + ds_destroy(&ds); > + } > + > + error = ct_dpif_dump_start(dpif, &dump, zone, &tot_bkts); > + if (error) { > + return error; > + } > + > + while (!(error = ct_dpif_dump_next(dump, &cte))) { > + if (zone && *zone != cte.zone) { > + continue; > + } > + > + if (ct_dpif_tuple_cmp_partial(tuple, &cte.tuple_orig)) { > + error = dpif->dpif_class->ct_flush(dpif, &cte.zone, > + &cte.tuple_orig); > + if (error) { > + break; > + } > + } > + } > + if (error == EOF) { > + error = 0; > + } > + ct_dpif_dump_done(dump); > + return error; > +} > /* Flush the entries in the connection tracker used by 'dpif'. The > * arguments have the following behavior: > * > @@ -123,11 +231,7 @@ ct_dpif_flush(struct dpif *dpif, const uint16_t *zone, > const struct ct_dpif_tuple *tuple) > { > if (tuple) { > - struct ds ds = DS_EMPTY_INITIALIZER; > - ct_dpif_format_tuple(&ds, tuple); > - VLOG_DBG("%s: ct_flush: %s in zone %d", dpif_name(dpif), > ds_cstr(&ds), > - zone ? *zone : 0); > - ds_destroy(&ds); > + return ct_dpif_flush_tuple(dpif, zone, tuple); > } else if (zone) { > VLOG_DBG("%s: ct_flush: zone %"PRIu16, dpif_name(dpif), *zone); > } else { > @@ -581,6 +685,21 @@ ct_dpif_format_tcp_stat(struct ds * ds, int tcp_state, > int conn_per_state) > ds_put_format(ds, "=%u", conn_per_state); > } > > +static bool > +ct_dpif_parse_tuple_ip(union ct_dpif_inet_addr *addr, char *value, > + uint16_t l3_type) > +{ > + if (!ipv6_is_zero(&addr->in6)) { > + return false; > + } > + if (l3_type == AF_INET) { > + ip_parse(value, &addr->ip); > + } else { > + ipv6_parse(value, &addr->in6); > + } > + return true; > +} > + > /* Parses a specification of a conntrack 5-tuple from 's' into 'tuple'. > * Returns true on success. Otherwise, returns false and puts the error > * message in 'ds'. */ > @@ -590,6 +709,8 @@ ct_dpif_parse_tuple(struct ct_dpif_tuple *tuple, const > char *s, struct ds *ds) > char *pos, *key, *value, *copy; > memset(tuple, 0, sizeof *tuple); > > + tuple->l3_type = AF_INET; > + > pos = copy = xstrdup(s); > while (ofputil_parse_key_value(&pos, &key, &value)) { > if (!*value) { > @@ -597,28 +718,17 @@ ct_dpif_parse_tuple(struct ct_dpif_tuple *tuple, const > char *s, struct ds *ds) > goto error; > } > > - if (!strcmp(key, "ct_nw_src") || !strcmp(key, "ct_nw_dst")) { > - if (tuple->l3_type && tuple->l3_type != AF_INET) { > - ds_put_cstr(ds, "L3 type set multiple times"); > - goto error; > - } else { > - tuple->l3_type = AF_INET; > - } > - if (!ip_parse(value, key[6] == 's' ? &tuple->src.ip : > - &tuple->dst.ip)) { > - goto error_with_msg; > - } > - } else if (!strcmp(key, "ct_ipv6_src") || > - !strcmp(key, "ct_ipv6_dst")) { > - if (tuple->l3_type && tuple->l3_type != AF_INET6) { > - ds_put_cstr(ds, "L3 type set multiple times"); > + if (!strcmp(key, "ct_nw_src") || !strcmp(key, "ct_nw_dst") || > + !strcmp(key, "ct_ipv6_src") || !strcmp(key, "ct_ipv6_dst")) { > + tuple->l3_type = key[6] == '6' ? AF_INET6 : AF_INET; > + uint8_t index = key[6] == '6' ? 8 : 6; > + union ct_dpif_inet_addr *addr = key[index] == 's' > + ? &tuple->src > + : &tuple->dst; > + > + if (!ct_dpif_parse_tuple_ip(addr, value, tuple->l3_type)) { > + ds_put_format(ds, "%s is set multiple times", key); > goto error; > - } else { > - tuple->l3_type = AF_INET6; > - } > - if (!ipv6_parse(value, key[8] == 's' ? &tuple->src.in6 : > - &tuple->dst.in6)) { > - goto error_with_msg; > } > } else if (!strcmp(key, "ct_nw_proto")) { > char *err = str_to_u8(value, key, &tuple->ip_proto); > @@ -665,17 +775,9 @@ ct_dpif_parse_tuple(struct ct_dpif_tuple *tuple, const > char *s, struct ds *ds) > } > } > > - if (ipv6_is_zero(&tuple->src.in6) || ipv6_is_zero(&tuple->dst.in6) || > - !tuple->ip_proto) { > - /* icmp_type, icmp_code, and icmp_id can be 0. */ > - if (tuple->ip_proto != IPPROTO_ICMP && > - tuple->ip_proto != IPPROTO_ICMPV6) { > - if (!tuple->src_port || !tuple->dst_port) { > - ds_put_cstr(ds, "at least one of the conntrack 5-tuple > fields " > - "is missing."); > - goto error; > - } > - } > + if (!tuple->ip_proto && (tuple->src_port || tuple->dst_port)) { > + ds_put_cstr(ds, "port is set without protocol"); > + goto error; > } > > free(copy); > diff --git a/lib/dpctl.man b/lib/dpctl.man > index 87ea8087b..b0cabe05d 100644 > --- a/lib/dpctl.man > +++ b/lib/dpctl.man > @@ -312,7 +312,8 @@ If \fBzone\fR=\fIzone\fR is specified, only flushes the > connections in > If \fIct-tuple\fR is provided, flushes the connection entry specified by > \fIct-tuple\fR in \fIzone\fR. The zone defaults to 0 if it is not provided. > The userspace connection tracker requires flushing with the original > pre-NATed > -tuple and a warning log will be otherwise generated. > +tuple and a warning log will be otherwise generated. The tuple can be partial > +and will remove all connections that are matching on the specified fields. > An example of an IPv4 ICMP \fIct-tuple\fR: > .IP > > "ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=1,icmp_type=8,icmp_code=0,icmp_id=10" > diff --git a/tests/system-traffic.at b/tests/system-traffic.at > index 731de439c..81b32f348 100644 > --- a/tests/system-traffic.at > +++ b/tests/system-traffic.at > @@ -2221,7 +2221,7 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep > "orig=.src=10\.1\.1\.1,"], [], > > udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) > ]) > > -AT_CHECK([ovs-appctl dpctl/flush-conntrack > 'ct_nw_src=10.1.1.2,ct_nw_dst=10.1.1.1,ct_nw_proto=17,ct_tp_src=2,ct_tp_dst=1']) > +AT_CHECK([ovs-appctl dpctl/flush-conntrack > 'ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=17,ct_tp_src=1,ct_tp_dst=2']) > > AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.1,"], > [1], [dnl > ]) > @@ -2233,7 +2233,7 @@ AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep > "orig=.src=10\.1\.1\.2,"], [0], > > udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 > ]) > > -AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=5 > 'ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=17,ct_tp_src=1,ct_tp_dst=2']) > +AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=5 > 'ct_nw_src=10.1.1.2,ct_nw_dst=10.1.1.1,ct_nw_proto=17,ct_tp_src=2,ct_tp_dst=1']) > > AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(10.1.1.2)], [0], [dnl > ]) > @@ -2255,6 +2255,86 @@ AT_CHECK([ovs-appctl dpctl/flush-conntrack zone=5 > $ICMP_TUPLE]) > AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "orig=.src=10\.1\.1\.2,"], > [1], [dnl > ]) > > +dnl Test UDP from port 1 and 2, partial flush by src port > +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 > packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 > actions=resubmit(,0)"]) > +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 > packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 > actions=resubmit(,0)"]) > + > + > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl > +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) > +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 > +]) I see a failure for check-kernel target because the entries are shown in the reverse order. I guess a "| sort" could be needed. > + > +AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_proto=17,ct_tp_src=1']) > + > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl > +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 > +]) > + > +AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_proto=17,ct_tp_src=2']) > + > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) > + > +dnl Test UDP from port 1 and 2, partial flush by dst port > +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 > packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 > actions=resubmit(,0)"]) > +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 > packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 > actions=resubmit(,0)"]) > + > + > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl > +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) > +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 > +]) > + > +AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_proto=17,ct_tp_dst=2']) > + > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl > +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 > +]) > + > +AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_proto=17,ct_tp_dst=1']) > + > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) > + > +dnl Test UDP from port 1 and 2, partial flush by src address > +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 > packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 > actions=resubmit(,0)"]) > +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 > packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 > actions=resubmit(,0)"]) > + > + > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl > +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) > +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 > +]) > + > +AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_src=10.1.1.1']) > + > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl > +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 > +]) > + > +AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_src=10.1.1.2']) > + > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) > + > +dnl Test UDP from port 1 and 2, partial flush by dst address > +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=1 > packet=50540000000a50540000000908004500001c000000000011a4cd0a0101010a0101020001000200080000 > actions=resubmit(,0)"]) > +AT_CHECK([ovs-ofctl -O OpenFlow13 packet-out br0 "in_port=2 > packet=50540000000a50540000000908004500001c000000000011a4cd0a0101020a0101010002000100080000 > actions=resubmit(,0)"]) > + > + > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl > +udp,orig=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),reply=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1) > +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 > +]) > + > +AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_dst=10.1.1.2']) > + > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [0], [dnl > +udp,orig=(src=10.1.1.2,dst=10.1.1.1,sport=2,dport=1),reply=(src=10.1.1.1,dst=10.1.1.2,sport=1,dport=2),zone=5 > +]) > + > +AT_CHECK([ovs-appctl dpctl/flush-conntrack 'ct_nw_dst=10.1.1.1']) > + > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep "10\.1\.1\.1"], [1]) > + > OVS_TRAFFIC_VSWITCHD_STOP > AT_CLEANUP > > -- > 2.37.2 > > _______________________________________________ > dev mailing list > d...@openvswitch.org > https://mail.openvswitch.org/mailman/listinfo/ovs-dev _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev