git sha: 2a063b239d7295d23f89a306406e720c375c297d
Author: Eelco Chaudron <[email protected]>
Subject: dpif-offload-dpdk: Abstract rte_flow implementation from dpif-netdev.
This commit abstracts the rte_flow-specific logic from dpif-netdev into
the dpif-offload-dpdk provider, making the generic datapath implementation
independent of specific offload technologies.
> diff --git a/lib/automake.mk b/lib/automake.mk
> index f7586bb0a..c0edef8e9 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -508,6 +508,7 @@ if DPDK_NETDEV
> lib_libopenvswitch_la_SOURCES += \
> lib/dpdk.c \
> lib/dpif-offload-dpdk.c \
> + lib/dpif-offload-dpdk-private.h \
> lib/netdev-dpdk.c \
> lib/netdev-offload-dpdk.c \
> lib/netdev-offload-dpdk.h
> diff --git a/lib/dpif-netdev-private-flow.h b/lib/dpif-netdev-private-flow.h
> index 7425dd44e..753358925 100644
> --- a/lib/dpif-netdev-private-flow.h
> +++ b/lib/dpif-netdev-private-flow.h
> @@ -89,7 +89,7 @@ struct dp_netdev_flow {
>                                                /* 'flow_table'. */
>     const struct cmap_node simple_match_node; /* In dp_netdev_pmd_thread's
>                                                * 'simple_match_table'. */
> -   const struct cmap_node mark_node;  /* In owning flow_mark's mark_to_flow */
> +   const struct cmap_node mufid_node; /* In dp_netdev's 'mufid_to_flow'. */
>     const ovs_u128 ufid;        /* Unique flow identifier. */
>     const ovs_u128 mega_ufid;   /* Unique mega flow identifier. */
>     const unsigned pmd_id;      /* The 'core_id' of pmd thread owning this */
Does this rename from mark_node to mufid_node actually get used
anywhere in the code? I don't see any references to mufid_node being
added, nor a mufid_to_flow map being created, in this patch.
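If a mufid_to_flow map is planned, I would expect lookups keyed by
mega_ufid along these lines (purely a hypothetical sketch to illustrate
the expected use of mufid_node; neither the map nor this helper exists
anywhere in the posted patch):

    /* Hypothetical: look up a flow by mega ufid in a dp_netdev-level
     * 'mufid_to_flow' cmap, chained through the new 'mufid_node' field.
     * Illustration only; not code from this patch. */
    static struct dp_netdev_flow *
    mufid_to_flow_lookup(struct cmap *mufid_to_flow,
                         const ovs_u128 *mega_ufid)
    {
        struct dp_netdev_flow *flow;
        uint32_t hash = hash_words64((const uint64_t *) mega_ufid, 2, 0);

        CMAP_FOR_EACH_WITH_HASH (flow, mufid_node, hash, mufid_to_flow) {
            if (ovs_u128_equals(flow->mega_ufid, *mega_ufid)) {
                return flow;
            }
        }
        return NULL;
    }

If the map only lands in a follow-up patch, a note in the commit
message would help.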
> @@ -102,18 +102,14 @@ struct dp_netdev_flow {
>     struct ovs_refcount ref_cnt;
>
>     bool dead;
> -   uint32_t mark;              /* Unique flow mark for netdev offloading. */
> +   bool offloaded;
> +   atomic_int offload_queue_depth;
>     uint64_t simple_match_mark; /* Unique flow mark for the simple match. */
>     odp_port_t orig_in_port;
>
>     /* Statistics. */
>     struct dp_netdev_flow_stats stats;
>
> -   /* Statistics and attributes received from the netdev offload provider. */
> -   atomic_int netdev_flow_get_result;
> -   struct dp_netdev_flow_stats last_stats;
> -   struct dp_netdev_flow_attrs last_attrs;
> -
>     /* Actions. */
>     OVSRCU_TYPE(struct dp_netdev_actions *) actions;
>
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
[ ... ]
> @@ -2463,137 +2463,171 @@ log_netdev_flow_change(const struct dp_netdev_flow *flow,
> ds_destroy(&ds);
> }
>
> -static void
> -queue_netdev_flow_put(struct dp_netdev_pmd_thread *pmd,
> - struct dp_netdev_flow *flow, struct match *match,
> - const struct nlattr *actions, size_t actions_len,
> - int op)
> +/* Offloaded flows can be handled asynchronously, so we do not always know
> + * whether a specific flow is offloaded or not. It might still be pending;
> + * in fact, multiple modifications can be pending, and the actual offload
> + * state depends on the completion of each modification.
> + *
> + * To correctly determine whether a flow is offloaded when it is being
> + * destroyed (and therefore requires cleanup), we must ensure that all
> + * operations have completed. To achieve this, we track the number of
> + * outstanding offloaded flow modifications. */
> +static bool
> +offload_queue_inc(struct dp_netdev_flow *flow)
> {
> - struct dp_offload_thread_item *item;
> - struct dp_offload_flow_item *flow_offload;
> + int current;
>
> - if (!dpif_offload_is_offload_enabled()) {
> - return;
> +    while (true) {
> +        atomic_read(&flow->offload_queue_depth, &current);
> +        if (current < 0) {
> +            /* We are cleaning up, so no longer enqueue operations. */
> +            return false;
> +        }
> +
> +        /* Here we try to atomically increase the value.  If we do not
> +         * succeed, someone else has modified it, and we need to check
> +         * again for a current negative value. */
> +        if (atomic_compare_exchange_strong(&flow->offload_queue_depth,
> +                                           &current, current + 1)) {
> +            return true;
> +        }
> +    }
This retry loop in offload_queue_inc() spins under contention. A
compare-and-swap loop like this is lock-free (some thread always makes
progress), so it cannot livelock system-wide, but is a single thread
guaranteed to get through in bounded time when many threads keep
modifying offload_queue_depth?
> +}
> +
> +static bool
> +offload_queue_dec(struct dp_netdev_flow *flow)
> +{
> + int old;
> +
> + atomic_sub(&flow->offload_queue_depth, 1, &old);
> + ovs_assert(old >= 1);
> +
> + if (old == 1) {
> + /* Note that this only indicates that the queue might be empty. */
> + return true;
> + }
> + return false;
> +}
> +
> +static bool
> +offload_queue_complete(struct dp_netdev_flow *flow)
> +{
> + /* This function returns false if the queue is still in use.
> + * If the queue is empty, it will attempt to atomically mark it as
> + * 'not in use' by making the queue depth negative. This prevents
> + * other flow operations from being added. If successful, it returns
> + * true. */
> + int expected_val = 0;
> +
> + return atomic_compare_exchange_strong(&flow->offload_queue_depth,
> + &expected_val, -1);
> +}
Is there a potential race between offload_queue_dec() returning true
and offload_queue_complete() being called, where another thread could
increment the queue depth in between?
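To make the question concrete, this is the completion-path pairing I
assume (a hypothetical sketch, not code from the patch;
flow_offload_cleanup() is an invented placeholder):

    /* Hypothetical completion-path usage, for discussion only.  A true
     * return from offload_queue_dec() merely hints that the queue may
     * be drained; only a successful offload_queue_complete() CAS
     * (0 -> -1) actually claims the flow for cleanup. */
    static void
    offload_item_done(struct dp_netdev_flow *flow)
    {
        if (offload_queue_dec(flow) && offload_queue_complete(flow)) {
            /* Depth went 0 -> -1: offload_queue_inc() now rejects new
             * operations, so offload state can be torn down safely. */
            flow_offload_cleanup(flow);   /* Hypothetical hook. */
        }
    }

If the real code follows this shape, the window looks benign: a
concurrent increment makes the CAS fail, and the claim is retried by
whichever thread performs the final decrement.  If
offload_queue_complete() is instead called from a path that never
retries, the claim could be missed, so it would be good to confirm.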
> +static void
> +offload_flow_reference_unreference_cb(unsigned pmd_id OVS_UNUSED,
> + void *flow_reference_)
> +{
> + struct dp_netdev_flow *flow_reference = flow_reference_;
> +
> + if (flow_reference) {
> + flow_reference->offloaded = false;
> + dp_netdev_flow_unref(flow_reference);
> + }
> +}
[ ... ]
> @@ -4185,12 +3596,10 @@ dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
> /* Do not allocate extra space. */
> flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len);
> memset(&flow->stats, 0, sizeof flow->stats);
> - atomic_init(&flow->netdev_flow_get_result, 0);
> - memset(&flow->last_stats, 0, sizeof flow->last_stats);
> - memset(&flow->last_attrs, 0, sizeof flow->last_attrs);
> flow->dead = false;
> + flow->offloaded = false;
> + atomic_init(&flow->offload_queue_depth, 0);
> flow->batch = NULL;
> - flow->mark = INVALID_FLOW_MARK;
> flow->orig_in_port = orig_in_port;
> *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
> *CONST_CAST(struct flow *, &flow->flow) = match->flow;
[ ... ]
> diff --git a/lib/dpif-offload-dpdk.c b/lib/dpif-offload-dpdk.c
> index 5c3e7f041..547bb68ef 100644
> --- a/lib/dpif-offload-dpdk.c
> +++ b/lib/dpif-offload-dpdk.c
[ ... ]
> +uint32_t
> +dpif_offload_dpdk_allocate_flow_mark(struct dpif_offload_dpdk *offload)
> +{
> + static struct ovsthread_once init_once = OVSTHREAD_ONCE_INITIALIZER;
> +    unsigned int uid = dpdk_offload_thread_id()
> +                       % offload->offload_thread_count;
> + uint32_t flow_mark;
> +
> + if (ovsthread_once_start(&init_once)) {
> + /* Haven't initiated yet, do it here. */
> + offload->flow_mark_pool = id_fpool_create(
> + offload->offload_thread_count, 1, UINT32_MAX - 1);
> + ovsthread_once_done(&init_once);
> + }
Does this static ovsthread_once cause issues when multiple
dpif_offload_dpdk instances exist? The pool is created only once
globally, yet it is stored in per-instance state
(offload->flow_mark_pool), so any instance other than the first would
be left with a NULL pool.
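If each instance is meant to own its pool, creation would presumably
move to instance construction, along these lines (sketch only; the
function name is invented):

    /* Sketch: create the flow-mark pool per offload instance at
     * construction time instead of via a global ovsthread_once.  The
     * hook name is illustrative, not from this patch. */
    static void
    dpif_offload_dpdk_init_flow_mark_pool(struct dpif_offload_dpdk *offload)
    {
        offload->flow_mark_pool = id_fpool_create(
            offload->offload_thread_count, 1, UINT32_MAX - 1);
    }

Alternatively, if a single global pool is intentional (e.g. flow marks
must be unique across instances), storing it in a static instead of
per-instance state would make that explicit.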
> +
> + if (id_fpool_new_id(offload->flow_mark_pool, uid, &flow_mark)) {
> + return flow_mark;
> + }
> +
> + return INVALID_FLOW_MARK;
> +}
[ ... ]
> +static void
> +dpif_offload_dpdk_offload_threads_init(struct dpif_offload_dpdk *offload)
> +{
> + offload->offload_threads = xcalloc(offload->offload_thread_count,
> + sizeof(struct dpdk_offload_thread));
> +
> + for (unsigned int tid = 0; tid < offload->offload_thread_count; tid++) {
> + struct dpdk_offload_thread *thread;
> +
> + thread = &offload->offload_threads[tid];
> + mpsc_queue_init(&thread->queue);
> + atomic_init(&thread->enqueued_item, 0);
> + mov_avg_cma_init(&thread->cma);
> + mov_avg_ema_init(&thread->ema, 100);
> + thread->offload = offload;
> + thread->thread = ovs_thread_create(
> + "dpdk_offload", dpif_offload_dpdk_offload_thread_main, thread);
> + }
> +}
Are there any checks that offload_thread_count is reasonable? An
extremely large value here would translate directly into unbounded
thread creation and memory allocation.
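It might be worth clamping the value before it is used to size
allocations and spawn threads, e.g. (sketch; the bound is illustrative,
similar in spirit to the existing MAX_OFFLOAD_THREAD_NB limit in
netdev-offload):

    /* Sketch: bound a configured offload thread count before using it
     * to size allocations.  The cap of 10 is illustrative only. */
    #define MAX_DPDK_OFFLOAD_THREADS 10

    static unsigned int
    sanitize_offload_thread_count(unsigned int requested)
    {
        if (requested < 1) {
            return 1;
        }
        return MIN(requested, MAX_DPDK_OFFLOAD_THREADS);
    }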
[ ... ]
> diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c
[ ... ]
> +static struct pmd_data *
> +netdev_offload_dpdk_pmd_data_init(void)
> +{
> + struct pmd_data *mapping = xmalloc(sizeof *mapping);
> +
> + ovs_mutex_init(&mapping->map_lock);
> + cmap_init(&mapping->pmd_id_to_flow_ref);
> + return mapping;
> +}
Note that OVS's xmalloc() aborts on allocation failure rather than
returning NULL, so no error check is needed here.
> +static void
> +netdev_offload_dpdk_pmd_data_cleanup_mappings(
> + struct dpif_offload_dpdk *offload, struct pmd_data *mapping)
> +{
> + struct pmd_id_to_flow_ref_data *data;
> +
> + if (!mapping) {
> + return;
> + }
> +
> + ovs_mutex_lock(&mapping->map_lock);
> +
> + CMAP_FOR_EACH (data, node, &mapping->pmd_id_to_flow_ref) {
> + cmap_remove(&mapping->pmd_id_to_flow_ref, &data->node,
> + hash_int(data->pmd_id, 0));
> +
> + dpif_offload_dpdk_flow_unreference(offload, data->pmd_id,
> + data->flow_reference);
> + ovsrcu_postpone(free, data);
> + }
> +
> + ovs_mutex_unlock(&mapping->map_lock);
> +}
Could this cleanup function cause issues if the
dpif_offload_dpdk_flow_unreference() callbacks try to access the
mapping being cleaned up?
[ ... ]
> @@ -2506,13 +2554,15 @@ out:
> }
>
> static struct ufid_to_rte_flow_data *
> -netdev_offload_dpdk_add_flow(struct netdev *netdev,
> +netdev_offload_dpdk_add_flow(struct dpif_offload_dpdk *offload,
> + struct pmd_data *pmd_mapping,
> + struct netdev *netdev,
> struct match *match,
> struct nlattr *nl_actions,
> size_t actions_len,
> const ovs_u128 *ufid,
> - struct dpif_netdev_offload_info *info)
> + uint32_t flow_mark,
> + odp_port_t orig_in_port)
> {
> struct flow_patterns patterns = {
> .items = NULL,
> @@ -2523,20 +2573,20 @@ netdev_offload_dpdk_add_flow(struct netdev *netdev,
> bool actions_offloaded = true;
> struct rte_flow *flow;
>
> -    if (parse_flow_match(netdev, info->orig_in_port, &patterns, match)) {
> +    if (parse_flow_match(offload, netdev, orig_in_port, &patterns, match)) {
>         VLOG_DBG_RL(&rl, "%s: matches of ufid "UUID_FMT" are not supported",
>                     netdev_get_name(netdev),
>                     UUID_ARGS((struct uuid *) ufid));
>         goto out;
>     }
>
> -    flow = netdev_offload_dpdk_actions(patterns.physdev, &patterns,
> -                                       nl_actions, actions_len);
> + flow = netdev_offload_dpdk_actions(offload, patterns.physdev, &patterns,
> + nl_actions, actions_len);
> if (!flow && !netdev_vport_is_vport_class(netdev->netdev_class)) {
>         /* If we failed to offload the rule actions fallback to MARK+RSS
>          * actions. */
> flow = netdev_offload_dpdk_mark_rss(&patterns, netdev,
> - info->flow_mark);
> + flow_mark);
> actions_offloaded = false;
> }
>
> @@ -2544,7 +2594,8 @@ netdev_offload_dpdk_add_flow(struct netdev *netdev,
> goto out;
> }
> flows_data = ufid_to_rte_flow_associate(ufid, netdev, patterns.physdev,
> - flow, actions_offloaded);
> + flow, actions_offloaded,
> + flow_mark, pmd_mapping);
>     VLOG_DBG("%s/%s: installed flow %p by ufid "UUID_FMT,
>              netdev_get_name(netdev), netdev_get_name(patterns.physdev), flow,
>              UUID_ARGS((struct uuid *) ufid));
[ ... ]
> @@ -2734,21 +2784,54 @@ netdev_offload_dpdk_flow_put(struct dpif_offload_dpdk *offload,
> dpif_netdev_offload_ports_traverse(get_netdev_odp_cb, &aux);
> orig_in_port = aux.odp_port;
> old_stats = rte_flow_data->stats;
> modification = true;
> - ret = netdev_offload_dpdk_flow_destroy(rte_flow_data);
> + pmd_mapping = ovsrcu_get(struct pmd_data *,
> + &rte_flow_data->pmd_mapping);
> + ovsrcu_set(&rte_flow_data->pmd_mapping, NULL);
> + flow_mark = rte_flow_data->flow_mark;
> +
> + ret = netdev_offload_dpdk_flow_destroy(offload, rte_flow_data,
> + false, true);
> if (ret < 0) {
> return ret;
> }
> + } else if (!rte_flow_data) {
> + pmd_mapping = netdev_offload_dpdk_pmd_data_init();
> + netdev_offload_dpdk_pmd_data_associate(pmd_mapping, pmd_id,
> + flow_reference);
> + *previous_flow_reference = NULL;
> + flow_mark = dpif_offload_dpdk_allocate_flow_mark(offload);
> + } else /* if (rte_flow_data) */ {
> + pmd_mapping = ovsrcu_get(struct pmd_data *,
> + &rte_flow_data->pmd_mapping);
> +
> + netdev_offload_dpdk_pmd_data_associate(pmd_mapping, pmd_id,
> + flow_reference);
> + *previous_flow_reference = NULL;
> + }
In the middle branch (else if (!rte_flow_data)), a flow_mark is
allocated; if the subsequent netdev_offload_dpdk_add_flow() fails, does
the allocated flow_mark get properly freed?
> +
> + if (modification || !rte_flow_data) {
> + rte_flow_data = netdev_offload_dpdk_add_flow(offload, pmd_mapping,
> + netdev, match,
> + actions, actions_len,
> + ufid, flow_mark,
> + orig_in_port);
> + if (!rte_flow_data) {
> + /* Clean up existing mappings, except for the current pmd_id one,
> + * as this is handled through the callback. */
> + netdev_offload_dpdk_pmd_data_disassociate(pmd_mapping, pmd_id);
> + netdev_offload_dpdk_pmd_data_cleanup_mappings(offload,
> + pmd_mapping);
> + ovsrcu_postpone(netdev_offload_dpdk_pmd_data_cleanup_mapping,
> + pmd_mapping);
> + dpif_offload_dpdk_free_flow_mark(offload, flow_mark);
> + return -1;
> + }
> + }
Good, the flow_mark cleanup is handled in the error path.
[ ... ]
> diff --git a/lib/netdev-offload-dpdk.h b/lib/netdev-offload-dpdk.h
[ ... ]
> -int netdev_offload_dpdk_init(struct netdev *);
> +int netdev_offload_dpdk_init(struct netdev *,
> + unsigned int offload_thread_count);
> void netdev_offload_dpdk_uninit(struct netdev *);
> -int netdev_offload_dpdk_flow_flush(struct netdev *);
> -uint64_t netdev_offload_dpdk_flow_get_n_offloaded(struct netdev *);
> -int netdev_offload_dpdk_hw_miss_packet_recover(struct netdev *,
> - struct dp_packet *);
> -#ifdef DPDK_NETDEV
> -int netdev_offload_dpdk_flow_put(struct netdev *, struct match *,
> +int netdev_offload_dpdk_flow_flush(struct dpif_offload_dpdk *,
> + struct netdev *);
> +uint64_t netdev_offload_dpdk_flow_get_n_offloaded(
> + struct netdev *, unsigned int offload_thread_count);
> +uint64_t netdev_offload_dpdk_flow_get_n_offloaded_by_thread(
> + struct netdev *, unsigned int tid);
> +int netdev_offload_dpdk_hw_miss_packet_recover(struct dpif_offload_dpdk *,
> + struct netdev *,
> + unsigned pmd_id,
> + struct dp_packet *,
> + void **flow_reference);
> +int netdev_offload_dpdk_flow_put(struct dpif_offload_dpdk *,
> + unsigned pmd_id, void *flow_reference,
> + struct netdev *, struct match *,
> struct nlattr *actions, size_t actions_len,
> const ovs_u128 *ufid,
> - struct dpif_netdev_offload_info *,
> + odp_port_t orig_in_port,
> + void **previous_flow_reference,
> struct dpif_flow_stats *);
> -int netdev_offload_dpdk_flow_del(struct netdev *, const ovs_u128 *ufid,
> +int netdev_offload_dpdk_flow_del(struct dpif_offload_dpdk *, struct netdev *,
> + unsigned pmd_id, const ovs_u128 *ufid,
> + void *flow_reference,
> struct dpif_flow_stats *);
> int netdev_offload_dpdk_flow_get(struct netdev *, struct match *,
>                                  struct nlattr **actions,
>                                  const ovs_u128 *ufid,
>                                  struct dpif_flow_stats *,
>                                  struct dpif_flow_attrs *,
>                                  struct ofpbuf *buf);
The API changes look comprehensive. Since the signatures themselves are
not backwards compatible, have all existing callers been updated within
this patch?
> -#else
The removal of the #ifdef DPDK_NETDEV block and associated stub functions
seems correct for this abstraction.
Overall, this is a substantial refactoring that appears to properly separate
concerns, though there are several potential concurrency and error handling
issues to consider.