Yael Chemla via dev <[email protected]> writes:
> Currently, the kernel capability probe for TCA_FLOWER_KEY_ENC_FLAGS
> does not take into account whether hardware offload is enabled. This
> can lead to incorrect detection of encapsulation flag support and
> failed offload attempts on devices lacking proper hardware support.
>
> This patch uses a two-step probe to validate hardware support for
> ENC_FLAGS:
> (1) A global skip_hw probe checks whether the kernel recognizes
> TCA_FLOWER_KEY_ENC_FLAGS.
> (2) Per device, if it’s a tunnel and hw offload is enabled, we
> install a temporary flower rule and read it back with tc_get_flower to
> confirm offloaded_state == IN_HW. This checks whether ENC_FLAGS can
> actually be offloaded to hardware.
> Results are cached per device in the new netdev_tc_data structure, and
> we skip probing for system/non-tunnel devices. This avoids false
> positives where the global skip_hw probe succeeds but the device
> supports the key only in software, and prevents unnecessary probes and
> offload attempts on unsupported devices.
>
> Fixes: 3f7af5233a29 ("netdev-offload-tc: Check if TCA_FLOWER_KEY_ENC_FLAGS is supported.")
> Signed-off-by: Yael Chemla <[email protected]>
> ---
Hi Yael,
This is just a quick review. I didn't spend too much time thinking
about the approach, so consider these basic structural comments rather
than a critique of the approach you're outlining.
> lib/netdev-offload-tc.c | 123 +++++++++++++++++++++++++++++++++++++---
> lib/tc.c | 2 +-
> lib/tc.h | 2 +
> 3 files changed, 118 insertions(+), 9 deletions(-)
>
> diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c
> index 804885285..6103b47eb 100644
> --- a/lib/netdev-offload-tc.c
> +++ b/lib/netdev-offload-tc.c
> @@ -79,6 +79,12 @@ struct policer_node {
> uint32_t police_idx;
> };
>
> +/* Per-netdev TC offload data */
> +struct netdev_tc_data {
> + bool enc_flags_support; /* Per-netdev enc flags support */
This name and the global name are the same, which makes reading the code
confusing. Choose something like 'dev_enc_flags_supported'.
> + struct ovsthread_once once; /* avoid concurrency per-netdev TC data */
> +};
> +
> /* ccmap and protective mutex for counting recirculation id (chain) usage. */
> static struct ovs_mutex used_chains_mutex = OVS_MUTEX_INITIALIZER;
> static struct ccmap used_chains OVS_GUARDED;
> @@ -88,6 +94,7 @@ static struct ovs_mutex meter_police_ids_mutex = OVS_MUTEX_INITIALIZER;
> static struct id_pool *meter_police_ids OVS_GUARDED_BY(meter_police_ids_mutex);
> /* Protects below meter hashmaps. */
> static struct ovs_mutex meter_mutex = OVS_MUTEX_INITIALIZER;
> +static struct ovs_mutex tc_data_alloc_mutex = OVS_MUTEX_INITIALIZER;
> static struct hmap meter_id_to_police_idx OVS_GUARDED_BY(meter_mutex)
> = HMAP_INITIALIZER(&meter_id_to_police_idx);
> static struct hmap police_idx_to_meter_id OVS_GUARDED_BY(meter_mutex)
> @@ -109,6 +116,9 @@ static void parse_tc_flower_to_stats(struct tc_flower *flower,
>
> static int get_ufid_adjust_stats(const ovs_u128 *ufid,
> struct dpif_flow_stats *stats);
> +static void netdev_tc_cleanup_data(struct netdev *netdev);
> +static bool netdev_tc_get_enc_flags_support(struct netdev *netdev);
> +static struct netdev_tc_data * get_netdev_tc_data(struct netdev *netdev);
>
> static bool
> is_internal_port(const char *type)
> @@ -581,6 +591,8 @@ netdev_tc_flow_flush(struct netdev *netdev)
> }
> ovs_mutex_unlock(&ufid_lock);
>
> + netdev_tc_cleanup_data(netdev);
> +
> return 0;
> }
>
> @@ -2398,7 +2410,7 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match,
> memset(&tnl_mask->gbp_flags, 0, sizeof tnl_mask->gbp_flags);
> tnl_mask->flags &= ~FLOW_TNL_F_KEY;
>
> - if (enc_flags_support) {
> + if (netdev_tc_get_enc_flags_support(netdev)) {
> if (tnl_mask->flags & FLOW_TNL_F_OAM) {
> if (tnl->flags & FLOW_TNL_F_OAM) {
> flower.key.tunnel.tc_enc_flags |=
> @@ -3026,22 +3038,73 @@ out:
> tc_add_del_qdisc(ifindex, false, block_id, TC_INGRESS);
> }
>
> +/* Helper functions for per-netdev TC data */
> +static struct netdev_tc_data * get_netdev_tc_data(struct netdev *netdev)
> +{
> + struct netdev_tc_data *tc_data;
> +
> + /* avoid races on allocation/ovsrcu_set */
> + ovs_mutex_lock(&tc_data_alloc_mutex);
> +    tc_data = (struct netdev_tc_data *) ovsrcu_get_protected(void *,
> +                                            &netdev->hw_info.offload_data);
This looks weird to me - why not:
tc_data = ovsrcu_get_protected(struct netdev_tc_data *, ...)
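i.e., presumably something like:

    tc_data = ovsrcu_get_protected(struct netdev_tc_data *,
                                   &netdev->hw_info.offload_data);

which drops both the void * round trip and the cast.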
> + if (!tc_data) {
> + tc_data = xzalloc(sizeof *tc_data);
> + tc_data->enc_flags_support = false;
> + tc_data->once.done = false;
> + ovs_mutex_init(&tc_data->once.mutex);
^ Please do something more like:
tc_data->once = OVSTHREAD_ONCE_INITIALIZER;
But I'm not sure we should be using such a tool to do this kind of
probing. See below for a comment on why.
As for how to do it differently, one thing you could do is an atomic
with different states like:
OVS_PROBE_UNINIT,
OVS_PROBE_STARTED,
OVS_PROBE_DONE_SUPPORTED,
OVS_PROBE_DONE_UNSUPPORTED,
OVS_PROBE_CANCELLED
something like that. Then you know what state the thing is in and can
advance your tests appropriately. This is just an example; please spend
a bit of time considering how the synchronization should work.
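To make that concrete, here is a rough, untested sketch of the shape I
have in mind, using the atomics from lib/ovs-atomic.h (run_probe() is
just a stand-in for the real probe logic):

    enum ovs_probe_state {
        OVS_PROBE_UNINIT,
        OVS_PROBE_STARTED,
        OVS_PROBE_DONE_SUPPORTED,
        OVS_PROBE_DONE_UNSUPPORTED,
        OVS_PROBE_CANCELLED,
    };

    struct netdev_tc_data {
        atomic_int probe_state;  /* Holds an enum ovs_probe_state value,
                                  * initialized to OVS_PROBE_UNINIT. */
    };

    static void
    maybe_probe_enc_flags(struct netdev_tc_data *tc_data)
    {
        int expected = OVS_PROBE_UNINIT;

        /* Only the thread that wins the CAS runs the probe; everyone
         * else observes STARTED or one of the DONE_* states. */
        if (atomic_compare_exchange_strong(&tc_data->probe_state, &expected,
                                           OVS_PROBE_STARTED)) {
            bool supported = run_probe();

            atomic_store(&tc_data->probe_state,
                         supported ? OVS_PROBE_DONE_SUPPORTED
                                   : OVS_PROBE_DONE_UNSUPPORTED);
        }
    }

Note there is no embedded mutex to tear down, and a flush path could
CAS UNINIT/STARTED to OVS_PROBE_CANCELLED if it needs to invalidate an
in-flight probe.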
> + ovsrcu_set(&netdev->hw_info.offload_data, tc_data);
> + }
> + ovs_mutex_unlock(&tc_data_alloc_mutex);
> + return tc_data;
> +}
> +
> +/* Returns the enc flags support for the given netdev, return global flag if
> + * not probed yet (non tunnel devices) */
> +static bool netdev_tc_get_enc_flags_support(struct netdev *netdev)
> +{
> +    struct netdev_tc_data *tc_data = (struct netdev_tc_data *)
> +        ovsrcu_get(void *, &netdev->hw_info.offload_data);
> + return tc_data ? tc_data->enc_flags_support : enc_flags_support;
> +}
> +
> +/* Cleanup the per-netdev TC data */
> +static void netdev_tc_cleanup_data(struct netdev *netdev)
> +{
> + struct netdev_tc_data *tc_data;
> +
> + tc_data = (struct netdev_tc_data *)
> + ovsrcu_get(void *, &netdev->hw_info.offload_data);
> +
> + if (tc_data) {
> + ovsrcu_set(&netdev->hw_info.offload_data, NULL);
> + ovsrcu_postpone(free, tc_data);
Is it possible that we can create a race here during flow flushing?
Also, one issue with using an ovsthread_once element shows up here: we
never clean up the mutex embedded in the ovsthread_once object by
calling ovs_mutex_destroy(). That object is designed to stay valid for
the entire life of the program, not to have a limited scope.
I suggest re-designing this to use a different data structure.
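Purely to illustrate the teardown problem (netdev_tc_data_destroy() is
a hypothetical helper, not something in this patch), a version that
didn't leak the mutex would have to reach into the once object, which
is itself a sign the API is being misused:

    static void
    netdev_tc_data_destroy(struct netdev_tc_data *tc_data)
    {
        /* ovsthread_once has no destructor; poking at its internal
         * mutex like this is not something the API is meant for. */
        ovs_mutex_destroy(&tc_data->once.mutex);
        free(tc_data);
    }

and then:

    ovsrcu_postpone(netdev_tc_data_destroy, tc_data);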
> + }
> +}
> +
> static void
> -probe_enc_flags_support(int ifindex)
> +probe_enc_flags_support(struct netdev *netdev, struct netdev_tc_data *tc_data,
> + enum tc_offload_policy policy)
> {
> + const char *netdev_type = netdev_get_type(netdev);
> + const char *netdev_name = netdev_get_name(netdev);
> struct tc_flower flower;
> struct tcf_id id;
> int block_id = 0;
> int prio = TC_RESERVED_PRIORITY_FEATURE_PROBE;
> + int ifindex;
> int error;
>
> +
> + /* assume ifindex is already checked in calling function */
> + ifindex = netdev_get_ifindex(netdev);
Why not keep passing the ifindex in as an argument?
> error = tc_add_del_qdisc(ifindex, true, block_id, TC_INGRESS);
> if (error) {
> return;
> }
>
> memset(&flower, 0, sizeof flower);
> - flower.tc_policy = TC_POLICY_SKIP_HW;
> +
> + flower.tc_policy = policy;
> flower.key.eth_type = htons(ETH_P_IP);
> flower.mask.eth_type = OVS_BE16_MAX;
> flower.tunnel = true;
> @@ -3049,22 +3112,54 @@ probe_enc_flags_support(int ifindex)
> flower.mask.tunnel.ipv4.ipv4_src = OVS_BE32_MAX;
> flower.mask.tunnel.ipv4.ipv4_dst = OVS_BE32_MAX;
> flower.mask.tunnel.tp_dst = OVS_BE16_MAX;
> - flower.mask.tunnel.tc_enc_flags = TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT;
> +
> + /* respect the policy */
> + if (policy == TC_POLICY_NONE) {
> +        flower.mask.tunnel.tc_enc_flags = TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT |
> +                                          TCA_FLOWER_KEY_FLAGS_TUNNEL_CSUM;
> +        flower.key.tunnel.tc_enc_flags = TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT |
> +                                         TCA_FLOWER_KEY_FLAGS_TUNNEL_CSUM;
> + } else {
> +        flower.mask.tunnel.tc_enc_flags = TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT;
> +        flower.key.tunnel.tc_enc_flags = TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT;
> + }
> +
> flower.key.tunnel.ipv4.ipv4_src = htonl(0x01010101);
> flower.key.tunnel.ipv4.ipv4_dst = htonl(0x01010102);
> flower.key.tunnel.tp_dst = htons(46354);
> - flower.key.tunnel.tc_enc_flags = TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT;
>
> id = tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS);
> + flower.probing = true;
> error = tc_replace_flower(&id, &flower);
> if (error) {
> + VLOG_INFO("probe tc: enc flags are not supported for %s device %s.",
> + netdev_type, netdev_name);
> goto out;
> }
>
> + /* Query the newly-added rule, check its offload state before deletion
> + * when policy is TC_POLICY_NONE it's the case of tunnel device */
> + if (policy == TC_POLICY_NONE) {
> + struct tc_flower flower_state;
> +
> + memset(&flower_state, 0, sizeof flower_state);
> + if (!tc_get_flower(&id, &flower_state)) {
> + tc_data->enc_flags_support =
> + (flower_state.offloaded_state == TC_OFFLOADED_STATE_IN_HW);
> + }
> + } else {
> + tc_data->enc_flags_support = true;
> + }
> +
> tc_del_flower_filter(&id);
>
> - enc_flags_support = true;
> - VLOG_INFO("probe tc: enc flags are supported.");
> + if (tc_data->enc_flags_support) {
> +        VLOG_INFO("probe tc: enc flags with tc_policy(%d) are offloaded for %s device %s.",
> +                  policy, netdev_type, netdev_name);
> +    } else {
> +        VLOG_INFO("probe tc: enc flags with tc_policy(%d) are not offloaded for %s device %s.",
> +                  policy, netdev_type, netdev_name);
> + }
> out:
> tc_add_del_qdisc(ifindex, false, block_id, TC_INGRESS);
> }
> @@ -3152,7 +3247,9 @@ netdev_tc_init_flow_api(struct netdev *netdev)
> {
> static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
> enum tc_qdisc_hook hook = get_tc_qdisc_hook(netdev);
> + struct netdev_tc_data dummy_tcdata = {0};
> static bool get_chain_supported = true;
> + struct netdev_tc_data *tc_data;
> uint32_t block_id = 0;
> struct tcf_id id;
> int ifindex;
> @@ -3199,7 +3296,8 @@ netdev_tc_init_flow_api(struct netdev *netdev)
> probe_multi_mask_per_prio(ifindex);
> probe_ct_state_support(ifindex);
> probe_vxlan_gbp_support(ifindex);
> - probe_enc_flags_support(ifindex);
> + probe_enc_flags_support(netdev, &dummy_tcdata, TC_POLICY_SKIP_HW);
> + enc_flags_support = dummy_tcdata.enc_flags_support;
>
> ovs_mutex_lock(&meter_police_ids_mutex);
> meter_police_ids = id_pool_create(METER_POLICE_IDS_BASE,
> @@ -3211,6 +3309,15 @@ netdev_tc_init_flow_api(struct netdev *netdev)
> ovsthread_once_done(&once);
> }
>
> + /* probe if tunnel device and hw offload is enabled */
> + if (netdev_get_tunnel_config(netdev) && netdev_is_flow_api_enabled()) {
> + tc_data = get_netdev_tc_data(netdev);
> + if (ovsthread_once_start(&tc_data->once)) {
> + probe_enc_flags_support(netdev, tc_data, TC_POLICY_NONE);
> + ovsthread_once_done(&tc_data->once);
> + }
> + }
> +
> error = tc_add_del_qdisc(ifindex, true, block_id, hook);
>
> if (error && error != EEXIST) {
> diff --git a/lib/tc.c b/lib/tc.c
> index a5f9bc1c1..6367333a2 100644
> --- a/lib/tc.c
> +++ b/lib/tc.c
> @@ -3834,7 +3834,7 @@ nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
> }
> }
>
> - if (policy == TC_POLICY_NONE) {
> + if (!flower->probing && policy == TC_POLICY_NONE) {
> policy = tc_policy;
> }
>
> diff --git a/lib/tc.h b/lib/tc.h
> index 883e21f4c..e30f2f582 100644
> --- a/lib/tc.h
> +++ b/lib/tc.h
> @@ -403,6 +403,8 @@ struct tc_flower {
> enum tc_offloaded_state offloaded_state;
> /* Used to force skip_hw when probing tc features. */
> enum tc_offload_policy tc_policy;
> + /* Used to indicate that the flower is being probed. */
> + bool probing;
> };
>
> int tc_replace_flower(struct tcf_id *id, struct tc_flower *flower);
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev