Yael Chemla via dev <[email protected]> writes:

> Currently, the kernel capability probe for TCA_FLOWER_KEY_ENC_FLAGS
> does not take into account whether hardware offload is enabled. This
> can lead to incorrect detection of encapsulation flag support and
> failed offload attempts on devices lacking proper hardware support.
>
> This patch uses a two-step probe to validate hardware support for
> ENC_FLAGS:
> (1) A global skip_hw probe checks whether the kernel recognizes
> TCA_FLOWER_KEY_ENC_FLAGS.
> (2) Per device, if it’s a tunnel and hw offload is enabled, we
> install a temporary flower rule and read it back with tc_get_flower to
> confirm offloaded_state == IN_HW. This checks whether ENC_FLAGS can
> actually be offloaded to hardware.
> Results are cached per device in the new netdev_tc_data structure, and
> we skip probing for system/non-tunnel devices. This avoids false
> positives where the global skip_hw probe succeeds but the device
> supports the key only in software, and prevents unnecessary probes and
> offload attempts on unsupported devices.
>
> Fixes: 3f7af5233a29 ("netdev-offload-tc: Check if TCA_FLOWER_KEY_ENC_FLAGS is supported.")
> Signed-off-by: Yael Chemla <[email protected]>
> ---

Hi Yael,

This is just a quick review.  I didn't spend too much time thinking
about the approach, so consider these basic structural comments rather
than a critique of the approach you are outlining.

>  lib/netdev-offload-tc.c | 123 +++++++++++++++++++++++++++++++++++++---
>  lib/tc.c                |   2 +-
>  lib/tc.h                |   2 +
>  3 files changed, 118 insertions(+), 9 deletions(-)
>
> diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c
> index 804885285..6103b47eb 100644
> --- a/lib/netdev-offload-tc.c
> +++ b/lib/netdev-offload-tc.c
> @@ -79,6 +79,12 @@ struct policer_node {
>      uint32_t police_idx;
>  };
>  
> +/* Per-netdev TC offload data */
> +struct netdev_tc_data {
> +    bool enc_flags_support;     /* Per-netdev enc flags support */

This field has the same name as the global 'enc_flags_support', which
makes the code confusing to read.  Please choose something like
'dev_enc_flags_supported'.

> +    struct ovsthread_once once; /* avoid concurrency per-netdev TC data */
> +};
> +
>  /* ccmap and protective mutex for counting recirculation id (chain) usage. */
>  static struct ovs_mutex used_chains_mutex = OVS_MUTEX_INITIALIZER;
>  static struct ccmap used_chains OVS_GUARDED;
> @@ -88,6 +94,7 @@ static struct ovs_mutex meter_police_ids_mutex = OVS_MUTEX_INITIALIZER;
>  static struct id_pool *meter_police_ids OVS_GUARDED_BY(meter_police_ids_mutex);
>  /* Protects below meter hashmaps. */
>  static struct ovs_mutex meter_mutex = OVS_MUTEX_INITIALIZER;
> +static struct ovs_mutex tc_data_alloc_mutex = OVS_MUTEX_INITIALIZER;
>  static struct hmap meter_id_to_police_idx OVS_GUARDED_BY(meter_mutex)
>      = HMAP_INITIALIZER(&meter_id_to_police_idx);
>  static struct hmap police_idx_to_meter_id OVS_GUARDED_BY(meter_mutex)
> @@ -109,6 +116,9 @@ static void parse_tc_flower_to_stats(struct tc_flower *flower,
>  
>  static int get_ufid_adjust_stats(const ovs_u128 *ufid,
>                                   struct dpif_flow_stats *stats);
> +static void netdev_tc_cleanup_data(struct netdev *netdev);
> +static bool netdev_tc_get_enc_flags_support(struct netdev *netdev);
> +static struct netdev_tc_data * get_netdev_tc_data(struct netdev *netdev);
>  
>  static bool
>  is_internal_port(const char *type)
> @@ -581,6 +591,8 @@ netdev_tc_flow_flush(struct netdev *netdev)
>      }
>      ovs_mutex_unlock(&ufid_lock);
>  
> +    netdev_tc_cleanup_data(netdev);
> +
>      return 0;
>  }
>  
> @@ -2398,7 +2410,7 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match,
>          memset(&tnl_mask->gbp_flags, 0, sizeof tnl_mask->gbp_flags);
>          tnl_mask->flags &= ~FLOW_TNL_F_KEY;
>  
> -        if (enc_flags_support) {
> +        if (netdev_tc_get_enc_flags_support(netdev)) {
>              if (tnl_mask->flags & FLOW_TNL_F_OAM) {
>                  if (tnl->flags & FLOW_TNL_F_OAM) {
>                      flower.key.tunnel.tc_enc_flags |=
> @@ -3026,22 +3038,73 @@ out:
>      tc_add_del_qdisc(ifindex, false, block_id, TC_INGRESS);
>  }
>  
> +/* Helper functions for per-netdev TC data */
> +static struct netdev_tc_data * get_netdev_tc_data(struct netdev *netdev)
> +{
> +    struct netdev_tc_data *tc_data;
> +
> +    /* avoid races on allocation/ovsrcu_set */
> +    ovs_mutex_lock(&tc_data_alloc_mutex);
> +    tc_data = (struct netdev_tc_data *) ovsrcu_get_protected(void *,
> +                                                &netdev->hw_info.offload_data);

This looks weird to me - why not:

tc_data = ovsrcu_get_protected(struct netdev_tc_data *, ...)
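
i.e., something along these lines (untested; just the typed macro call
with the field from your patch):

    tc_data = ovsrcu_get_protected(struct netdev_tc_data *,
                                   &netdev->hw_info.offload_data);

The macro already yields the requested pointer type, so neither the
'void *' type argument nor the explicit cast is needed.  The same goes
for the ovsrcu_get() calls below.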

> +    if (!tc_data) {
> +        tc_data = xzalloc(sizeof *tc_data);
> +        tc_data->enc_flags_support = false;
> +        tc_data->once.done = false;
> +        ovs_mutex_init(&tc_data->once.mutex);

^ Please do something more like:

  tc_data->once = OVSTHREAD_ONCE_INITIALIZER;

But I'm not sure we should be using such a tool to do this kind of
probing.  See below for a comment on why.

As for how to do it differently, one thing you could do is use an atomic
holding different states, e.g.:

OVS_PROBE_UNINIT,
OVS_PROBE_STARTED,
OVS_PROBE_DONE_SUPPORTED,
OVS_PROBE_DONE_UNSUPPORTED,
OVS_PROBE_CANCELLED

or something like that.  Then you always know what state the probe is in
and can advance your checks accordingly.  This is just an example;
please spend a bit of time considering how the synchronization should
work.
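
A very rough sketch of what I mean (not compile-tested; the enum, the
field name and the reworked bool-returning probe_enc_flags_support()
are all made up for illustration):

    enum tc_probe_state {
        OVS_PROBE_UNINIT = 0,       /* xzalloc() leaves the field here. */
        OVS_PROBE_STARTED,
        OVS_PROBE_DONE_SUPPORTED,
        OVS_PROBE_DONE_UNSUPPORTED,
        OVS_PROBE_CANCELLED,
    };

    struct netdev_tc_data {
        atomic_int enc_flags_probe_state;   /* One of enum tc_probe_state. */
    };

    /* In netdev_tc_init_flow_api(): only the thread that wins the CAS
     * runs the probe; everyone else sees STARTED or a DONE_* state. */
    int expected = OVS_PROBE_UNINIT;
    if (atomic_compare_exchange_strong(&tc_data->enc_flags_probe_state,
                                       &expected, OVS_PROBE_STARTED)) {
        bool supported = probe_enc_flags_support(netdev, TC_POLICY_NONE);
        atomic_store(&tc_data->enc_flags_probe_state,
                     supported ? OVS_PROBE_DONE_SUPPORTED
                               : OVS_PROBE_DONE_UNSUPPORTED);
    }

Readers can then atomic_read() the state and fall back to the global
result while it is not yet in a DONE_* state, and the flush/cleanup path
can flip it to OVS_PROBE_CANCELLED instead of tearing down a mutex.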

> +        ovsrcu_set(&netdev->hw_info.offload_data, tc_data);
> +    }
> +    ovs_mutex_unlock(&tc_data_alloc_mutex);
> +    return tc_data;
> +}
> +
> +/* Returns the enc flags support for the given netdev, return global flag if
> + * not probed yet (non tunnel devices) */
> +static bool netdev_tc_get_enc_flags_support(struct netdev *netdev)
> +{
> +    struct netdev_tc_data *tc_data = (struct netdev_tc_data *)
> +                            ovsrcu_get(void *, &netdev->hw_info.offload_data);
> +    return tc_data ? tc_data->enc_flags_support : enc_flags_support;
> +}
> +
> +/* Cleanup the per-netdev TC data */
> +static void netdev_tc_cleanup_data(struct netdev *netdev)
> +{
> +    struct netdev_tc_data *tc_data;
> +
> +    tc_data = (struct netdev_tc_data *)
> +              ovsrcu_get(void *, &netdev->hw_info.offload_data);
> +
> +    if (tc_data) {
> +        ovsrcu_set(&netdev->hw_info.offload_data, NULL);
> +        ovsrcu_postpone(free, tc_data);

Is it possible that we create a race here during flow flushing?
Also, one issue with using an ovsthread_once element shows up here: we
never clean up the mutex embedded in the ovsthread_once object by
calling ovs_mutex_destroy.  The design of that object assumes it stays
valid for the entire life of the program, rather than having a limited
scope.

I suggest re-designing it to use a different data structure.
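
If the per-netdev data does end up keeping a mutex (or the
ovsthread_once), then the postponed callback at least needs to destroy
it before freeing, along the lines of (untested):

    /* Hypothetical helper: tear down the embedded mutex before freeing. */
    static void
    netdev_tc_data_destroy(struct netdev_tc_data *tc_data)
    {
        ovs_mutex_destroy(&tc_data->once.mutex);
        free(tc_data);
    }

    ...
    ovsrcu_postpone(netdev_tc_data_destroy, tc_data);

With an atomic state as suggested above there is nothing to destroy, so
that problem goes away.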

> +    }
> +}
> +
>  static void
> -probe_enc_flags_support(int ifindex)
> +probe_enc_flags_support(struct netdev *netdev, struct netdev_tc_data *tc_data,
> +                        enum tc_offload_policy policy)
>  {
> +    const char *netdev_type = netdev_get_type(netdev);
> +    const char *netdev_name = netdev_get_name(netdev);
>      struct tc_flower flower;
>      struct tcf_id id;
>      int block_id = 0;
>      int prio = TC_RESERVED_PRIORITY_FEATURE_PROBE;
> +    int ifindex;
>      int error;
>  
> +
> +    /* assume ifindex is already checked in calling function */
> +    ifindex = netdev_get_ifindex(netdev);

Why not keep passing the ifindex in as a parameter?
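
i.e., keep something like:

    static void
    probe_enc_flags_support(struct netdev *netdev, int ifindex,
                            struct netdev_tc_data *tc_data,
                            enum tc_offload_policy policy)

netdev_tc_init_flow_api() already has a checked ifindex in hand, so the
probe would not need to call netdev_get_ifindex() again or rely on the
"assume ifindex is already checked" comment.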

>      error = tc_add_del_qdisc(ifindex, true, block_id, TC_INGRESS);
>      if (error) {
>          return;
>      }
>  
>      memset(&flower, 0, sizeof flower);
> -    flower.tc_policy = TC_POLICY_SKIP_HW;
> +
> +    flower.tc_policy = policy;
>      flower.key.eth_type = htons(ETH_P_IP);
>      flower.mask.eth_type = OVS_BE16_MAX;
>      flower.tunnel = true;
> @@ -3049,22 +3112,54 @@ probe_enc_flags_support(int ifindex)
>      flower.mask.tunnel.ipv4.ipv4_src = OVS_BE32_MAX;
>      flower.mask.tunnel.ipv4.ipv4_dst = OVS_BE32_MAX;
>      flower.mask.tunnel.tp_dst = OVS_BE16_MAX;
> -    flower.mask.tunnel.tc_enc_flags = TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT;
> +
> +    /* respect the policy */
> +    if (policy == TC_POLICY_NONE) {
> +        flower.mask.tunnel.tc_enc_flags = TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT |
> +                                          TCA_FLOWER_KEY_FLAGS_TUNNEL_CSUM;
> +        flower.key.tunnel.tc_enc_flags = TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT |
> +                                         TCA_FLOWER_KEY_FLAGS_TUNNEL_CSUM;
> +    } else {
> +        flower.mask.tunnel.tc_enc_flags = TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT;
> +        flower.key.tunnel.tc_enc_flags = TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT;
> +    }
> +
>      flower.key.tunnel.ipv4.ipv4_src = htonl(0x01010101);
>      flower.key.tunnel.ipv4.ipv4_dst = htonl(0x01010102);
>      flower.key.tunnel.tp_dst = htons(46354);
> -    flower.key.tunnel.tc_enc_flags = TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT;
>  
>      id = tc_make_tcf_id(ifindex, block_id, prio, TC_INGRESS);
> +    flower.probing = true;
>      error = tc_replace_flower(&id, &flower);
>      if (error) {
> +        VLOG_INFO("probe tc: enc flags are not supported for %s device %s.",
> +                  netdev_type, netdev_name);
>          goto out;
>      }
>  
> +    /* Query the newly-added rule, check its offload state before deletion
> +     * when policy is TC_POLICY_NONE it's the case of tunnel device */
> +    if (policy == TC_POLICY_NONE) {
> +        struct tc_flower flower_state;
> +
> +        memset(&flower_state, 0, sizeof flower_state);
> +        if (!tc_get_flower(&id, &flower_state)) {
> +            tc_data->enc_flags_support =
> +                (flower_state.offloaded_state == TC_OFFLOADED_STATE_IN_HW);
> +        }
> +    } else {
> +        tc_data->enc_flags_support = true;
> +    }
> +
>      tc_del_flower_filter(&id);
>  
> -    enc_flags_support = true;
> -    VLOG_INFO("probe tc: enc flags are supported.");
> +    if (tc_data->enc_flags_support) {
> +        VLOG_INFO("probe tc: enc flags with tc_policy(%d) are offloaded for %s device %s.",
> +                  policy, netdev_type, netdev_name);
> +    } else {
> +        VLOG_INFO("probe tc: enc flags with tc_policy(%d) are not offloaded for %s device %s.",
> +                  policy, netdev_type, netdev_name);
> +    }
>  out:
>      tc_add_del_qdisc(ifindex, false, block_id, TC_INGRESS);
>  }
> @@ -3152,7 +3247,9 @@ netdev_tc_init_flow_api(struct netdev *netdev)
>  {
>      static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
>      enum tc_qdisc_hook hook = get_tc_qdisc_hook(netdev);
> +    struct netdev_tc_data dummy_tcdata = {0};
>      static bool get_chain_supported = true;
> +    struct netdev_tc_data *tc_data;
>      uint32_t block_id = 0;
>      struct tcf_id id;
>      int ifindex;
> @@ -3199,7 +3296,8 @@ netdev_tc_init_flow_api(struct netdev *netdev)
>          probe_multi_mask_per_prio(ifindex);
>          probe_ct_state_support(ifindex);
>          probe_vxlan_gbp_support(ifindex);
> -        probe_enc_flags_support(ifindex);
> +        probe_enc_flags_support(netdev, &dummy_tcdata, TC_POLICY_SKIP_HW);
> +        enc_flags_support = dummy_tcdata.enc_flags_support;
>  
>          ovs_mutex_lock(&meter_police_ids_mutex);
>          meter_police_ids = id_pool_create(METER_POLICE_IDS_BASE,
> @@ -3211,6 +3309,15 @@ netdev_tc_init_flow_api(struct netdev *netdev)
>          ovsthread_once_done(&once);
>      }
>  
> +    /* probe if tunnel device and hw offload is enabled */
> +    if (netdev_get_tunnel_config(netdev) && netdev_is_flow_api_enabled()) {
> +        tc_data = get_netdev_tc_data(netdev);
> +        if (ovsthread_once_start(&tc_data->once)) {
> +            probe_enc_flags_support(netdev, tc_data, TC_POLICY_NONE);
> +            ovsthread_once_done(&tc_data->once);
> +        }
> +    }
> +
>      error = tc_add_del_qdisc(ifindex, true, block_id, hook);
>  
>      if (error && error != EEXIST) {
> diff --git a/lib/tc.c b/lib/tc.c
> index a5f9bc1c1..6367333a2 100644
> --- a/lib/tc.c
> +++ b/lib/tc.c
> @@ -3834,7 +3834,7 @@ nl_msg_put_flower_options(struct ofpbuf *request, struct tc_flower *flower)
>          }
>      }
>  
> -    if (policy == TC_POLICY_NONE) {
> +    if (!flower->probing && policy == TC_POLICY_NONE) {
>          policy = tc_policy;
>      }
>  
> diff --git a/lib/tc.h b/lib/tc.h
> index 883e21f4c..e30f2f582 100644
> --- a/lib/tc.h
> +++ b/lib/tc.h
> @@ -403,6 +403,8 @@ struct tc_flower {
>      enum tc_offloaded_state offloaded_state;
>      /* Used to force skip_hw when probing tc features. */
>      enum tc_offload_policy tc_policy;
> +    /* Used to indicate that the flower is being probed. */
> +    bool probing;
>  };
>  
>  int tc_replace_flower(struct tcf_id *id, struct tc_flower *flower);
