On Mon, Mar 28, 2022 at 04:08:29PM +0200, Eelco Chaudron wrote:
> On Mon, Mar 28, 2022 at 3:15 PM Eelco Chaudron <echau...@redhat.com> wrote:
> 
> >
> >
> > On 25 Mar 2022, at 18:05, Tao Liu wrote:
> >
> > > On Fri, Mar 25, 2022 at 12:08:20PM +0100, Eelco Chaudron wrote:
> > >>
> > >>
> > >> On 22 Mar 2022, at 13:04, Tao Liu wrote:
> > >>
> > >>> If netdev goes away(i.e. qemu with a vnet shutdown), kernel delete all
> > tc
> > >>> filters, those tcf_id related to the netdev will be left in ufid_to_tc
> > >>> hashmap.
> > >>> When qemu restart with a same vnet but different ifindex assigned,
> > >>> a dup ufid may add. Especially after hashmap_expand, the old entry will
> > >>> insert before the new one, delete or modify tc flow will always fail.
> > >>>
> > >>> So delete the stale entry to avoid a duplicated ufid in hashmap.
> > >>
> > >> There are some comments below, as I still do not fully understand the
> > fix.
> > >>
> > > Please let me explain more.
> > > As the scenery of qemu restart described above, filters are flushed by
> > > kernel, but tcf_ids remain in ufid_to_tc hashmap.
> > >
> > > Considering a ping running continuously from other host to VM, the first
> > > reply from VM triggers upcall after qemu restart. A same ufid is
> > generated
> > > if the request processed on same handler. At this time, an old ufid
> > exists
> > > in hashmap, and flow del returns ENODEV(ifindex is wrong).
> >
> > Can this not be solved by modifying the previous fix to this?
> >
> > --- a/lib/netdev-offload-tc.c
> > +++ b/lib/netdev-offload-tc.c
> > @@ -198,7 +198,9 @@ del_filter_and_ufid_mapping(struct tcf_id *id, const
> > ovs_u128 *ufid)
> >      int err;
> >
> >      err = tc_del_filter(id);
> > -    if (!err) {
> > +    if (!err || err = ENODEV) {
> >         del_ufid_tc_mapping(ufid);
> >      }
> >      return err;
> >  }
> >
> >
> > > However flow put
> > > still succeeds because there is no filter in kernel. And a dup ufid adds
> > in
> > > hashmap.
> > >
> > > After hashmap_expand, the old entry inserts before the new one.
> > >
> > > If flow expires, revalidator fails to del flow bacause we always get the
> > > old entry. There are constant warnings in ovs-vswitch.log:
> > >
> > >     dpif(revalidator49)|WARN|system@ovs-system: failed to flow_del (No
> > such
> > >     file or directory) ufid:d81939b3-8c8d-4b35-8d58-88c098709b91 ...
> > >
> > > In case of flow mod, flow del inside flow put still fails with ENODEV,
> > and
> > > tc_replace_flower also fails because old filter exists in kernel.
> > >
> > > The idea of this patch is that, if a filter can't get from kernel, we can
> > > just delete the mapping, and no need to call del filter as it does not
> > > exist. In this way, we can keep consistent between kernel and userspace.
> >
> > Thanks for the details, and I think the problem is a bit more complex.
> >
> > As in theory duplicate ufid's could exist (guess even on the same port ;).
> > We could also fix this by including the interface in the lookup, in
> > get_ufid_tc_mapping()?
> >
> 
> Not sure how OVS deals with the uniqueness of the ufid, if they assume its
> unique within the system, the fix can be simplified to something like this:
> 
> --- a/lib/netdev-offload-tc.c
> +++ b/lib/netdev-offload-tc.c
> 
> @@ -218,8 +222,21 @@ add_ufid_tc_mapping(struct netdev *netdev, const
> ovs_u128 *ufid,
>  {
>      struct ufid_tc_data *new_data = xzalloc(sizeof *new_data);
>      size_t ufid_hash = hash_bytes(ufid, sizeof *ufid, 0);
> +    struct tcf_id existing_id;
>      size_t tc_hash;
> 
> +    /* ADD COMMENT WHY ITS SAFE TO DELETE THIS. */
> +    if (!get_ufid_tc_mapping(ufid, &existing_id)) {
> +        del_ufid_tc_mapping(ufid);
> +    }
> +
>      tc_hash = hash_int(hash_int(id->prio, id->handle), id->ifindex);
>      tc_hash = hash_int(id->chain, tc_hash);
> 
> 
This fix works.
And I make a minor modification to avoid multiple hash computation.

--- a/lib/netdev-offload-tc.c
+++ b/lib/netdev-offload-tc.c
@@ -216,6 +216,7 @@ static void
 add_ufid_tc_mapping(struct netdev *netdev, const ovs_u128 *ufid,
                     struct tcf_id *id)
 {
+    struct ufid_tc_data *data;
     struct ufid_tc_data *new_data = xzalloc(sizeof *new_data);
     size_t ufid_hash = hash_bytes(ufid, sizeof *ufid, 0);
     size_t tc_hash;
@@ -228,6 +229,16 @@ add_ufid_tc_mapping(struct netdev *netdev, const ovs_u128 
*ufid,
     new_data->netdev = netdev_ref(netdev);
 
     ovs_mutex_lock(&ufid_lock);
+    /* Remove the stale ufid mapping to avoid a dup entry.
+     * ufid is unique in ovs which guaranteed by ukey_install__(), so it's
+     * safe to delete here. */
+    HMAP_FOR_EACH_WITH_HASH (data, ufid_to_tc_node, ufid_hash, &ufid_to_tc) {
+        if (ovs_u128_equals(*ufid, data->ufid)) {
+            del_ufid_tc_mapping_unlocked(ufid);
+            break;
+        }
+    }
+
     hmap_insert(&ufid_to_tc, &new_data->ufid_to_tc_node, ufid_hash);
     hmap_insert(&tc_to_ufid, &new_data->tc_to_ufid_node, tc_hash);
     ovs_mutex_unlock(&ufid_lock);

But it's still a little confused, because we have called
get_ufid_tc_mapping() and del_filter_and_ufid_mapping() in flow put.
Watching netlink message sounds a good idea as you said.

> 
> > For example something like this:
> >
> > diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c
> > index 12d0a9af3..8be3c5dc3 100644
> > --- a/lib/netdev-offload-tc.c
> > +++ b/lib/netdev-offload-tc.c
> > @@ -239,14 +239,15 @@ add_ufid_tc_mapping(struct netdev *netdev, const
> > ovs_u128 *ufid,
> >   * Otherwise returns the error.
> >   */
> >  static int
> > -get_ufid_tc_mapping(const ovs_u128 *ufid, struct tcf_id *id)
> > +get_ufid_tc_mapping(struct netdev *netdev, const ovs_u128 *ufid,
> > +                    struct tcf_id *id)
> >  {
> >      size_t ufid_hash = hash_bytes(ufid, sizeof *ufid, 0);
> >      struct ufid_tc_data *data;
> >
> >      ovs_mutex_lock(&ufid_lock);
> >      HMAP_FOR_EACH_WITH_HASH (data, ufid_to_tc_node, ufid_hash,
> > &ufid_to_tc) {
> > -        if (ovs_u128_equals(*ufid, data->ufid)) {
> > +        if (netdev == data->netdev && ovs_u128_equals(*ufid, data->ufid)
> > ) {
> >              *id = data->id;
> >              ovs_mutex_unlock(&ufid_lock);
> >              return 0;
> > @@ -1943,7 +1944,7 @@ netdev_tc_flow_put(struct netdev *netdev, struct
> > match *match,
> >          return EOPNOTSUPP;
> >      }
> >
> > -    if (get_ufid_tc_mapping(ufid, &id) == 0) {
> > +    if (get_ufid_tc_mapping(netdev, ufid, &id) == 0) {
> >          VLOG_DBG_RL(&rl, "updating old handle: %d prio: %d",
> >                      id.handle, id.prio);
> >          info->tc_modify_flow_deleted = !del_filter_and_ufid_mapping(&id,
> > ufid);
> > @@ -1986,7 +1987,7 @@ netdev_tc_flow_get(struct netdev *netdev,
> >      struct tcf_id id;
> >      int err;
> >
> > -    err = get_ufid_tc_mapping(ufid, &id);
> > +    err = get_ufid_tc_mapping(netdev, ufid, &id);
> >      if (err) {
> >          return err;
> >      }
> > @@ -2013,7 +2014,7 @@ netdev_tc_flow_get(struct netdev *netdev,
> >  }
> >
> >  static int
> > -netdev_tc_flow_del(struct netdev *netdev OVS_UNUSED,
> > +netdev_tc_flow_del(struct netdev *netdev,
> >                     const ovs_u128 *ufid,
> >                     struct dpif_flow_stats *stats)
> >  {
> > @@ -2021,7 +2022,7 @@ netdev_tc_flow_del(struct netdev *netdev OVS_UNUSED,
> >      struct tcf_id id;
> >      int error;
> >
> > -    error = get_ufid_tc_mapping(ufid, &id);
> > +    error = get_ufid_tc_mapping(netdev, ufid, &id);
> >      if (error) {
> >          return error;
> >      }
> >
> > Hover this does not fix the root cause (same with your fix below). We need
> > a way to delete these stale mappings. Maybe we can watch some netlink
> > messages, or re-use the netdev layer, but I think we hold a reference in
> > the mapping so we won't delete either.
> >
> > I think this is what we should try to fix, not working around the leftover
> > entries.
> >
> > >>> Fixes: dd8ca104acd7 ("netdev-tc-offloads: Don't delete ufid mapping if
> > fail to delete filter")))
> > >>> Signed-off-by: Tao Liu <thomas....@ucloud.cn>
> > >>> ---
> > >>>  lib/netdev-offload-tc.c | 37 ++++++++++++++++++++++---------------
> > >>>  1 file changed, 22 insertions(+), 15 deletions(-)
> > >>>
> > >>> diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c
> > >>> index 12d0a9af3..e9dd66790 100644
> > >>> --- a/lib/netdev-offload-tc.c
> > >>> +++ b/lib/netdev-offload-tc.c
> > >>> @@ -167,6 +167,9 @@ struct ufid_tc_data {
> > >>>      struct netdev *netdev;
> > >>>  };
> > >>>
> > >>> +static void parse_tc_flower_to_stats(struct tc_flower *flower,
> > >>> +                                     struct dpif_flow_stats *stats);
> > >>> +
> > >>>  static void
> > >>>  del_ufid_tc_mapping_unlocked(const ovs_u128 *ufid)
> > >>>  {
> > >>> @@ -200,11 +203,24 @@ del_ufid_tc_mapping(const ovs_u128 *ufid)
> > >>>
> > >>>  /* Wrapper function to delete filter and ufid tc mapping */
> > >>>  static int
> > >>> -del_filter_and_ufid_mapping(struct tcf_id *id, const ovs_u128 *ufid)
> > >>> +del_filter_and_ufid_mapping(struct tcf_id *id, const ovs_u128 *ufid,
> > >>> +                            struct dpif_flow_stats *stats)
> > >>>  {
> > >>> -    int err;
> > >>> +    struct tc_flower flower;
> > >>> +    int err = 0, get_err;
> > >>> +
> > >>> +    get_err = tc_get_flower(id, &flower);
> > >>> +    if (!get_err) {
> > >>
> > >> We should definitely add a comment here explaining the reasoning behind
> > doing the get first and then the delete!
> > >>
> > > Will add it.
> > >
> > >> However, it’s still not clear to me why this change does not undo,
> > dd8ca104acd7?
> > >>
> > >> Because tc_get_flower() just finds out if the entry with the ID exists
> > in the kernel, and than only try to delete it.
> > >>
> > >> For this can you just not check a specific return value in the
> > tc_del_filter() call?
> > >>
> > >> I assume dd8ca104acd7 just want to skip removing the
> > del_ufid_tc_mapping() if the entry does NOT exist in the kernel?
> > >> If not maybe Jianbo can explain a bit more the corner case, or maybe
> > has a test case for this?
> > >>
> > > commit dd8ca104acd7 prevents the case of mapping deleted but filter
> > exists
> > > in kernel(del fails). This patch includes both caeses of the
> > inconsistency.
> > >
> > >>> +        err = tc_del_filter(id);
> > >>> +    }
> > >>> +
> > >>> +    if (stats) {
> > >>> +        memset(stats, 0, sizeof *stats);
> > >>> +        if (!get_err) {
> > >>> +            parse_tc_flower_to_stats(&flower, stats);
> > >>> +        }
> > >>> +    }
> > >>
> > >> I guess this was moved here to avoid the additional tc_get_flower()
> > call below.
> > >>
> > > Yes, it is.
> > >
> > >> I would fold the stats part in the above if() case to avoid the need
> > for the get_err variable, i.e.
> > >>
> > >>     if (!tc_get_flower(id, &flower)) {
> > >>       err = tc_del_filter(id);
> > >>       if (stats) {
> > >>         memset(stats, 0, sizeof *stats);
> > >>         if (!get_err) {
> > >>             parse_tc_flower_to_stats(&flower, stats);
> > >>         }
> > >>       }
> > >>
> > > We also need to zero stats if tc_get_flower fails.
> > >
> > >>
> > >>>
> > >>> -    err = tc_del_filter(id);
> > >>>      if (!err) {
> > >>>          del_ufid_tc_mapping(ufid);
> > >>>      }
> > >>> @@ -1946,7 +1962,8 @@ netdev_tc_flow_put(struct netdev *netdev, struct
> > match *match,
> > >>>      if (get_ufid_tc_mapping(ufid, &id) == 0) {
> > >>>          VLOG_DBG_RL(&rl, "updating old handle: %d prio: %d",
> > >>>                      id.handle, id.prio);
> > >>> -        info->tc_modify_flow_deleted =
> > !del_filter_and_ufid_mapping(&id, ufid);
> > >>> +        info->tc_modify_flow_deleted =
> > >>> +                    !del_filter_and_ufid_mapping(&id, ufid, NULL);
> > >>>      }
> > >>>
> > >>>      prio = get_prio_for_tc_flower(&flower);
> > >>> @@ -2017,7 +2034,6 @@ netdev_tc_flow_del(struct netdev *netdev
> > OVS_UNUSED,
> > >>>                     const ovs_u128 *ufid,
> > >>>                     struct dpif_flow_stats *stats)
> > >>>  {
> > >>> -    struct tc_flower flower;
> > >>>      struct tcf_id id;
> > >>>      int error;
> > >>>
> > >>> @@ -2026,16 +2042,7 @@ netdev_tc_flow_del(struct netdev *netdev
> > OVS_UNUSED,
> > >>>          return error;
> > >>>      }
> > >>>
> > >>> -    if (stats) {
> > >>> -        memset(stats, 0, sizeof *stats);
> > >>> -        if (!tc_get_flower(&id, &flower)) {
> > >>> -            parse_tc_flower_to_stats(&flower, stats);
> > >>> -        }
> > >>> -    }
> > >>> -
> > >>> -    error = del_filter_and_ufid_mapping(&id, ufid);
> > >>> -
> > >>> -    return error;
> > >>> +    return del_filter_and_ufid_mapping(&id, ufid, stats);
> > >>>  }
> > >>>
> > >>>  static int
> > >>> --
> > >>> 2.23.0
> > >>>
> > >>> _______________________________________________
> > >>> dev mailing list
> > >>> d...@openvswitch.org
> > >>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> > >>
> > >>
> > >
> > > --
> > > Best regards,
> > > Tao Liu
> >
_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to