On 25 Mar 2022, at 18:05, Tao Liu wrote:

> On Fri, Mar 25, 2022 at 12:08:20PM +0100, Eelco Chaudron wrote:
>>
>>
>> On 22 Mar 2022, at 13:04, Tao Liu wrote:
>>
>>> If netdev goes away(i.e. qemu with a vnet shutdown), kernel delete all tc
>>> filters, those tcf_id related to the netdev will be left in ufid_to_tc
>>> hashmap.
>>> When qemu restart with a same vnet but different ifindex assigned,
>>> a dup ufid may add. Especially after hashmap_expand, the old entry will
>>> insert before the new one, delete or modify tc flow will always fail.
>>>
>>> So delete the stale entry to avoid a duplicated ufid in hashmap.
>>
>> There are some comments below, as I still do not fully understand the fix.
>>
> Please let me explain more.
> As the scenery of qemu restart described above, filters are flushed by
> kernel, but tcf_ids remain in ufid_to_tc hashmap.
>
> Considering a ping running continuously from other host to VM, the first
> reply from VM triggers upcall after qemu restart. A same ufid is generated
> if the request processed on same handler. At this time, an old ufid exists
> in hashmap, and flow del returns ENODEV(ifindex is wrong).

Can this not be solved by modifying the previous fix to this?

--- a/lib/netdev-offload-tc.c
+++ b/lib/netdev-offload-tc.c
@@ -198,7 +198,9 @@ del_filter_and_ufid_mapping(struct tcf_id *id, const 
ovs_u128 *ufid)
     int err;

     err = tc_del_filter(id);
-    if (!err) {
+    if (!err || err = ENODEV) {
        del_ufid_tc_mapping(ufid);
     }
     return err;
 }


> However flow put
> still succeeds because there is no filter in kernel. And a dup ufid adds in
> hashmap.
>
> After hashmap_expand, the old entry inserts before the new one.
>
> If flow expires, revalidator fails to del flow bacause we always get the
> old entry. There are constant warnings in ovs-vswitch.log:
>
>     dpif(revalidator49)|WARN|system@ovs-system: failed to flow_del (No such
>     file or directory) ufid:d81939b3-8c8d-4b35-8d58-88c098709b91 ...
>
> In case of flow mod, flow del inside flow put still fails with ENODEV, and
> tc_replace_flower also fails because old filter exists in kernel.
>
> The idea of this patch is that, if a filter can't get from kernel, we can
> just delete the mapping, and no need to call del filter as it does not
> exist. In this way, we can keep consistent between kernel and userspace.

Thanks for the details, and I think the problem is a bit more complex.

As in theory duplicate ufid's could exist (guess even on the same port ;).
We could also fix this by including the interface in the lookup, in 
get_ufid_tc_mapping()?
For example something like this:

diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c
index 12d0a9af3..8be3c5dc3 100644
--- a/lib/netdev-offload-tc.c
+++ b/lib/netdev-offload-tc.c
@@ -239,14 +239,15 @@ add_ufid_tc_mapping(struct netdev *netdev, const ovs_u128 
*ufid,
  * Otherwise returns the error.
  */
 static int
-get_ufid_tc_mapping(const ovs_u128 *ufid, struct tcf_id *id)
+get_ufid_tc_mapping(struct netdev *netdev, const ovs_u128 *ufid,
+                    struct tcf_id *id)
 {
     size_t ufid_hash = hash_bytes(ufid, sizeof *ufid, 0);
     struct ufid_tc_data *data;

     ovs_mutex_lock(&ufid_lock);
     HMAP_FOR_EACH_WITH_HASH (data, ufid_to_tc_node, ufid_hash, &ufid_to_tc) {
-        if (ovs_u128_equals(*ufid, data->ufid)) {
+        if (netdev == data->netdev && ovs_u128_equals(*ufid, data->ufid) ) {
             *id = data->id;
             ovs_mutex_unlock(&ufid_lock);
             return 0;
@@ -1943,7 +1944,7 @@ netdev_tc_flow_put(struct netdev *netdev, struct match 
*match,
         return EOPNOTSUPP;
     }

-    if (get_ufid_tc_mapping(ufid, &id) == 0) {
+    if (get_ufid_tc_mapping(netdev, ufid, &id) == 0) {
         VLOG_DBG_RL(&rl, "updating old handle: %d prio: %d",
                     id.handle, id.prio);
         info->tc_modify_flow_deleted = !del_filter_and_ufid_mapping(&id, ufid);
@@ -1986,7 +1987,7 @@ netdev_tc_flow_get(struct netdev *netdev,
     struct tcf_id id;
     int err;

-    err = get_ufid_tc_mapping(ufid, &id);
+    err = get_ufid_tc_mapping(netdev, ufid, &id);
     if (err) {
         return err;
     }
@@ -2013,7 +2014,7 @@ netdev_tc_flow_get(struct netdev *netdev,
 }

 static int
-netdev_tc_flow_del(struct netdev *netdev OVS_UNUSED,
+netdev_tc_flow_del(struct netdev *netdev,
                    const ovs_u128 *ufid,
                    struct dpif_flow_stats *stats)
 {
@@ -2021,7 +2022,7 @@ netdev_tc_flow_del(struct netdev *netdev OVS_UNUSED,
     struct tcf_id id;
     int error;

-    error = get_ufid_tc_mapping(ufid, &id);
+    error = get_ufid_tc_mapping(netdev, ufid, &id);
     if (error) {
         return error;
     }

Hover this does not fix the root cause (same with your fix below). We need a 
way to delete these stale mappings. Maybe we can watch some netlink messages, 
or re-use the netdev layer, but I think we hold a reference in the mapping so 
we won't delete either.

I think this is what we should try to fix, not working around the leftover 
entries.

>>> Fixes: dd8ca104acd7 ("netdev-tc-offloads: Don't delete ufid mapping if fail 
>>> to delete filter")))
>>> Signed-off-by: Tao Liu <thomas....@ucloud.cn>
>>> ---
>>>  lib/netdev-offload-tc.c | 37 ++++++++++++++++++++++---------------
>>>  1 file changed, 22 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c
>>> index 12d0a9af3..e9dd66790 100644
>>> --- a/lib/netdev-offload-tc.c
>>> +++ b/lib/netdev-offload-tc.c
>>> @@ -167,6 +167,9 @@ struct ufid_tc_data {
>>>      struct netdev *netdev;
>>>  };
>>>
>>> +static void parse_tc_flower_to_stats(struct tc_flower *flower,
>>> +                                     struct dpif_flow_stats *stats);
>>> +
>>>  static void
>>>  del_ufid_tc_mapping_unlocked(const ovs_u128 *ufid)
>>>  {
>>> @@ -200,11 +203,24 @@ del_ufid_tc_mapping(const ovs_u128 *ufid)
>>>
>>>  /* Wrapper function to delete filter and ufid tc mapping */
>>>  static int
>>> -del_filter_and_ufid_mapping(struct tcf_id *id, const ovs_u128 *ufid)
>>> +del_filter_and_ufid_mapping(struct tcf_id *id, const ovs_u128 *ufid,
>>> +                            struct dpif_flow_stats *stats)
>>>  {
>>> -    int err;
>>> +    struct tc_flower flower;
>>> +    int err = 0, get_err;
>>> +
>>> +    get_err = tc_get_flower(id, &flower);
>>> +    if (!get_err) {
>>
>> We should definitely add a comment here explaining the reasoning behind 
>> doing the get first and then the delete!
>>
> Will add it.
>
>> However, it’s still not clear to me why this change does not undo, 
>> dd8ca104acd7?
>>
>> Because tc_get_flower() just finds out if the entry with the ID exists in 
>> the kernel, and than only try to delete it.
>>
>> For this can you just not check a specific return value in the 
>> tc_del_filter() call?
>>
>> I assume dd8ca104acd7 just want to skip removing the del_ufid_tc_mapping() 
>> if the entry does NOT exist in the kernel?
>> If not maybe Jianbo can explain a bit more the corner case, or maybe has a 
>> test case for this?
>>
> commit dd8ca104acd7 prevents the case of mapping deleted but filter exists
> in kernel(del fails). This patch includes both caeses of the inconsistency.
>
>>> +        err = tc_del_filter(id);
>>> +    }
>>> +
>>> +    if (stats) {
>>> +        memset(stats, 0, sizeof *stats);
>>> +        if (!get_err) {
>>> +            parse_tc_flower_to_stats(&flower, stats);
>>> +        }
>>> +    }
>>
>> I guess this was moved here to avoid the additional tc_get_flower() call 
>> below.
>>
> Yes, it is.
>
>> I would fold the stats part in the above if() case to avoid the need for the 
>> get_err variable, i.e.
>>
>>     if (!tc_get_flower(id, &flower)) {
>>       err = tc_del_filter(id);
>>       if (stats) {
>>         memset(stats, 0, sizeof *stats);
>>         if (!get_err) {
>>             parse_tc_flower_to_stats(&flower, stats);
>>         }
>>       }
>>
> We also need to zero stats if tc_get_flower fails.
>
>>
>>>
>>> -    err = tc_del_filter(id);
>>>      if (!err) {
>>>          del_ufid_tc_mapping(ufid);
>>>      }
>>> @@ -1946,7 +1962,8 @@ netdev_tc_flow_put(struct netdev *netdev, struct 
>>> match *match,
>>>      if (get_ufid_tc_mapping(ufid, &id) == 0) {
>>>          VLOG_DBG_RL(&rl, "updating old handle: %d prio: %d",
>>>                      id.handle, id.prio);
>>> -        info->tc_modify_flow_deleted = !del_filter_and_ufid_mapping(&id, 
>>> ufid);
>>> +        info->tc_modify_flow_deleted =
>>> +                    !del_filter_and_ufid_mapping(&id, ufid, NULL);
>>>      }
>>>
>>>      prio = get_prio_for_tc_flower(&flower);
>>> @@ -2017,7 +2034,6 @@ netdev_tc_flow_del(struct netdev *netdev OVS_UNUSED,
>>>                     const ovs_u128 *ufid,
>>>                     struct dpif_flow_stats *stats)
>>>  {
>>> -    struct tc_flower flower;
>>>      struct tcf_id id;
>>>      int error;
>>>
>>> @@ -2026,16 +2042,7 @@ netdev_tc_flow_del(struct netdev *netdev OVS_UNUSED,
>>>          return error;
>>>      }
>>>
>>> -    if (stats) {
>>> -        memset(stats, 0, sizeof *stats);
>>> -        if (!tc_get_flower(&id, &flower)) {
>>> -            parse_tc_flower_to_stats(&flower, stats);
>>> -        }
>>> -    }
>>> -
>>> -    error = del_filter_and_ufid_mapping(&id, ufid);
>>> -
>>> -    return error;
>>> +    return del_filter_and_ufid_mapping(&id, ufid, stats);
>>>  }
>>>
>>>  static int
>>> -- 
>>> 2.23.0
>>>
>>> _______________________________________________
>>> dev mailing list
>>> d...@openvswitch.org
>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>
>>
>
> --
> Best regards,
> Tao Liu

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to