On 9/10/25 4:04 PM, Ales Musil wrote:
> On Wed, Sep 10, 2025 at 3:04 PM Ales Musil <[email protected]> wrote:
> 
>>
>>
>> On Wed, Sep 10, 2025 at 9:44 AM Dumitru Ceara <[email protected]> wrote:
>>
>>> On 9/10/25 7:59 AM, Ales Musil via dev wrote:
>>>> We were removing routes while still iterating over them,
>>>> while it might work it's not a good practice. Postpone
>>>> the removal after the dump is done.
>>>>
>>>> Reported-at: https://issues.redhat.com/browse/FDP-1594
>>>> Signed-off-by: Ales Musil <[email protected]>
>>>> ---
>>>
>>> Hi Ales,
>>>
>>> Thanks for the fix!
>>>
>>> There's one CI test failure in the ovsrobot run of this patch but it's
>>> not related to the change:
>>>
>>> https://github.com/ovsrobot/ovn/actions/runs/17604844865/job/50013775059#step:11:4605
>>
>>
>> Thank you Dumitru,
>>
>> it seems that we have a new flake.
>>
>>
>>>
>>>
>>>>  controller/route-exchange-netlink.c | 70 ++++++++++++++++++++---------
>>>>  1 file changed, 50 insertions(+), 20 deletions(-)
>>>>
>>>> diff --git a/controller/route-exchange-netlink.c
>>> b/controller/route-exchange-netlink.c
>>>> index cec4b1ec3..91f059492 100644
>>>> --- a/controller/route-exchange-netlink.c
>>>> +++ b/controller/route-exchange-netlink.c
>>>> @@ -200,9 +200,9 @@ struct route_msg_handle_data {
>>>>      const struct sbrec_datapath_binding *db;
>>>>      struct hmapx *routes_to_advertise;
>>>>      struct vector *learned_routes;
>>>> +    struct vector *stale_routes;
>>>>      const struct hmap *routes;
>>>>      uint32_t table_id; /* requested table id. */
>>>> -    int ret;
>>>>  };
>>>>
>>>>  static void
>>>> @@ -211,7 +211,6 @@ handle_route_msg(const struct route_table_msg *msg,
>>> void *data)
>>>>      struct route_msg_handle_data *handle_data = data;
>>>>      const struct route_data *rd = &msg->rd;
>>>>      struct advertise_route_entry *ar;
>>>> -    int err;
>>>>
>>>>      if (handle_data->table_id != rd->rta_table_id) {
>>>>          /* We do not have the NLM_F_DUMP_FILTERED info here, so check
>>> if the
>>>> @@ -261,23 +260,37 @@ handle_route_msg(const struct route_table_msg
>>> *msg, void *data)
>>>>              }
>>>>          }
>>>>      }
>>>> -    err = re_nl_delete_route(rd->rta_table_id, &rd->rta_dst,
>>>> -                             rd->rtm_dst_len, rd->rta_priority);
>>>> -    if (err) {
>>>> -        char addr_s[INET6_ADDRSTRLEN + 1];
>>>> -        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
>>>> -        VLOG_WARN_RL(&rl, "Delete route table_id=%"PRIu32" dst=%s
>>> plen=%d "
>>>> -                     "failed: %s", rd->rta_table_id,
>>>> -                     ipv6_string_mapped(
>>>> -                         addr_s, &rd->rta_dst) ? addr_s : "(invalid)",
>>>> -                     rd->rtm_dst_len,
>>>> -                     ovs_strerror(err));
>>>> -
>>>> -        if (!handle_data->ret) {
>>>> -            /* Report the first error value to the caller. */
>>>> -            handle_data->ret = err;
>>>> +
>>>> +    if (handle_data->stale_routes) {
>>>> +        vector_push(handle_data->stale_routes, rd);
>>>> +    }
>>>> +}
>>>> +
>>>> +static int
>>>> +re_nl_delete_stale_routes(const struct vector *stale_routes)
>>>> +{
>>>> +    int ret = 0;
>>>> +
>>>> +    const struct route_data *rd;
>>>> +    VECTOR_FOR_EACH_PTR (stale_routes, rd) {
>>>> +        int err = re_nl_delete_route(rd->rta_table_id, &rd->rta_dst,
>>>> +                                     rd->rtm_dst_len,
>>> rd->rta_priority);
>>>> +        if (err) {
>>>> +            char addr_s[INET6_ADDRSTRLEN + 1];
>>>> +            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5,
>>> 20);
>>>> +            VLOG_WARN_RL(&rl, "Delete route table_id=%"PRIu32" dst=%s
>>> plen=%d "
>>>> +                         "failed: %s", rd->rta_table_id,
>>>> +                         ipv6_string_mapped(
>>>> +                             addr_s, &rd->rta_dst) ? addr_s :
>>> "(invalid)",
>>>> +                         rd->rtm_dst_len,
>>>> +                         ovs_strerror(err));
>>>> +            if (!ret) {
>>>> +                ret = err;
>>>> +            }
>>>>          }
>>>>      }
>>>> +
>>>> +    return ret;
>>>>  }
>>>>
>>>>  int
>>>> @@ -286,8 +299,9 @@ re_nl_sync_routes(uint32_t table_id, const struct
>>> hmap *routes,
>>>>                    const struct sbrec_datapath_binding *db)
>>>>  {
>>>>      struct hmapx routes_to_advertise =
>>> HMAPX_INITIALIZER(&routes_to_advertise);
>>>> +    struct vector stale_routes = VECTOR_EMPTY_INITIALIZER(struct
>>> route_data);
>>>>      struct advertise_route_entry *ar;
>>>> -    int ret;
>>>> +    int ret = 0;
>>>>
>>>>      HMAP_FOR_EACH (ar, node, routes) {
>>>>          hmapx_add(&routes_to_advertise, ar);
>>>> @@ -300,11 +314,11 @@ re_nl_sync_routes(uint32_t table_id, const struct
>>> hmap *routes,
>>>>          .routes = routes,
>>>>          .routes_to_advertise = &routes_to_advertise,
>>>>          .learned_routes = learned_routes,
>>>> +        .stale_routes = &stale_routes,
>>>>          .db = db,
>>>>          .table_id = table_id,
>>>>      };
>>>>      route_table_dump_one_table(table_id, handle_route_msg, &data);
>>>> -    ret = data.ret;
>>>>
>>>>      /* Add any remaining routes in the routes_to_advertise hmapx to the
>>>>       * system routing table. */
>>>> @@ -328,7 +342,14 @@ re_nl_sync_routes(uint32_t table_id, const struct
>>> hmap *routes,
>>>>              }
>>>>          }
>>>>      }
>>>> +
>>>> +    int err = re_nl_delete_stale_routes(&stale_routes);
>>>> +    if (!ret) {
>>>> +        ret = err;
>>>> +    }
>>>> +
>>>>      hmapx_destroy(&routes_to_advertise);
>>>> +    vector_destroy(&stale_routes);
>>>>
>>>>      return ret;
>>>>  }
>>>> @@ -336,15 +357,24 @@ re_nl_sync_routes(uint32_t table_id, const struct
>>> hmap *routes,
>>>>  int
>>>>  re_nl_cleanup_routes(uint32_t table_id)
>>>>  {
>>>> +    int ret = 0;
>>>> +    struct vector stale_routes = VECTOR_EMPTY_INITIALIZER(struct
>>> route_data);
>>>>      /* Remove routes from the system that are not in the host_routes
>>> hmap and
>>>>       * remove entries from host_routes hmap that match routes already
>>> installed
>>>>       * in the system. */
>>>>      struct route_msg_handle_data data = {
>>>>          .routes_to_advertise = NULL,
>>>>          .learned_routes = NULL,
>>>> +        .stale_routes = &stale_routes,
>>>>          .table_id = table_id,
>>>>      };
>>>>      route_table_dump_one_table(table_id, handle_route_msg, &data);
>>>>
>>>> -    return data.ret;
>>>> +    int err = re_nl_delete_stale_routes(&stale_routes);
>>>
>>> Nit: I think I would've just inlined the re_nl_delete_stale_routes()
>>> code here like we do in ne_nl_sync_neigh().  It saves us the check below
>>> and the function itself is not that large, IMO, it gives a better
>>> overview of what's going on.
>>>
>>
>> This one is actually called twice once in "re_nl_cleanup_routes()"
>> and second in "re_nl_sync_routes()". That's why there is separate
>> function. Which is different from "ne_nl_sync_neigh()" where deletion
>> happens only once. With that said I'll leave it as is.
>>
>>
>>> But I'll leave it up to you:
>>>
>>> Acked-by: Dumitru Ceara <[email protected]>
>>>
>>>> +    if (!ret) {
>>>> +        ret = err;
>>>> +    }
>>>> +    vector_destroy(&stale_routes);
>>>> +
>>>> +    return ret;
>>>>  }
>>>
>>> Regards,
>>> Dumitru
>>>
>>>
>> I went ahead and merged this into main and backported to 25.09. I will
>> have to create a custom backport patch for 25.03, I'll send it out to the
>> ML.
>>
>> Regards,
>> Ales
>>
> 
> As discussed offline this is the backport I'm going to apply, let me know
> if it looks good to you.
> 
> https://github.com/almusil/ovn/commit/62722d8713c9ba4596c6976b29b7334aaa100dbf
> 

It looks good to me, thanks for taking care of this!

Regards,
Dumitru

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to