Thanks for the patch. A minor detail might have been overlooked

On Fri, Oct 10, 2025 at 11:27 AM Rosemarie O'Riorden via dev <
[email protected]> wrote:

> If you set a chassis' encap ip to be the same as that of another
> chassis, OVN should detect it, log an error, and delete the chassis
> record.
>
> But this was not the case; OVN would allow the transaction. However, the
> commit to the southbound database would fail, because there was already
> an encap record with that same ip and type (from the other chassis).
> The OVS db on the problematic chassis would then rapidly increase in
> size. Essentially, the controller would get stuck in a loop of deleting
> and recreating the tunnels.
>
> This change makes it so when encaps are change, before building encaps,
> the controller detects if there is already an existing encap tunnel of
> the same ip and type. If so, it logs an error, refuses to build the
> encaps, and deletes the associated chassis and chassis private record.
>
> Reported-at: https://issues.redhat.com/browse/FDP-1481
> Co-authored-by: Nicholas Hubbard <[email protected]>
> Signed-off-by: Nicholas Hubbard <[email protected]>
> Signed-off-by: Rosemarie O'Riorden <[email protected]>
> ---
> v3:
> - Update misleading comment
> - Add deletion of chassis private record
> - Move declaration function name to same line as return type
> - Delete "Transaction causes multiple rows" error from exceptions of
> affected
>   test
> - Update my own error to be more specific in exceptions of affected test
> - Update test to not be racy; Check for db row counts instead of saving
> file
>   size after change
> - Update commit message to reflect changes
>
> v2: Fix typo in title.
> ---
>  controller/chassis.c        | 99 ++++++++++++++++++++++++++-----------
>  controller/chassis.h        |  3 +-
>  controller/ovn-controller.c |  5 +-
>  lib/chassis-index.c         | 20 ++++++++
>  lib/chassis-index.h         |  3 ++
>  tests/ovn-controller.at     | 36 ++++++++++++--
>  6 files changed, 130 insertions(+), 36 deletions(-)
>
> diff --git a/controller/chassis.c b/controller/chassis.c
> index 07bef1c56..12117259d 100644
> --- a/controller/chassis.c
> +++ b/controller/chassis.c
> @@ -77,6 +77,12 @@ struct ovs_chassis_cfg {
>      bool ct_label_flush;
>  };
>
> +enum chassis_update_status {
> +    CHASSIS_UPDATED,
> +    CHASSIS_NOT_UPDATED,
> +    CHASSIS_NEED_DELETE
> +};
> +
>  static void
>  ovs_chassis_cfg_init(struct ovs_chassis_cfg *cfg)
>  {
> @@ -699,7 +705,8 @@ chassis_build_encaps(struct ovsdb_idl_txn
> *ovnsb_idl_txn,
>                       const char *encap_ip_default,
>                       const char *chassis_id,
>                       const char *encap_csum,
> -                     size_t *n_encap)
> +                     size_t *n_encap,
> +                     struct ovsdb_idl_index *sbrec_encaps)
>  {
>      size_t tunnel_count = 0;
>
> @@ -713,6 +720,18 @@ chassis_build_encaps(struct ovsdb_idl_txn
> *ovnsb_idl_txn,
>
>      SSET_FOR_EACH (encap_ip, encap_ip_set) {
>          SSET_FOR_EACH (encap_type, encap_type_set) {
> +            const struct sbrec_encap *encap_rec =
> +                encap_lookup_by_ip_and_type(sbrec_encaps,
> +                                            encap_ip, encap_type);
> +            if (encap_rec && strcmp(encap_rec->chassis_name, chassis_id))
> {
> +                static struct vlog_rate_limit rl =
> VLOG_RATE_LIMIT_INIT(5, 1);
> +                VLOG_WARN_RL(&rl, "'%s' already has encap ip '%s' and "
> +                             "type '%s', cannot duplicate on '%s'",
> +                             encap_rec->chassis_name, encap_rec->ip,
> +                             encap_rec->type, chassis_id);
> +                continue;
>

if there are multiple encap_ips specified and one of them is a duplicate
then the loop that this patch fixes will still happen.



> +            }
> +
>              struct sbrec_encap *encap = sbrec_encap_insert(ovnsb_idl_txn);
>
>              sbrec_encap_set_type(encap, encap_type);
> @@ -778,7 +797,7 @@ update_supported_sset(struct sset *supported)
>
>  static void
>  remove_unsupported_options(const struct sbrec_chassis *chassis_rec,
> -                           bool *updated)
> +                           enum chassis_update_status *update_status)
>  {
>      struct sset supported_options = SSET_INITIALIZER(&supported_options);
>      update_supported_sset(&supported_options);
> @@ -789,7 +808,7 @@ remove_unsupported_options(const struct sbrec_chassis
> *chassis_rec,
>              VLOG_WARN("Removing unsupported key \"%s\" from chassis
> record.",
>                        node->key);
>              sbrec_chassis_update_other_config_delkey(chassis_rec,
> node->key);
> -            *updated = true;
> +            *update_status = CHASSIS_UPDATED;
>          }
>      }
>
> @@ -827,25 +846,28 @@ chassis_get_record(struct ovsdb_idl_txn
> *ovnsb_idl_txn,
>  }
>
>  /* Update a Chassis record based on the config in the ovs config.
> - * Returns true if 'chassis_rec' was updated, false otherwise.
> + * Returns CHASSIS_UPDATED if 'chassis_rec' was updated,
> CHASSIS_NEED_DELETE if
> + * another chassis exists with an encap with same IP and type as
> 'chassis', and
> + * returns CHASSIS_NOT_UPDATED otherwise.
>   */
> -static bool
> +static enum chassis_update_status
>  chassis_update(const struct sbrec_chassis *chassis_rec,
>                 struct ovsdb_idl_txn *ovnsb_idl_txn,
>                 const struct ovs_chassis_cfg *ovs_cfg,
>                 const char *chassis_id,
> -               const struct sset *transport_zones)
> +               const struct sset *transport_zones,
> +               struct ovsdb_idl_index *sbrec_encaps)
>  {
> -    bool updated = false;
> +    enum chassis_update_status update_status = CHASSIS_NOT_UPDATED;
>
>      if (strcmp(chassis_id, chassis_rec->name)) {
>          sbrec_chassis_set_name(chassis_rec, chassis_id);
> -        updated = true;
> +        update_status = CHASSIS_UPDATED;
>      }
>
>      if (strcmp(ovs_cfg->hostname, chassis_rec->hostname)) {
>          sbrec_chassis_set_hostname(chassis_rec, ovs_cfg->hostname);
> -        updated = true;
> +        update_status = CHASSIS_UPDATED;
>      }
>
>      if (chassis_other_config_changed(ovs_cfg, chassis_rec)) {
> @@ -856,12 +878,12 @@ chassis_update(const struct sbrec_chassis
> *chassis_rec,
>          sbrec_chassis_verify_other_config(chassis_rec);
>          sbrec_chassis_set_other_config(chassis_rec, &other_config);
>          smap_destroy(&other_config);
> -        updated = true;
> +        update_status = CHASSIS_UPDATED;
>      }
>
>      update_chassis_transport_zones(transport_zones, chassis_rec);
>
> -    remove_unsupported_options(chassis_rec, &updated);
> +    remove_unsupported_options(chassis_rec, &update_status);
>
>      /* If any of the encaps should change, update them. */
>      bool tunnels_changed =
> @@ -871,20 +893,27 @@ chassis_update(const struct sbrec_chassis
> *chassis_rec,
>                                  ovs_cfg->encap_csum,
>                                  chassis_rec);
>      if (!tunnels_changed) {
> -        return updated;
> +        return update_status;
>      }
>
>      struct sbrec_encap **encaps;
>      size_t n_encap;
>
>      encaps =
> -        chassis_build_encaps(ovnsb_idl_txn, &ovs_cfg->encap_type_set,
> +        chassis_build_encaps(ovnsb_idl_txn,
> +                             &ovs_cfg->encap_type_set,
>                               &ovs_cfg->encap_ip_set,
>                               ovs_cfg->encap_ip_default, chassis_id,
> -                             ovs_cfg->encap_csum, &n_encap);
> +                             ovs_cfg->encap_csum, &n_encap,
> +                             sbrec_encaps);
>      sbrec_chassis_set_encaps(chassis_rec, encaps, n_encap);
>      free(encaps);
> -    return true;
> +

+    if (!n_encap) {

+        return CHASSIS_NEED_DELETE;
> +    }
> +
> +    return CHASSIS_UPDATED;
>  }
>
>  /* If this is a chassis_private config update after we initialized the
> record
> @@ -935,7 +964,8 @@ chassis_run(struct ovsdb_idl_txn *ovnsb_idl_txn,
>              const char *chassis_id,
>              const struct ovsrec_bridge *br_int,
>              const struct sset *transport_zones,
> -            const struct sbrec_chassis_private **chassis_private)
> +            const struct sbrec_chassis_private **chassis_private,
> +            struct ovsdb_idl_index *sbrec_encaps)
>  {
>      struct ovs_chassis_cfg ovs_cfg;
>
> @@ -956,22 +986,31 @@ chassis_run(struct ovsdb_idl_txn *ovnsb_idl_txn,
>       * modified in the ovs table.
>       */
>      if (chassis_rec && ovnsb_idl_txn) {
> -        bool updated = chassis_update(chassis_rec, ovnsb_idl_txn,
> &ovs_cfg,
> -                                      chassis_id, transport_zones);
> -
> -        if (!existed || updated) {
> -            ovsdb_idl_txn_add_comment(ovnsb_idl_txn,
> -                                      "ovn-controller: %s chassis '%s'",
> -                                      !existed ? "registering" :
> "updating",
> -                                      chassis_id);
> -        }
> +        enum chassis_update_status update_status =
> +            chassis_update(chassis_rec, ovnsb_idl_txn, &ovs_cfg,
> +                           chassis_id,
> +                           transport_zones, sbrec_encaps);
> +
> +        *chassis_private = chassis_private_get_record(ovnsb_idl_txn,
> +                               sbrec_chassis_private_by_name, chassis_id);
>
> -        *chassis_private =
> -            chassis_private_get_record(ovnsb_idl_txn,
> -                                       sbrec_chassis_private_by_name,
> +        if (update_status == CHASSIS_NEED_DELETE) {
> +            sbrec_chassis_delete(chassis_rec);
> +            chassis_rec = NULL;
> +            if (*chassis_private) {
> +                sbrec_chassis_private_delete(*chassis_private);
> +            }
> +        } else {
> +            if (!existed || update_status == CHASSIS_UPDATED) {
> +                ovsdb_idl_txn_add_comment(ovnsb_idl_txn,
> +                                          "ovn-controller: %s chassis
> '%s'",
> +                                          !existed ? "registering"
> +                                          : "updating", chassis_id);
> +            }
> +            if (*chassis_private) {
> +                chassis_private_update(*chassis_private, chassis_rec,
>                                         chassis_id);
> -        if (*chassis_private) {
> -            chassis_private_update(*chassis_private, chassis_rec,
> chassis_id);
> +            }
>          }
>      }
>
> diff --git a/controller/chassis.h b/controller/chassis.h
> index 03cc2f906..14251bdad 100644
> --- a/controller/chassis.h
> +++ b/controller/chassis.h
> @@ -44,7 +44,8 @@ const struct sbrec_chassis *chassis_run(
>      const struct ovsrec_open_vswitch_table *,
>      const char *chassis_id, const struct ovsrec_bridge *br_int,
>      const struct sset *transport_zones,
> -    const struct sbrec_chassis_private **chassis_private);
> +    const struct sbrec_chassis_private **chassis_private,
> +    struct ovsdb_idl_index *sbrec_chassis_encaps);
>  bool chassis_cleanup(struct ovsdb_idl_txn *ovs_idl_txn,
>                       struct ovsdb_idl_txn *ovnsb_idl_txn,
>                       const struct ovsrec_open_vswitch_table *,
> diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
> index 6fbf3a227..67a2c447c 100644
> --- a/controller/ovn-controller.c
> +++ b/controller/ovn-controller.c
> @@ -6684,6 +6684,9 @@ main(int argc, char *argv[])
>      struct ovsdb_idl_index *sbrec_learned_route_index_by_datapath
>          = ovsdb_idl_index_create1(ovnsb_idl_loop.idl,
>                                    &sbrec_learned_route_col_datapath);
> +    struct ovsdb_idl_index *sbrec_encaps
> +        = ovsdb_idl_index_create2(ovnsb_idl_loop.idl,
> +                                  &sbrec_encap_col_type,
> &sbrec_encap_col_ip);
>
>      ovsdb_idl_track_add_all(ovnsb_idl_loop.idl);
>      ovsdb_idl_omit_alert(ovnsb_idl_loop.idl,
> @@ -7401,7 +7404,7 @@ main(int argc, char *argv[])
>                                        sbrec_chassis_private_by_name,
>                                        ovs_table, chassis_id,
>                                        br_int, &transport_zones,
> -                                      &chassis_private);
> +                                      &chassis_private, sbrec_encaps);
>              }
>
>              /* If any OVS feature support changed, force a full recompute.
> diff --git a/lib/chassis-index.c b/lib/chassis-index.c
> index 4b38036cb..953cc6ed4 100644
> --- a/lib/chassis-index.c
> +++ b/lib/chassis-index.c
> @@ -115,3 +115,23 @@ ha_chassis_group_lookup_by_name(
>
>      return retval;
>  }
> +
> +/* Finds and returns the encap with the given ip and type, or NULL if no
> such
> + * encap exists. */
> +const struct sbrec_encap *
> +encap_lookup_by_ip_and_type(struct ovsdb_idl_index *sbrec_chassis_encaps,
> +                       const char *ip, const char *type)
> +{
> +    struct sbrec_encap *target = sbrec_encap_index_init_row(
> +        sbrec_chassis_encaps);
> +
> +    sbrec_encap_index_set_ip(target, ip);
> +    sbrec_encap_index_set_type(target, type);
> +
> +    struct sbrec_encap *retval = sbrec_encap_index_find(
> +        sbrec_chassis_encaps, target);
> +
> +    sbrec_encap_index_destroy_row(target);
> +
> +    return retval;
> +}
> diff --git a/lib/chassis-index.h b/lib/chassis-index.h
> index bc654da13..7161458b2 100644
> --- a/lib/chassis-index.h
> +++ b/lib/chassis-index.h
> @@ -35,5 +35,8 @@ chassis_private_lookup_by_name(
>  struct ovsdb_idl_index *ha_chassis_group_index_create(struct ovsdb_idl
> *idl);
>  const struct sbrec_ha_chassis_group *ha_chassis_group_lookup_by_name(
>      struct ovsdb_idl_index *sbrec_ha_chassis_grp_by_name, const char
> *name);
> +const struct sbrec_encap *encap_lookup_by_ip_and_type(
> +    struct ovsdb_idl_index *sbrec_chassis_encaps,
> +    const char *ip, const char *type);
>
>  #endif /* lib/chassis-index.h */
> diff --git a/tests/ovn-controller.at b/tests/ovn-controller.at
> index 7bc1aca59..6713f1a2b 100644
> --- a/tests/ovn-controller.at
> +++ b/tests/ovn-controller.at
> @@ -277,7 +277,8 @@ ovs-vsctl -- set Open_vSwitch .
> external-ids:hostname="${new_sysid}" \
>            -- set Open_vSwitch .
> external-ids:ovn-remote="${current_remote}"
>
>  OVS_WAIT_UNTIL([
> -    grep -q 'Transaction causes multiple rows in \\"Encap\\" table to
> have identical values' hv/ovn-controller.log
> +    grep -q "'hv' already has encap ip '192.168.0.1' and type 'geneve', "
> \
> +            "cannot duplicate on 'hv-foo'" hv/ovn-controller.log
>  ])
>
>  # Destroy the stale entries manually and ovn-controller should now be able
> @@ -305,12 +306,12 @@ check ovn-sbctl destroy chassis_private . -- destroy
> chassis .
>  # - could not create datapath br-int of unknown type foobar
>  # - unknown datapath type bar
>  # - could not create datapath br-int of unknown type bar
> -# - Transaction causes multiple rows ...
>  # - failed to create bridge br-int: Address family not supported by
> protocol # due to previous errors.
> +# - 'hv' already has encap ip '192.168.0.1' and type 'geneve', cannot
> duplicate on 'hv-foo'
>  OVN_CLEANUP_SBOX([hv], ["/foo/d
>  /bar/d
> -/Transaction causes multiple rows/d
> -/failed to create bridge/d"])
> +/failed to create bridge/d
> +/already has encap ip.*cannot duplicate on/d"])
>
>  OVN_CLEANUP_VSWITCH([main])
>  as ovn-sb
> @@ -3821,3 +3822,30 @@ wait_ports 0
>  OVN_CLEANUP([hv1
>  /Invalid VXLAN port number.*/d])
>  AT_CLEANUP
> +
> +OVN_FOR_EACH_NORTHD([
> +AT_SETUP([ovn-controller - two encaps same IP and type])
> +AT_KEYWORDS([ovn])
> +ovn_start
> +net_add n1
> +
> +sim_add hv1
> +as hv1
> +check ovs-vsctl add-br br-phys
> +ovn_attach n1 br-phys 192.168.0.11
> +
> +sim_add hv2
> +as hv2
> +check ovs-vsctl add-br br-phys
> +ovn_attach n1 br-phys 192.168.0.12
> +
> +# Set hv2's encap IP to be the same as hv1's.
> +check ovs-vsctl set Open_vSwitch .
> external-ids:ovn-encap-ip="192.168.0.11"
>

if you change this line to

check ovs-vsctl set Open_vSwitch .
external-ids:ovn-encap-ip="192.168.0.11,192.168.0.12"

the controller does not add the duplicated encap-ip but ovn-controller gets
stuck in a loop trying to and failing.


> +wait_row_count Encap 0 ip="192.168.0.12"
> +
> +# Make sure hv2 has been deleted because of the misconfiguration.
> +wait_row_count Chassis 0 name="hv2"
> +wait_row_count Chassis_Private 0 name="hv2"
> +
> +AT_CLEANUP
> +])
> --
> 2.51.0
>
> _______________________________________________
> dev mailing list
> [email protected]
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
>
Thanks,
Jacob Tanenbaum
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to