Hi Rose Thanks for looking into this issue and providing a patch. I have two additional comments - see below
Thanks Xavier On Thu, Sep 18, 2025 at 8:47 AM Ales Musil via dev <[email protected]> wrote: > On Thu, Sep 18, 2025 at 12:02 AM Rosemarie O'Riorden via dev < > [email protected]> wrote: > > > If you set a chassis' encap ip to be the same as that of another > > chassis, OVN should detect it, log an error, and delete the chassis. > > > > But this was not the case; OVN would allow the transaction. However, the > > commit to the southbound database would fail, because there was already > > an encap record with that same ip and type (from the other chassis). > > The OVS db on the problematic chassis would then rapidly increase in > > size. Essentially, the controller would get stuck in a loop of deleting > > and recreating the tunnels. > > > > This change makes it so before building encaps, the controller detects > > if there is an existing encap tunnel of a certain ip and type. If so, > > it logs an error, refuses to build the encaps, and deletes the > > associated chassis record. > Note that this is a different behavior from what OVN does today when the ovn-encap-ip is removed. If the ovn-encap-ip is removed, the ovn-controller will log a message, but will not delete the encap and chassis entries in sb. I think that removing the encap entry, as you propose, is a good thing to do, so I entered a different FDP tracking this part: https://issues.redhat.com/browse/FDP-1711. See below about deletion of the chassis record. > > > > Reported-at: https://issues.redhat.com/browse/FDP-1481 > > Co-authored-by: Nicholas Hubbard <[email protected]> > > Signed-off-by: Nicholas Hubbard <[email protected]> > > Signed-off-by: Rosemarie O'Riorden <[email protected]> > > --- > > v2: Fix typo in title. > > --- > > > > Hi Rosemarie, > > thank you for the patch. I have a couple of comments inline. > > controller/chassis.c | 97 +++++++++++++++++++++++++------------ > > controller/chassis.h | 3 +- > > controller/ovn-controller.c | 5 +- > > lib/chassis-index.c | 20 ++++++++ > > lib/chassis-index.h | 3 ++ > > tests/ovn-controller.at | 34 ++++++++++++- > > 6 files changed, 128 insertions(+), 34 deletions(-) > > > > diff --git a/controller/chassis.c b/controller/chassis.c > > index 07bef1c56..ac204c779 100644 > > --- a/controller/chassis.c > > +++ b/controller/chassis.c > > @@ -77,6 +77,12 @@ struct ovs_chassis_cfg { > > bool ct_label_flush; > > }; > > > > +enum chassis_update_status { > > + CHASSIS_UPDATED, > > + CHASSIS_NOT_UPDATED, > > + CHASSIS_NEED_DELETE > > +}; > > + > > static void > > ovs_chassis_cfg_init(struct ovs_chassis_cfg *cfg) > > { > > @@ -699,7 +705,8 @@ chassis_build_encaps(struct ovsdb_idl_txn > > *ovnsb_idl_txn, > > const char *encap_ip_default, > > const char *chassis_id, > > const char *encap_csum, > > - size_t *n_encap) > > + size_t *n_encap, > > + struct ovsdb_idl_index *sbrec_encaps) > > { > > size_t tunnel_count = 0; > > > > @@ -713,6 +720,18 @@ chassis_build_encaps(struct ovsdb_idl_txn > > *ovnsb_idl_txn, > > > > SSET_FOR_EACH (encap_ip, encap_ip_set) { > > SSET_FOR_EACH (encap_type, encap_type_set) { > > + const struct sbrec_encap *encap_rec = > > + encap_lookup_by_ip_and_type(sbrec_encaps, > > + encap_ip, encap_type); > > + if (encap_rec && strcmp(encap_rec->chassis_name, > chassis_id)) > > { > > + static struct vlog_rate_limit rl = > > VLOG_RATE_LIMIT_INIT(5, 1); > > + VLOG_WARN_RL(&rl, "'%s' already has encap ip '%s' and " > > + "type '%s', cannot duplicate on '%s'", > > + encap_rec->chassis_name, encap_rec->ip, > > + encap_rec->type, chassis_id); > > + continue; > > + } > > + > > struct sbrec_encap *encap = > sbrec_encap_insert(ovnsb_idl_txn); > > > > sbrec_encap_set_type(encap, encap_type); > > @@ -778,7 +797,7 @@ update_supported_sset(struct sset *supported) > > > > static void > > remove_unsupported_options(const struct sbrec_chassis *chassis_rec, > > - bool *updated) > > + enum chassis_update_status *update_status) > > { > > struct sset supported_options = > SSET_INITIALIZER(&supported_options); > > update_supported_sset(&supported_options); > > @@ -789,7 +808,7 @@ remove_unsupported_options(const struct sbrec_chassis > > *chassis_rec, > > VLOG_WARN("Removing unsupported key \"%s\" from chassis > > record.", > > node->key); > > sbrec_chassis_update_other_config_delkey(chassis_rec, > > node->key); > > - *updated = true; > > + *update_status = CHASSIS_UPDATED; > > } > > } > > > > @@ -827,25 +846,27 @@ chassis_get_record(struct ovsdb_idl_txn > > *ovnsb_idl_txn, > > } > > > > /* Update a Chassis record based on the config in the ovs config. > > - * Returns true if 'chassis_rec' was updated, false otherwise. > > + * Returns 1 if 'chassis_rec' was updated, -1 if another chassis exists > > + * with an encap with same IP and type as 'chassis', and returns 0 > > otherwise. > > > > This comment is very misleading, I don't see the function returning > integers, it returns enum. > > > > */ > > -static bool > > +static enum chassis_update_status > > chassis_update(const struct sbrec_chassis *chassis_rec, > > struct ovsdb_idl_txn *ovnsb_idl_txn, > > const struct ovs_chassis_cfg *ovs_cfg, > > const char *chassis_id, > > - const struct sset *transport_zones) > > + const struct sset *transport_zones, > > + struct ovsdb_idl_index *sbrec_encaps) > > { > > - bool updated = false; > > + enum chassis_update_status update_status = CHASSIS_NOT_UPDATED; > > > > if (strcmp(chassis_id, chassis_rec->name)) { > > sbrec_chassis_set_name(chassis_rec, chassis_id); > > - updated = true; > > + update_status = CHASSIS_UPDATED; > > } > > > > if (strcmp(ovs_cfg->hostname, chassis_rec->hostname)) { > > sbrec_chassis_set_hostname(chassis_rec, ovs_cfg->hostname); > > - updated = true; > > + update_status = CHASSIS_UPDATED; > > } > > > > if (chassis_other_config_changed(ovs_cfg, chassis_rec)) { > > @@ -856,12 +877,12 @@ chassis_update(const struct sbrec_chassis > > *chassis_rec, > > sbrec_chassis_verify_other_config(chassis_rec); > > sbrec_chassis_set_other_config(chassis_rec, &other_config); > > smap_destroy(&other_config); > > - updated = true; > > + update_status = CHASSIS_UPDATED; > > } > > > > update_chassis_transport_zones(transport_zones, chassis_rec); > > > > - remove_unsupported_options(chassis_rec, &updated); > > + remove_unsupported_options(chassis_rec, &update_status); > > > > /* If any of the encaps should change, update them. */ > > bool tunnels_changed = > > @@ -871,20 +892,27 @@ chassis_update(const struct sbrec_chassis > > *chassis_rec, > > ovs_cfg->encap_csum, > > chassis_rec); > > if (!tunnels_changed) { > > - return updated; > > + return update_status; > > } > > > > struct sbrec_encap **encaps; > > size_t n_encap; > > > > encaps = > > - chassis_build_encaps(ovnsb_idl_txn, &ovs_cfg->encap_type_set, > > + chassis_build_encaps(ovnsb_idl_txn, > > + &ovs_cfg->encap_type_set, > > &ovs_cfg->encap_ip_set, > > ovs_cfg->encap_ip_default, chassis_id, > > - ovs_cfg->encap_csum, &n_encap); > > + ovs_cfg->encap_csum, &n_encap, > > + sbrec_encaps); > > sbrec_chassis_set_encaps(chassis_rec, encaps, n_encap); > > free(encaps); > > - return true; > > + > > + if (!n_encap) { > > + return CHASSIS_NEED_DELETE; > > + } > > + > > + return CHASSIS_UPDATED; > > } > > > > /* If this is a chassis_private config update after we initialized the > > record > > @@ -935,7 +963,8 @@ chassis_run(struct ovsdb_idl_txn *ovnsb_idl_txn, > > const char *chassis_id, > > const struct ovsrec_bridge *br_int, > > const struct sset *transport_zones, > > - const struct sbrec_chassis_private **chassis_private) > > + const struct sbrec_chassis_private **chassis_private, > > + struct ovsdb_idl_index *sbrec_encaps) > > { > > struct ovs_chassis_cfg ovs_cfg; > > > > @@ -956,22 +985,30 @@ chassis_run(struct ovsdb_idl_txn *ovnsb_idl_txn, > > * modified in the ovs table. > > */ > > if (chassis_rec && ovnsb_idl_txn) { > > - bool updated = chassis_update(chassis_rec, ovnsb_idl_txn, > > &ovs_cfg, > > - chassis_id, transport_zones); > > - > > - if (!existed || updated) { > > - ovsdb_idl_txn_add_comment(ovnsb_idl_txn, > > - "ovn-controller: %s chassis '%s'", > > - !existed ? "registering" : > > "updating", > > - chassis_id); > > - } > > + enum chassis_update_status update_status = > > + chassis_update(chassis_rec, ovnsb_idl_txn, &ovs_cfg, > > + chassis_id, > > + transport_zones, sbrec_encaps); > > + > > + if (update_status == CHASSIS_NEED_DELETE) { > > + sbrec_chassis_delete(chassis_rec); > > > > Shouldn't we also delete the chassis private record if it exists? > I don't think there is a point in having one of them to remain. > I'm also not sure if the sbrec deletion is the right way to do it, > we could relax the schema and decrease the min for encaps > to 0, which would allow us to keep the chassis record without > any encap being configured. There is only single place that > expects at least 1 encap, that is ovn-ic, but looking at the > code nothing prevents us from skipping chassis without > encap instead of crashing. > I agree that we should probably not delete the chassis record. Removing the chassis might also result in tunnels not being deleted in some cases. With removing tunnels we would ensure that traffic will not be sent to this wrongly configured chassis. > > > > + chassis_rec = NULL; > > + } else { > > > This adds yet another nesting, please consider using goto > to reduce it. > > + if (!existed || update_status == CHASSIS_UPDATED) { > > + ovsdb_idl_txn_add_comment(ovnsb_idl_txn, > > + "ovn-controller: %s chassis > > '%s'", > > + !existed ? "registering" > > + : "updating", chassis_id); > > + } > > > > - *chassis_private = > > - chassis_private_get_record(ovnsb_idl_txn, > > - sbrec_chassis_private_by_name, > > + *chassis_private = > > + chassis_private_get_record(ovnsb_idl_txn, > > + > sbrec_chassis_private_by_name, > > + chassis_id); > > + if (*chassis_private) { > > + chassis_private_update(*chassis_private, chassis_rec, > > chassis_id); > > - if (*chassis_private) { > > - chassis_private_update(*chassis_private, chassis_rec, > > chassis_id); > > + } > > } > > } > > > > diff --git a/controller/chassis.h b/controller/chassis.h > > index 03cc2f906..14251bdad 100644 > > --- a/controller/chassis.h > > +++ b/controller/chassis.h > > @@ -44,7 +44,8 @@ const struct sbrec_chassis *chassis_run( > > const struct ovsrec_open_vswitch_table *, > > const char *chassis_id, const struct ovsrec_bridge *br_int, > > const struct sset *transport_zones, > > - const struct sbrec_chassis_private **chassis_private); > > + const struct sbrec_chassis_private **chassis_private, > > + struct ovsdb_idl_index *sbrec_chassis_encaps); > > bool chassis_cleanup(struct ovsdb_idl_txn *ovs_idl_txn, > > struct ovsdb_idl_txn *ovnsb_idl_txn, > > const struct ovsrec_open_vswitch_table *, > > diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c > > index 330227933..7faf1c38f 100644 > > --- a/controller/ovn-controller.c > > +++ b/controller/ovn-controller.c > > @@ -6570,6 +6570,9 @@ main(int argc, char *argv[]) > > struct ovsdb_idl_index *sbrec_learned_route_index_by_datapath > > = ovsdb_idl_index_create1(ovnsb_idl_loop.idl, > > &sbrec_learned_route_col_datapath); > > + struct ovsdb_idl_index *sbrec_encaps > > + = ovsdb_idl_index_create2(ovnsb_idl_loop.idl, > > + &sbrec_encap_col_type, > > &sbrec_encap_col_ip); > > > > ovsdb_idl_track_add_all(ovnsb_idl_loop.idl); > > ovsdb_idl_omit_alert(ovnsb_idl_loop.idl, > > @@ -7275,7 +7278,7 @@ main(int argc, char *argv[]) > > sbrec_chassis_private_by_name, > > ovs_table, chassis_id, > > br_int, &transport_zones, > > - &chassis_private); > > + &chassis_private, sbrec_encaps); > > } > > > > /* If any OVS feature support changed, force a full > recompute. > > diff --git a/lib/chassis-index.c b/lib/chassis-index.c > > index 4b38036cb..953cc6ed4 100644 > > --- a/lib/chassis-index.c > > +++ b/lib/chassis-index.c > > @@ -115,3 +115,23 @@ ha_chassis_group_lookup_by_name( > > > > return retval; > > } > > + > > +/* Finds and returns the encap with the given ip and type, or NULL if no > > such > > + * encap exists. */ > > +const struct sbrec_encap * > > +encap_lookup_by_ip_and_type(struct ovsdb_idl_index > *sbrec_chassis_encaps, > > + const char *ip, const char *type) > > +{ > > + struct sbrec_encap *target = sbrec_encap_index_init_row( > > + sbrec_chassis_encaps); > > + > > + sbrec_encap_index_set_ip(target, ip); > > + sbrec_encap_index_set_type(target, type); > > + > > + struct sbrec_encap *retval = sbrec_encap_index_find( > > + sbrec_chassis_encaps, target); > > + > > + sbrec_encap_index_destroy_row(target); > > + > > + return retval; > > +} > > diff --git a/lib/chassis-index.h b/lib/chassis-index.h > > index bc654da13..dd94596c4 100644 > > --- a/lib/chassis-index.h > > +++ b/lib/chassis-index.h > > @@ -35,5 +35,8 @@ chassis_private_lookup_by_name( > > struct ovsdb_idl_index *ha_chassis_group_index_create(struct ovsdb_idl > > *idl); > > const struct sbrec_ha_chassis_group *ha_chassis_group_lookup_by_name( > > struct ovsdb_idl_index *sbrec_ha_chassis_grp_by_name, const char > > *name); > > +const struct sbrec_encap * > > +encap_lookup_by_ip_and_type(struct ovsdb_idl_index > *sbrec_chassis_encaps, > > + const char *ip, const char *type); > > > > nit: For declaration the return type and name should be at the same line. > > > > > > #endif /* lib/chassis-index.h */ > > diff --git a/tests/ovn-controller.at b/tests/ovn-controller.at > > index a8a9a2da2..9d158ebc4 100644 > > --- a/tests/ovn-controller.at > > +++ b/tests/ovn-controller.at > > @@ -277,7 +277,8 @@ ovs-vsctl -- set Open_vSwitch . > > external-ids:hostname="${new_sysid}" \ > > -- set Open_vSwitch . > > external-ids:ovn-remote="${current_remote}" > > > > OVS_WAIT_UNTIL([ > > - grep -q 'Transaction causes multiple rows in \\"Encap\\" table to > > have identical values' hv/ovn-controller.log > > + grep -q "'hv' already has encap ip '192.168.0.1' and type 'geneve', > " > > \ > > + "cannot duplicate on 'hv-foo'" hv/ovn-controller.log > > ]) > > > > # Destroy the stale entries manually and ovn-controller should now be > able > > @@ -310,7 +311,8 @@ check ovn-sbctl destroy chassis_private . -- destroy > > chassis . > > OVN_CLEANUP_SBOX([hv], ["/foo/d > > /bar/d > > /Transaction causes multiple rows/d > I guess we should delete the "Transaction causes multiple rows" potential error. > > -/failed to create bridge/d"]) > > +/failed to create bridge/d > > +/cannot duplicate on/d"]) > > > > OVN_CLEANUP_VSWITCH([main]) > > as ovn-sb > > @@ -3876,3 +3878,31 @@ wait_ports 0 > > OVN_CLEANUP([hv1 > > /Invalid VXLAN port number.*/d]) > > AT_CLEANUP > > + > > +OVN_FOR_EACH_NORTHD([ > > +AT_SETUP([ovn-controller - qwerty two encaps same IP and type]) > > +AT_KEYWORDS([ovn]) > > +ovn_start > > +net_add n1 > > + > > +sim_add hv1 > > +as hv1 > > +check ovs-vsctl add-br br-phys > > +ovn_attach n1 br-phys 192.168.0.11 > > + > > +sim_add hv2 > > +as hv2 > > +check ovs-vsctl add-br br-phys > > +ovn_attach n1 br-phys 192.168.0.12 > > + > > +# Set hv2's encap IP to be the same as hv1's, and save hv2's db size. > > +check ovs-vsctl set Open_vSwitch . > > external-ids:ovn-encap-ip="192.168.0.11" > > +old_size=$(wc -c hv2/conf.db) > > > > This is racy, you're saving the size *after* the change. It would be better > to actually wait for the 192.168.0.12 encap to disappear and then > check count of encap table, possibly chassis table (depending if we still > want to go with delete or not). > > > > + > > +# Make sure hv2's db has not grown, as no transactions should have > > occurred. > > +sleep 3 > > +new_size=$(wc -c hv2/conf.db) > > +check test "$new_size" = "$old_size" > > > + > > +AT_CLEANUP > > +]) > > -- > > 2.51.0 > > > > _______________________________________________ > > dev mailing list > > [email protected] > > https://mail.openvswitch.org/mailman/listinfo/ovs-dev > > > > > Thanks, > Ales > _______________________________________________ > dev mailing list > [email protected] > https://mail.openvswitch.org/mailman/listinfo/ovs-dev > _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
