On Tue, Jun 20, 2023 at 4:26 PM Xavier Simonart <xsimo...@redhat.com> wrote:
> If an interface with an qos option is deleted at the same > time as an ofport notification from ovs (causing runtime_data recompute) > is received, the binding module was trying to delete twice the same qos > queue, causing ovs to raise an exception. > > Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2213219 > Fixes: 7d1d111ff213 ("controller: configure qos through ovs qos table and > do not run tc directly") > Signed-off-by: Xavier Simonart <xsimo...@redhat.com> > Hi, I have one question below. > --- > v2: rebased on origin/main > --- > controller/binding.c | 22 ++++++++++++ > controller/binding.h | 1 + > controller/ovn-controller.c | 12 +++++++ > tests/ovn-macros.at | 34 ++++++++++++++++++ > tests/ovn.at | 70 +++++++++++++++++++++++++++++++++++++ > tests/system-ovn.at | 18 ---------- > 6 files changed, 139 insertions(+), 18 deletions(-) > > diff --git a/controller/binding.c b/controller/binding.c > index 486095ca7..8069a2e0d 100644 > --- a/controller/binding.c > +++ b/controller/binding.c > @@ -396,9 +396,23 @@ configure_qos(const struct sbrec_port_binding *pb, > q->burst = burst; > } > > +static const struct ovsrec_queue * > +find_qos_queue_by_external_ids(const struct smap *external_ids, > + struct ovsdb_idl_index *ovsrec_queue_by_external_ids) > +{ > + const struct ovsrec_queue *queue = > + ovsrec_queue_index_init_row(ovsrec_queue_by_external_ids); > + ovsrec_queue_index_set_external_ids(queue, external_ids); > + const struct ovsrec_queue *retval = > + ovsrec_queue_index_find(ovsrec_queue_by_external_ids, queue); > + ovsrec_queue_index_destroy_row(queue); > + return retval; > +} > + > static void > ovs_qos_entries_gc(struct ovsdb_idl_txn *ovs_idl_txn, > struct ovsdb_idl_index *ovsrec_port_by_qos, > + struct ovsdb_idl_index *ovsrec_queue_by_external_ids, > const struct ovsrec_qos_table *qos_table, > struct hmap *queue_map) > { > @@ -414,6 +428,13 @@ ovs_qos_entries_gc(struct ovsdb_idl_txn *ovs_idl_txn, > if (!queue) { > continue; > } > + const struct ovsrec_queue *ovsrec_queue = > + find_qos_queue_by_external_ids(&queue->external_ids, > + > ovsrec_queue_by_external_ids); > Since we do not completely control the external ids, isn't there a chance that we will have outdated external ids which will result in leaked qos records in the end? Maybe my understanding of the index search over smap is wrong. > + if (!ovsrec_queue) { > + VLOG_DBG("queue already deleted !"); > + continue; > + } > > const char *port = smap_get(&queue->external_ids, "ovn_port"); > if (!port) { > @@ -2165,6 +2186,7 @@ binding_run(struct binding_ctx_in *b_ctx_in, struct > binding_ctx_out *b_ctx_out) > shash_destroy(&bridge_mappings); > /* Remove stale QoS entries. */ > ovs_qos_entries_gc(b_ctx_in->ovs_idl_txn, > b_ctx_in->ovsrec_port_by_qos, > + b_ctx_in->ovsrec_queue_by_external_ids, > b_ctx_in->qos_table, b_ctx_out->qos_map); > > cleanup_claimed_port_timestamps(); > diff --git a/controller/binding.h b/controller/binding.h > index 0e57f02ee..e3ab1d7ca 100644 > --- a/controller/binding.h > +++ b/controller/binding.h > @@ -47,6 +47,7 @@ struct binding_ctx_in { > struct ovsdb_idl_index *sbrec_port_binding_by_datapath; > struct ovsdb_idl_index *sbrec_port_binding_by_name; > struct ovsdb_idl_index *ovsrec_port_by_qos; > + struct ovsdb_idl_index *ovsrec_queue_by_external_ids; > const struct ovsrec_qos_table *qos_table; > const struct sbrec_port_binding_table *port_binding_table; > const struct ovsrec_bridge *br_int; > diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c > index a47406979..bb84554fc 100644 > --- a/controller/ovn-controller.c > +++ b/controller/ovn-controller.c > @@ -1116,6 +1116,7 @@ enum sb_engine_node { > OVS_NODE(port, "port") \ > OVS_NODE(interface, "interface") \ > OVS_NODE(qos, "qos") \ > + OVS_NODE(queue, "queue") \ > OVS_NODE(flow_sample_collector_set, "flow_sample_collector_set") > > enum ovs_engine_node { > @@ -1576,6 +1577,10 @@ init_binding_ctx(struct engine_node *node, > engine_ovsdb_node_get_index( > engine_get_input("OVS_port", node), "qos"); > > + struct ovsdb_idl_index *ovsrec_queue_by_external_ids = > + engine_ovsdb_node_get_index( > + engine_get_input("OVS_queue", node), "external_ids"); > + > struct controller_engine_ctx *ctrl_ctx = > engine_get_context()->client_ctx; > > b_ctx_in->ovnsb_idl_txn = engine_get_context()->ovnsb_idl_txn; > @@ -1584,6 +1589,7 @@ init_binding_ctx(struct engine_node *node, > b_ctx_in->sbrec_port_binding_by_datapath = > sbrec_port_binding_by_datapath; > b_ctx_in->sbrec_port_binding_by_name = sbrec_port_binding_by_name; > b_ctx_in->ovsrec_port_by_qos = ovsrec_port_by_qos; > + b_ctx_in->ovsrec_queue_by_external_ids = ovsrec_queue_by_external_ids; > b_ctx_in->iface_table = iface_shadow->iface_table; > b_ctx_in->iface_table_external_ids_old = > &iface_shadow->iface_table_external_ids_old; > @@ -4599,6 +4605,9 @@ main(int argc, char *argv[]) > struct ovsdb_idl_index *ovsrec_port_by_qos > = ovsdb_idl_index_create1(ovs_idl_loop.idl, > &ovsrec_port_col_qos); > + struct ovsdb_idl_index *ovsrec_queue_by_external_ids > + = ovsdb_idl_index_create1(ovs_idl_loop.idl, > + &ovsrec_queue_col_external_ids); > struct ovsdb_idl_index *ovsrec_flow_sample_collector_set_by_id > = ovsdb_idl_index_create2(ovs_idl_loop.idl, > > &ovsrec_flow_sample_collector_set_col_bridge, > @@ -4899,6 +4908,7 @@ main(int argc, char *argv[]) > engine_add_input(&en_runtime_data, &en_ovs_open_vswitch, NULL); > engine_add_input(&en_runtime_data, &en_ovs_bridge, NULL); > engine_add_input(&en_runtime_data, &en_ovs_qos, NULL); > + engine_add_input(&en_runtime_data, &en_ovs_queue, NULL); > > engine_add_input(&en_runtime_data, &en_sb_chassis, NULL); > engine_add_input(&en_runtime_data, &en_sb_datapath_binding, > @@ -4960,6 +4970,8 @@ main(int argc, char *argv[]) > engine_ovsdb_node_add_index(&en_ovs_flow_sample_collector_set, "id", > ovsrec_flow_sample_collector_set_by_id); > engine_ovsdb_node_add_index(&en_ovs_port, "qos", ovsrec_port_by_qos); > + engine_ovsdb_node_add_index(&en_ovs_queue, "external_ids", > + ovsrec_queue_by_external_ids); > > struct ed_type_lflow_output *lflow_output_data = > engine_get_internal_data(&en_lflow_output); > diff --git a/tests/ovn-macros.at b/tests/ovn-macros.at > index 6f2d085ae..7223846ef 100644 > --- a/tests/ovn-macros.at > +++ b/tests/ovn-macros.at > @@ -840,6 +840,40 @@ fmt_pkt() { > print(out.decode())" | $PYTHON3 > } > > +sleep_sb() { > + echo SB going to sleep > + AT_CHECK([kill -STOP $(cat ovn-sb/ovsdb-server.pid)]) > +} > +wake_up_sb() { > + echo SB waking up > + AT_CHECK([kill -CONT $(cat ovn-sb/ovsdb-server.pid)]) > +} > +sleep_controller() { > + echo Controller $hv going to sleep > + hv=$1 > + as $hv > + check ovn-appctl debug/pause > + OVS_WAIT_UNTIL([test x$(ovn-appctl -t ovn-controller debug/status) = > "xpaused"]) > +} > +wake_up_controller() { > + hv=$1 > + as $hv > + echo Controller $hv waking up > + ovn-appctl debug/resume > + OVS_WAIT_UNTIL([test x$(ovn-appctl -t ovn-controller debug/status) = > "xrunning"]) > +} > +sleep_ovs() { > + hv=$1 > + echo ovs $hv going to sleep > + AT_CHECK([kill -STOP $(cat $hv/ovs-vswitchd.pid)]) > +} > + > +wake_up_ovs() { > + hv=$1 > + echo ovs $hv going to sleep > + AT_CHECK([kill -CONT $(cat $hv/ovs-vswitchd.pid)]) > +} > + > OVS_END_SHELL_HELPERS > > m4_define([OVN_POPULATE_ARP], [AT_CHECK(ovn_populate_arp__, [0], > [ignore])]) > diff --git a/tests/ovn.at b/tests/ovn.at > index 544fba187..2c221a05c 100644 > --- a/tests/ovn.at > +++ b/tests/ovn.at > @@ -36458,3 +36458,73 @@ > OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/vif1-tx.pcap], [expected]) > > AT_CLEANUP > ]) > + > +OVN_FOR_EACH_NORTHD([ > +AT_SETUP([OVN QoS port deletion]) > +ovn_start > + > +check ovn-nbctl ls-add ls1 > +check ovn-nbctl lsp-add ls1 public1 > +check ovn-nbctl lsp-set-addresses public1 unknown > +check ovn-nbctl lsp-set-type public1 localnet > +check ovn-nbctl lsp-set-options public1 network_name=phys > +net_add n > + > +# two hypervisors, each connected to the same network > +for i in 1 2; do > + sim_add hv-$i > + as hv-$i > + ovs-vsctl add-br br-phys > + ovs-vsctl set open . external-ids:ovn-bridge-mappings=phys:br-phys > + ovn_attach n br-phys 192.168.0.$i > +done > + > +check ovn-nbctl lsp-add ls1 lsp1 > +check ovn-nbctl lsp-set-addresses lsp1 f0:00:00:00:00:03 > +as hv-1 > +ovs-vsctl add-port br-int vif1 -- \ > + set Interface vif1 external-ids:iface-id=lsp1 \ > + ofport-request=3 > + > +OVS_WAIT_UNTIL([test x`ovn-nbctl lsp-get-up lsp1` = xup]) > + > +check ovn-nbctl set Logical_Switch_Port lsp1 options:qos_max_rate=800000 > +check ovn-nbctl --wait=hv set Logical_Switch_Port lsp1 > options:qos_burst=9000000 > + > +AS_BOX([$(date +%H:%M:%S.%03N) checking deletion of port with qos > options]) > +check ovn-nbctl ls-add ls2 > +check ovn-nbctl lsp-add ls2 lsp2 > +check ovn-nbctl lsp-set-addresses lsp2 f0:00:00:00:00:05 > +as hv-1 > +ovs-vsctl add-port br-int vif2 -- \ > + set Interface vif2 external-ids:iface-id=lsp2 \ > + ofport-request=5 > +OVS_WAIT_UNTIL([test x`ovn-nbctl lsp-get-up lsp2` = xup]) > + > +# Sleep ovs to postpone ofport notification to ovn > +sleep_ovs hv-1 > + > +# Create localnet; this will cause patch-port creation > +check ovn-nbctl lsp-add ls2 public2 > +check ovn-nbctl lsp-set-addresses public2 unknown > +check ovn-nbctl lsp-set-type public2 localnet > +check ovn-nbctl --wait=sb set Logical_Switch_Port public2 > options:qos_min_rate=6000000000 options:qos_max_rate=7000000000 > options:qos_burst=8000000000 options:network_name=phys > + > +# Let's now send ovn controller to sleep, so it will receive both ofport > notification and ls deletion simultaneously > +sleep_controller hv-1 > + > +# Tme to wake up ovs > +wake_up_ovs hv-1 > + > +# Delete lsp1 > +check ovn-nbctl --wait=sb lsp-del lsp1 > + > +# And finally wake up controller > +wake_up_controller hv-1 > + > +# Make sure ovn-controller is still OK > +ovn-nbctl --wait=hv sync > +OVS_WAIT_UNTIL([test $(as hv-1 ovs-vsctl list qos | grep -c linux-htb) > -eq 1]) > + > +AT_CLEANUP > +]) > diff --git a/tests/system-ovn.at b/tests/system-ovn.at > index 44a8072d6..18a79410e 100644 > --- a/tests/system-ovn.at > +++ b/tests/system-ovn.at > @@ -10897,20 +10897,6 @@ wait_for_local_bindings() { > [kill -CONT $(cat ovn-sb/ovsdb-server.pid)] > ) > } > -sleep_sb() { > - echo SB going to sleep > - AT_CHECK([kill -STOP $(cat ovn-sb/ovsdb-server.pid)]) > -} > -wake_up_sb() { > - echo SB waking up > - AT_CHECK([kill -CONT $(cat ovn-sb/ovsdb-server.pid)]) > -} > -sleep_controller() { > - echo Controller going to sleep > - ovn-appctl debug/pause > - OVS_WAIT_UNTIL([test x$(ovn-appctl -t ovn-controller debug/status) = > "xpaused"]) > -} > - > stop_ovsdb_controller_updates() { > TCP_PORT=$1 > echo Stopping updates from ovn-controller to ovsdb using port $TCP_PORT > @@ -10922,10 +10908,6 @@ restart_ovsdb_controller_updates() { > echo Restarting updates from ovn-controller to ovsdb > iptables -D INPUT -p tcp --destination-port $TCP_PORT -j DROP > } > -wake_up_controller() { > - echo Controller waking up > - ovn-appctl debug/resume > -} > ensure_controller_run() { > # We want to make sure controller could run at least one full loop. > # We can't use wait=hv as sb might be sleeping. > -- > 2.31.1 > > _______________________________________________ > dev mailing list > d...@openvswitch.org > https://mail.openvswitch.org/mailman/listinfo/ovs-dev > > Thanks, Ales -- Ales Musil Senior Software Engineer - OVN Core Red Hat EMEA <https://www.redhat.com> amu...@redhat.com IM: amusil <https://red.ht/sig> _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev