Whenever an OpenFlow error is returned by OvS, trigger a reconnect of
the OpenFlow (rconn) connection.  This will clear any installed OpenFlow
rules/groups. To ensure consistency, trigger a full I-P recompute too.

An example of scenario that can result in an OpenFlow error returned by
OvS follows (describing two main loop iterations in ovn-controller):
  - Iteration I:
    a. get updates from SB
    b. process these updates and generate "desired" openflows (lets assume
    this generates quite a lot of desired openflow modifications)
    c.1. add bundle-open msg to rconn
    c.2. add openflow mod msgs to rconn (only some of these make it through,
    the rest gets queued, the rconn is backlogged at this point).
    c.3. add bundle-commit msg to rconn (this gets queued)

  - Iteration II:
    a. get updates from SB (rconn is still backlogged)
    b. process the updates and generate "desired" openflows (lets assume
    this takes 10+ seconds for the specific SB updates)

At some point, while step II.b was being executed OvS declared the bundle
operation (started at I.c.1) timeout.  We now act on this error by
reconnecting which in turn triggers a flush of the rconn backlog and
gives more chance to the next full recompute to succeed in installing
all flows.

Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2134880
Reported-by: François Rigault <fr...@amadeus.com>
CC: Ilya Maximets <i.maxim...@ovn.org>
Signed-off-by: Dumitru Ceara <dce...@redhat.com>
---
 controller/ofctrl.c         | 17 ++++++++++++++---
 controller/ofctrl.h         |  2 +-
 controller/ovn-controller.c | 10 ++++++++--
 3 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/controller/ofctrl.c b/controller/ofctrl.c
index b1ba1c743a..1da23bc27e 100644
--- a/controller/ofctrl.c
+++ b/controller/ofctrl.c
@@ -766,13 +766,18 @@ ofctrl_get_mf_field_id(void)
 
 /* Runs the OpenFlow state machine against 'br_int', which is local to the
  * hypervisor on which we are running.  Attempts to negotiate a Geneve option
- * field for class OVN_GENEVE_CLASS, type OVN_GENEVE_TYPE. */
-void
+ * field for class OVN_GENEVE_CLASS, type OVN_GENEVE_TYPE.
+ *
+ * Returns 'true' if an OpenFlow reconnect happened; 'false' otherwise.
+ */
+bool
 ofctrl_run(const struct ovsrec_bridge *br_int,
            const struct ovsrec_open_vswitch_table *ovs_table,
            struct shash *pending_ct_zones)
 {
     char *target = xasprintf("unix:%s/%s.mgmt", ovs_rundir(), br_int->name);
+    bool reconnected = false;
+
     if (strcmp(target, rconn_get_target(swconn))) {
         VLOG_INFO("%s: connecting to switch", target);
         rconn_connect(swconn, target, target);
@@ -782,10 +787,12 @@ ofctrl_run(const struct ovsrec_bridge *br_int,
     rconn_run(swconn);
 
     if (!rconn_is_connected(swconn)) {
-        return;
+        goto done;
     }
+
     if (seqno != rconn_get_connection_seqno(swconn)) {
         seqno = rconn_get_connection_seqno(swconn);
+        reconnected = true;
         state = S_NEW;
 
         /* Reset the state of any outstanding ct flushes to resend them. */
@@ -855,6 +862,9 @@ ofctrl_run(const struct ovsrec_bridge *br_int,
          * point, so ensure that we come back again without waiting. */
         poll_immediate_wake();
     }
+
+done:
+    return reconnected;
 }
 
 void
@@ -909,6 +919,7 @@ ofctrl_recv(const struct ofp_header *oh, enum ofptype type)
     } else if (type == OFPTYPE_ERROR) {
         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30, 300);
         log_openflow_rl(&rl, VLL_INFO, oh, "OpenFlow error");
+        rconn_reconnect(swconn);
     } else {
         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30, 300);
         log_openflow_rl(&rl, VLL_DBG, oh, "OpenFlow packet ignored");
diff --git a/controller/ofctrl.h b/controller/ofctrl.h
index f5751e3ee4..105f9370be 100644
--- a/controller/ofctrl.h
+++ b/controller/ofctrl.h
@@ -51,7 +51,7 @@ struct ovn_desired_flow_table {
 void ofctrl_init(struct ovn_extend_table *group_table,
                  struct ovn_extend_table *meter_table,
                  int inactivity_probe_interval);
-void ofctrl_run(const struct ovsrec_bridge *br_int,
+bool ofctrl_run(const struct ovsrec_bridge *br_int,
                 const struct ovsrec_open_vswitch_table *,
                 struct shash *pending_ct_zones);
 enum mf_field_id ofctrl_get_mf_field_id(void);
diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
index 1151d36644..b301c50157 100644
--- a/controller/ovn-controller.c
+++ b/controller/ovn-controller.c
@@ -5075,8 +5075,14 @@ main(int argc, char *argv[])
 
             if (br_int) {
                 ct_zones_data = engine_get_data(&en_ct_zones);
-                if (ct_zones_data) {
-                    ofctrl_run(br_int, ovs_table, &ct_zones_data->pending);
+                if (ct_zones_data && ofctrl_run(br_int, ovs_table,
+                                                &ct_zones_data->pending)) {
+                    static struct vlog_rate_limit rl
+                            = VLOG_RATE_LIMIT_INIT(1, 1);
+
+                    VLOG_INFO_RL(&rl, "OVS OpenFlow connection reconnected,"
+                                      "force recompute.");
+                    engine_set_force_recompute(true);
                 }
 
                 if (chassis) {
-- 
2.31.1

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to