From: Han Zhou <hz...@ovn.org>

Whenever OpenFlow connection between ovn-controller and OVS is
connected/reconnected, typically during ovn-controller/OVS
restart/upgrade, ovn-controller would:
1. clears all the existing flows after the initial hand-shaking
2. compute the new flows
3. install the new flows to OVS.

In large scale environments when there are a big number of flows
needs to be computed by ovn-controller, the step 2 and 3 above can take
very long time (from seconds to minutes, depending on the scale and
topology), which would cause a data plane down time.

The purpose of this patch is to reduce data plane down time during
ovn-controller restart/upgrade. It adds a new state S_WAIT_BEFORE_CLEAR
to the ofctrl state machine, so that ovn-controller waits for a period
of time before clearing the existing OVS flows while it is computing the
new flows. Ideally, ovn-controller should clear the flows after it
completes the new flows computing. However, it is difficult for
ovn-controller to judge if it has generated all the new flows (or the
flows that are sufficient to avoid down time), because flows computation
depends on iterations of SB monitor data processing and condition
changes. So, instead of try to determine by ovn-controller itself, this
patch provides a configuration "ovn-ofctrl-wait-before-clear" for users
to determine the feasible time to wait, according to the scale. As
mentioned in the document, the output of

  ovn-appctl -t ovn-controller inc-engine/recompute
  ovn-appctl -t ovn-controller stopwatch/show flow-generation

can help users determining what value to set.

Tested with ~200k ovs flows generated by ovn-controller on a system with 8
Armv8 A72 cores, with ovn-ofctrl-wait-before-clear set to 7000 (the
stopwatch/show flow-generation Maximum is 3596ms).

Without this patch, there were ~13 seconds ping failures during ovn-controller
restart.

With this patch, the ping failure reduced to ~6 seconds.
(The ~6 seconds down time was introduced by step 3 - install the new
flows to ovs, which will be addressed in future patches)

Signed-off-by: Han Zhou <hz...@ovn.org>
---
 controller/ofctrl.c             | 87 ++++++++++++++++++++++++++++-----
 controller/ofctrl.h             |  4 +-
 controller/ovn-controller.8.xml | 34 +++++++++++++
 controller/ovn-controller.c     |  6 +--
 tests/ovn-controller.at         | 55 ++++++++++++++++++++-
 5 files changed, 168 insertions(+), 18 deletions(-)

diff --git a/controller/ofctrl.c b/controller/ofctrl.c
index 72707baf9..ecabbcc9b 100644
--- a/controller/ofctrl.c
+++ b/controller/ofctrl.c
@@ -47,6 +47,7 @@
 #include "physical.h"
 #include "openvswitch/rconn.h"
 #include "socket-util.h"
+#include "timeval.h"
 #include "util.h"
 #include "vswitch-idl.h"
 
@@ -272,6 +273,7 @@ static unsigned int seqno;
     STATE(S_NEW)                                \
     STATE(S_TLV_TABLE_REQUESTED)                \
     STATE(S_TLV_TABLE_MOD_SENT)                 \
+    STATE(S_WAIT_BEFORE_CLEAR)                  \
     STATE(S_CLEAR_FLOWS)                        \
     STATE(S_UPDATE_FLOWS)
 enum ofctrl_state {
@@ -314,6 +316,14 @@ static uint64_t cur_cfg;
 /* Current state. */
 static enum ofctrl_state state;
 
+/* The time (ms) to stay in the state S_WAIT_BEFORE_CLEAR. Read from
+ * external_ids: ovn-ofctrl-wait-before-clear. */
+static unsigned int wait_before_clear_time = 0;
+
+/* The time when the state S_WAIT_BEFORE_CLEAR should complete.
+ * If the timer is not started yet, it is set to 0. */
+static long long int wait_before_clear_expire = 0;
+
 /* Transaction IDs for messages in flight to the switch. */
 static ovs_be32 xid, xid2;
 
@@ -419,18 +429,19 @@ recv_S_NEW(const struct ofp_header *oh OVS_UNUSED,
  * If we receive an NXT_TLV_TABLE_REPLY:
  *
  *     - If it contains our tunnel metadata option, assign its field ID to
- *       mff_ovn_geneve and transition to S_CLEAR_FLOWS.
+ *       mff_ovn_geneve and transition to S_WAIT_BEFORE_CLEAR.
  *
  *     - Otherwise, if there is an unused tunnel metadata field ID, send
  *       NXT_TLV_TABLE_MOD and OFPT_BARRIER_REQUEST, and transition to
  *       S_TLV_TABLE_MOD_SENT.
  *
  *     - Otherwise, log an error, disable Geneve, and transition to
- *       S_CLEAR_FLOWS.
+ *       S_WAIT_BEFORE_CLEAR.
  *
  * If we receive an OFPT_ERROR:
  *
- *     - Log an error, disable Geneve, and transition to S_CLEAR_FLOWS. */
+ *     - Log an error, disable Geneve, and transition to S_WAIT_BEFORE_CLEAR.
+ */
 
 static void
 run_S_TLV_TABLE_REQUESTED(void)
@@ -457,7 +468,7 @@ process_tlv_table_reply(const struct 
ofputil_tlv_table_reply *reply)
                 return false;
             } else {
                 mff_ovn_geneve = MFF_TUN_METADATA0 + map->index;
-                state = S_CLEAR_FLOWS;
+                state = S_WAIT_BEFORE_CLEAR;
                 return true;
             }
         }
@@ -524,7 +535,7 @@ recv_S_TLV_TABLE_REQUESTED(const struct ofp_header *oh, 
enum ofptype type,
 
     /* Error path. */
     mff_ovn_geneve = 0;
-    state = S_CLEAR_FLOWS;
+    state = S_WAIT_BEFORE_CLEAR;
 }
 
 /* S_TLV_TABLE_MOD_SENT, when NXT_TLV_TABLE_MOD and OFPT_BARRIER_REQUEST
@@ -536,12 +547,12 @@ recv_S_TLV_TABLE_REQUESTED(const struct ofp_header *oh, 
enum ofptype type,
  *       raced with some other controller.  Transition to S_NEW.
  *
  *     - Otherwise, log an error, disable Geneve, and transition to
- *       S_CLEAR_FLOWS.
+ *       S_WAIT_BEFORE_CLEAR.
  *
  * If we receive OFPT_BARRIER_REPLY:
  *
  *     - Set the tunnel metadata field ID to the one that we requested.
- *       Transition to S_CLEAR_FLOWS.
+ *       Transition to S_WAIT_BEFORE_CLEAR.
  */
 
 static void
@@ -556,7 +567,7 @@ recv_S_TLV_TABLE_MOD_SENT(const struct ofp_header *oh, enum 
ofptype type,
     if (oh->xid != xid && oh->xid != xid2) {
         ofctrl_recv(oh, type);
     } else if (oh->xid == xid2 && type == OFPTYPE_BARRIER_REPLY) {
-        state = S_CLEAR_FLOWS;
+        state = S_WAIT_BEFORE_CLEAR;
     } else if (oh->xid == xid && type == OFPTYPE_ERROR) {
         enum ofperr error = ofperr_decode_msg(oh, NULL);
         if (error == OFPERR_NXTTMFC_ALREADY_MAPPED ||
@@ -580,7 +591,36 @@ recv_S_TLV_TABLE_MOD_SENT(const struct ofp_header *oh, 
enum ofptype type,
     return;
 
 error:
-    state = S_CLEAR_FLOWS;
+    state = S_WAIT_BEFORE_CLEAR;
+}
+
+/* S_WAIT_BEFORE_CLEAR, we are almost ready to set up flows, but just wait for
+ * a while until the initial flow compute to complete before we clear the
+ * existing flows in OVS, so that we won't end up with an empty flow table,
+ * which may cause data plane down time. */
+static void
+run_S_WAIT_BEFORE_CLEAR(void)
+{
+    if (!wait_before_clear_time ||
+        (wait_before_clear_expire &&
+         time_msec() >= wait_before_clear_expire)) {
+        wait_before_clear_expire = 0;
+        state = S_CLEAR_FLOWS;
+        return;
+    }
+
+    if (!wait_before_clear_expire) {
+        /* Start the timer. */
+        wait_before_clear_expire = time_msec() + wait_before_clear_time;
+    }
+    poll_timer_wait_until(wait_before_clear_expire);
+}
+
+static void
+recv_S_WAIT_BEFORE_CLEAR(const struct ofp_header *oh, enum ofptype type,
+                         struct shash *pending_ct_zones OVS_UNUSED)
+{
+    ofctrl_recv(oh, type);
 }
 
 /* S_CLEAR_FLOWS, after we've established a Geneve metadata field ID and it's
@@ -719,7 +759,9 @@ ofctrl_get_mf_field_id(void)
  * hypervisor on which we are running.  Attempts to negotiate a Geneve option
  * field for class OVN_GENEVE_CLASS, type OVN_GENEVE_TYPE. */
 void
-ofctrl_run(const struct ovsrec_bridge *br_int, struct shash *pending_ct_zones)
+ofctrl_run(const struct ovsrec_bridge *br_int,
+           const struct ovsrec_open_vswitch_table *ovs_table,
+           struct shash *pending_ct_zones)
 {
     char *target = xasprintf("unix:%s/%s.mgmt", ovs_rundir(), br_int->name);
     if (strcmp(target, rconn_get_target(swconn))) {
@@ -746,6 +788,16 @@ ofctrl_run(const struct ovsrec_bridge *br_int, struct 
shash *pending_ct_zones)
             }
         }
     }
+    const struct ovsrec_open_vswitch *cfg =
+        ovsrec_open_vswitch_table_first(ovs_table);
+    ovs_assert(cfg);
+    unsigned int _wait_before_clear_time =
+        smap_get_uint(&cfg->external_ids, "ovn-ofctrl-wait-before-clear", 0);
+    if (_wait_before_clear_time != wait_before_clear_time) {
+        VLOG_INFO("ofctrl-wait-before-clear is now %u ms (was %u ms)",
+                  _wait_before_clear_time, wait_before_clear_time);
+        wait_before_clear_time = _wait_before_clear_time;
+    }
 
     bool progress = true;
     for (int i = 0; progress && i < 50; i++) {
@@ -2293,16 +2345,25 @@ update_installed_flows_by_track(struct 
ovn_desired_flow_table *flow_table,
     }
 }
 
+bool
+ofctrl_has_backlog(void)
+{
+    if (rconn_packet_counter_n_packets(tx_counter)
+        || rconn_get_version(swconn) < 0) {
+        return true;
+    }
+    return false;
+}
+
 /* The flow table can be updated if the connection to the switch is up and
  * in the correct state and not backlogged with existing flow_mods.  (Our
  * criteria for being backlogged appear very conservative, but the socket
  * between ovn-controller and OVS provides some buffering.) */
-bool
+static bool
 ofctrl_can_put(void)
 {
     if (state != S_UPDATE_FLOWS
-        || rconn_packet_counter_n_packets(tx_counter)
-        || rconn_get_version(swconn) < 0) {
+        || ofctrl_has_backlog()) {
         return false;
     }
     return true;
diff --git a/controller/ofctrl.h b/controller/ofctrl.h
index 014de210d..a0bfca852 100644
--- a/controller/ofctrl.h
+++ b/controller/ofctrl.h
@@ -28,6 +28,7 @@ struct hmap;
 struct match;
 struct ofpbuf;
 struct ovsrec_bridge;
+struct ovsrec_open_vswitch_table;
 struct sbrec_meter_table;
 struct shash;
 
@@ -50,6 +51,7 @@ void ofctrl_init(struct ovn_extend_table *group_table,
                  struct ovn_extend_table *meter_table,
                  int inactivity_probe_interval);
 void ofctrl_run(const struct ovsrec_bridge *br_int,
+                const struct ovsrec_open_vswitch_table *,
                 struct shash *pending_ct_zones);
 enum mf_field_id ofctrl_get_mf_field_id(void);
 void ofctrl_put(struct ovn_desired_flow_table *lflow_table,
@@ -59,7 +61,7 @@ void ofctrl_put(struct ovn_desired_flow_table *lflow_table,
                 uint64_t nb_cfg,
                 bool lflow_changed,
                 bool pflow_changed);
-bool ofctrl_can_put(void);
+bool ofctrl_has_backlog(void);
 void ofctrl_wait(void);
 void ofctrl_destroy(void);
 uint64_t ofctrl_get_cur_cfg(void);
diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml
index e9708fe64..68e7de2d4 100644
--- a/controller/ovn-controller.8.xml
+++ b/controller/ovn-controller.8.xml
@@ -257,6 +257,40 @@
         The default value is considered false if this option is not defined.
       </dd>
 
+      <dt><code>external_ids:ovn-ofctrl-wait-before-clear</code></dt>
+      <dd>
+        The time, in milliseconds, to wait before clearing flows in OVS after
+        OpenFlow connection/reconnection during <code>ovn-controller</code>
+        initialization. The purpose of this wait is to give time for
+        <code>ovn-controller</code> to compute the new flows before clearing
+        existing ones, to avoid data plane down time during
+        <code>ovn-controller</code> restart/upgrade at large scale
+        environments where recomputing the flows takes more than a few seconds
+        or even longer. It is difficult for <code>ovn-controller</code>
+        to determine when the new flows computing is completed, because of the
+        dynamics in the cloud environments, which is why this configuration is
+        provided for users to adjust based on the scale of the environment. By
+        default, it is 0, which means clearing existing flows without waiting.
+        Not setting the value, or setting it too small, may result in data
+        plane down time during upgrade/restart, while setting it too big may
+        result in unnecessary extra control plane latency of applying new
+        changes of CMS during upgrade/restart. In most cases, a slightly bigger
+        value is not harmful, because the extra control plane latency happens
+        only once during the OpenFlow connection. To get a reasonable range of
+        the value setting, it is recommended to run the below commands on a
+        node in the target environment and then set this configuration to twice
+        the value of <code>Maximum</code> shown in the output of the second
+        command.
+        <ul>
+          <li>
+            <code>ovn-appctl -t ovn-controller inc-engine/recompute</code>
+          </li>
+          <li>
+            <code>ovn-appctl -t ovn-controller stopwatch/show 
flow-generation</code>
+          </li>
+        </ul>
+      </dd>
+
       <dt><code>external_ids:ovn-enable-lflow-cache</code></dt>
       <dd>
         The boolean flag indicates if <code>ovn-controller</code> should
diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
index f5d749a2f..8bfa3a58e 100644
--- a/controller/ovn-controller.c
+++ b/controller/ovn-controller.c
@@ -3700,7 +3700,7 @@ main(int argc, char *argv[])
             if (br_int) {
                 ct_zones_data = engine_get_data(&en_ct_zones);
                 if (ct_zones_data) {
-                    ofctrl_run(br_int, &ct_zones_data->pending);
+                    ofctrl_run(br_int, ovs_table, &ct_zones_data->pending);
                 }
 
                 if (chassis) {
@@ -3715,7 +3715,7 @@ main(int argc, char *argv[])
                     stopwatch_start(CONTROLLER_LOOP_STOPWATCH_NAME,
                                     time_msec());
                     if (ovnsb_idl_txn) {
-                        if (!ofctrl_can_put()) {
+                        if (ofctrl_has_backlog()) {
                             /* When there are in-flight messages pending to
                              * ovs-vswitchd, we should hold on recomputing so
                              * that the previous flow installations won't be
@@ -3728,7 +3728,7 @@ main(int argc, char *argv[])
                              * change tracking is improved, we can simply skip
                              * this round of engine_run and continue processing
                              * acculated changes incrementally later when
-                             * ofctrl_can_put() returns true. */
+                             * ofctrl_has_backlog() returns false. */
                             engine_run(false);
                         } else {
                             engine_run(true);
diff --git a/tests/ovn-controller.at b/tests/ovn-controller.at
index bf76671de..d970dce7c 100644
--- a/tests/ovn-controller.at
+++ b/tests/ovn-controller.at
@@ -899,7 +899,6 @@ AT_CHECK([echo $(($lflow_run_new - $lflow_run_old))], [0], 
[2
 OVN_CLEANUP([hv1])
 AT_CLEANUP
 
-
 AT_SETUP([ovn-controller - check ovn-chassis-mac-mappings])
 
 ovn_start
@@ -907,6 +906,7 @@ ovn_start
 net_add n1
 sim_add hv1
 as hv1
+
 ovs-vsctl add-br br-phys
 ovn_attach n1 br-phys 192.168.0.1
 
@@ -982,3 +982,56 @@ OVS_WAIT_UNTIL([test 6 = $(as hv1 ovs-ofctl dump-flows 
br-int | grep -c $(port_b
 
 OVN_CLEANUP([hv1])
 AT_CLEANUP
+
+AT_SETUP([ovn-controller - ofctrl wait before clearing flows])
+
+ovn_start
+
+net_add n1
+sim_add hv1
+as hv1
+
+check ovs-vsctl add-br br-phys
+ovn_attach n1 br-phys 192.168.0.1
+check ovs-vsctl -- add-port br-int hv1-vif1 -- \
+    set interface hv1-vif1 external-ids:iface-id=ls1-lp1
+
+check ovn-nbctl ls-add ls1
+
+check ovn-nbctl --wait=hv lsp-add ls1 ls1-lp1 \
+-- lsp-set-addresses ls1-lp1 "f0:00:00:00:00:01 10.1.2.3"
+
+# Set 5 seconds wait time before clearing OVS flows.
+check ovs-vsctl set open . external_ids:ovn-ofctrl-wait-before-clear=5000
+
+# Stop ovn-controller
+OVS_APP_EXIT_AND_WAIT([ovn-controller])
+
+# The old OVS flows should remain (this is regardless of the configuration)
+AT_CHECK([ovs-ofctl dump-flows br-int | grep 10.1.2.3], [0], [ignore])
+
+# Make a change to the ls1-lp1's IP
+check ovn-nbctl --wait=sb lsp-set-addresses ls1-lp1 "f0:00:00:00:00:01 
10.1.2.4"
+
+# Start ovn-controller, which should compute new flows but not apply them
+# until the wait time is completed.
+start_daemon ovn-controller
+sleep 2
+
+# Check in the middle of the wait.
+lflow_run_1=$(ovn-appctl -t ovn-controller coverage/read-counter lflow_run)
+AT_CHECK([ovs-ofctl dump-flows br-int | grep 10.1.2.3], [0], [ignore])
+
+sleep 5
+
+# Check after the wait
+OVS_WAIT_UNTIL([ovs-ofctl dump-flows br-int | grep 10.1.2.4], [0], [ignore])
+lflow_run_2=$(ovn-appctl -t ovn-controller coverage/read-counter lflow_run)
+
+# Verify that the flow compute completed during the wait (after the wait it
+# merely installs the new flows).
+AT_CHECK_UNQUOTED([echo $lflow_run_1], [0], [$lflow_run_2
+])
+
+OVN_CLEANUP([hv1])
+AT_CLEANUP
-- 
2.37.1

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to