The keepalive thread sends heartbeats to the PMD threads; when a PMD fails
to respond to successive heartbeats, it is potentially stalled. The PMD
state transitions are currently as follows:

ALIVE -> MISSING -> DEAD -> GONE

This commit enables PMD health checks when a PMD doesn't respond to
heartbeats. This is needed to handle false negatives. With this commit,
the new state transitions are as follows:

ALIVE -> MISSING -> DEAD -> CHECK -> GONE

A PMD health-checking state is introduced and kicks in immediately when
the PMD enters the DEAD state. As part of the health check, the following
are considered:

  - Link status of the ports polled by PMD thread.
  - Statistics of the ports polled by PMD thread.
  - PMD polling and processing cycles.

Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodire...@intel.com>
---
 lib/keepalive.h   |  3 +++
 lib/netdev-dpdk.c | 39 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/lib/keepalive.h b/lib/keepalive.h
index d8e55d5..6d1733c 100644
--- a/lib/keepalive.h
+++ b/lib/keepalive.h
@@ -71,6 +71,9 @@ struct keepalive_shm {
     /* Last seen timestamp of the core */
     uint64_t core_last_seen_times[KEEPALIVE_MAXCORES];
 
+    /* Number of PMD failures */
+    uint32_t core_failures[KEEPALIVE_MAXCORES];
+
     /* Store pmd thread tid */
     pid_t thread_id[KEEPALIVE_MAXCORES];
 
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 74c1ab1..bb93fd8 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -606,6 +606,32 @@ dpdk_failcore_cb(void *ptr_data, const int core_id)
     }
 }
 
+/* Handles a core reported as failed: a core in DEAD state is moved to CHECK;
+ * otherwise 'core_state' is recorded.  Always stores 'last_alive'. */
+static void
+dpdk_ka_handle_failure(struct keepalive_shm *ka_shm, const int core_id,
+               const enum rte_keepalive_state core_state, uint64_t last_alive)
+{
+    if (ka_shm->core_state[core_id] == KA_STATE_DEAD) {
+        ka_set_pmd_state(core_id, KA_STATE_CHECK);
+    } else {
+        /* Count the failure only once, on the CHECK -> GONE transition. */
+        if (ka_shm->core_state[core_id] == KA_STATE_CHECK) {
+            ka_shm->core_failures[core_id]++;
+        }
+
+        /* Set the PMD core state to KA_STATE_GONE i.e failure. */
+        ka_set_pmd_state(core_id, core_state);
+
+        /* Turn off the PMD health check if it is still flagged as needed. */
+        if (ka_is_pmdhealth_check_needed(core_id)) {
+            ka_disable_pmd_health_check(core_id);
+        }
+    }
+
+    ka_shm->core_last_seen_times[core_id] = last_alive;
+}
+
 /* Update the core state in shared memory.
  *
  * This function shall be invoked periodically to write the core status and
@@ -632,10 +658,19 @@ dpdk_ka_update_core_state(void *ptr_data, const int 
core_id,
         ka_shm->core_state[core_id] = KA_STATE_ALIVE;
         ka_shm->core_last_seen_times[core_id] = last_alive;
         break;
-    case RTE_KA_STATE_DOZING:
-    case RTE_KA_STATE_SLEEP:
     case RTE_KA_STATE_DEAD:
+        /* Enable PMD health check here, as we are in penultimate state
+         * of declaring PMD as failed. */
+        ka_enable_pmd_health_check(core_id);
+
+        ka_shm->core_state[core_id] = core_state;
+        ka_shm->core_last_seen_times[core_id] = last_alive;
+        break;
     case RTE_KA_STATE_GONE:
+        dpdk_ka_handle_failure(ka_shm, core_id, core_state, last_alive);
+        break;
+    case RTE_KA_STATE_DOZING:
+    case RTE_KA_STATE_SLEEP:
         ka_shm->core_state[core_id] = core_state;
         ka_shm->core_last_seen_times[core_id] = last_alive;
         break;
-- 
2.4.11

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to