The keepalive thread sends heartbeats to the PMD threads; when a PMD fails to respond to successive heartbeats, that PMD is potentially stalled. The PMD state transition is as below:
ALIVE -> MISSING -> DEAD -> GONE. This commit enables PMD health checks when a PMD doesn't respond to heartbeats. This is needed to handle false negatives. With this commit, the new state transition is as below: ALIVE -> MISSING -> DEAD -> CHECK -> GONE. A PMD health-checking state (CHECK) is introduced and kicks in immediately when the PMD gets into the DEAD state. As part of the health check, the following are considered: - Link status of the ports polled by the PMD thread. - Statistics of the ports polled by the PMD thread. - PMD polling and processing cycles. Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodire...@intel.com> --- lib/keepalive.h | 3 +++ lib/netdev-dpdk.c | 39 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/lib/keepalive.h b/lib/keepalive.h index d8e55d5..6d1733c 100644 --- a/lib/keepalive.h +++ b/lib/keepalive.h @@ -71,6 +71,9 @@ struct keepalive_shm { /* Last seen timestamp of the core */ uint64_t core_last_seen_times[KEEPALIVE_MAXCORES]; + /* Number of PMD failures */ + uint32_t core_failures[KEEPALIVE_MAXCORES]; + /* Store pmd thread tid */ pid_t thread_id[KEEPALIVE_MAXCORES]; diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 74c1ab1..bb93fd8 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -606,6 +606,32 @@ dpdk_failcore_cb(void *ptr_data, const int core_id) } } +static void +dpdk_ka_handle_failure(struct keepalive_shm *ka_shm, const int core_id, + const enum rte_keepalive_state core_state, uint64_t last_alive) +{ + if ((ka_shm->core_state[core_id] != KA_STATE_CHECK) && + (ka_shm->core_state[core_id] == KA_STATE_DEAD)) { + ka_set_pmd_state(core_id, KA_STATE_CHECK); + } else { + /* The core failure has to be incremented only once when the + * state transition happens from CHECK -> GONE. + */ + if (ka_shm->core_state[core_id] == KA_STATE_CHECK) { + ka_shm->core_failures[core_id]++; + } + + /* Set the PMD core state to KA_STATE_GONE i.e failure. 
*/ + ka_set_pmd_state(core_id, core_state); + + if (ka_is_pmdhealth_check_needed(core_id)) { + ka_disable_pmd_health_check(core_id); + } + } + + ka_shm->core_last_seen_times[core_id] = last_alive; +} + /* Update the core state in shared memory. * * This function shall be invoked periodically to write the core status and @@ -632,10 +658,19 @@ dpdk_ka_update_core_state(void *ptr_data, const int core_id, ka_shm->core_state[core_id] = KA_STATE_ALIVE; ka_shm->core_last_seen_times[core_id] = last_alive; break; - case RTE_KA_STATE_DOZING: - case RTE_KA_STATE_SLEEP: case RTE_KA_STATE_DEAD: + /* Enable PMD health check here, as we are in penultimate state + * of declaring PMD as failed. */ + ka_enable_pmd_health_check(core_id); + + ka_shm->core_state[core_id] = core_state; + ka_shm->core_last_seen_times[core_id] = last_alive; + break; case RTE_KA_STATE_GONE: + dpdk_ka_handle_failure(ka_shm, core_id, core_state, last_alive); + break; + case RTE_KA_STATE_DOZING: + case RTE_KA_STATE_SLEEP: ka_shm->core_state[core_id] = core_state; ka_shm->core_last_seen_times[core_id] = last_alive; break; -- 2.4.11 _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev