Bhanuprakash Bodireddy <bhanuprakash.bodire...@intel.com> writes: > This commit registers the packet processing PMD cores to keepalive > framework. Only PMDs that have rxqs mapped will be registered and > actively monitored by KA framework. > > This commit spawns a keepalive thread that will dispatch heartbeats to > PMD cores. The pmd threads respond to heartbeats by marking themselves > alive. As long as PMD responds to heartbeats it is considered 'healthy'. > > Signed-off-by: Bhanuprakash Bodireddy <bhanuprakash.bodire...@intel.com> > ---
I'm really confused with this patch. I've stopped reviewing the series. It seems like there's a mix of 'track by core id' and 'track by thread id'. I don't think it's possible to do anything by core id. We can never know what else has been scheduled on those cores, and we cannot be sure that a taskset or other scheduler provisioning call will move the threads. > lib/dpif-netdev.c | 70 +++++++++++++++++++++++++ > lib/keepalive.c | 153 > ++++++++++++++++++++++++++++++++++++++++++++++++++---- > lib/keepalive.h | 17 ++++++ > lib/util.c | 23 ++++++++ > lib/util.h | 2 + > 5 files changed, 254 insertions(+), 11 deletions(-) > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > index e2cd931..84c7ffd 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -49,6 +49,7 @@ > #include "flow.h" > #include "hmapx.h" > #include "id-pool.h" > +#include "keepalive.h" > #include "latch.h" > #include "netdev.h" > #include "netdev-vport.h" > @@ -978,6 +979,63 @@ sorted_poll_thread_list(struct dp_netdev *dp, > *n = k; > } > > +static void * > +ovs_keepalive(void *f_ OVS_UNUSED) > +{ > + pthread_detach(pthread_self()); > + > + for (;;) { > + xusleep(get_ka_interval() * 1000); > + } > + > + return NULL; > +} > + > +static void > +ka_thread_start(struct dp_netdev *dp) > +{ > + static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; > + > + if (ovsthread_once_start(&once)) { > + ovs_thread_create("ovs_keepalive", ovs_keepalive, dp); > + > + ovsthread_once_done(&once); > + } > +} > + > +static void > +ka_register_datapath_threads(struct dp_netdev *dp) > +{ > + int ka_init = get_ka_init_status(); > + VLOG_DBG("Keepalive: Was initialization successful? [%s]", > + ka_init ? "Success" : "Failure"); > + if (!ka_init) { > + return; > + } > + > + ka_thread_start(dp); > + > + struct dp_netdev_pmd_thread *pmd; > + CMAP_FOR_EACH (pmd, node, &dp->poll_threads) { > + /* Register only PMD threads. */ > + if (pmd->core_id != NON_PMD_CORE_ID) { > + int tid = ka_get_pmd_tid(pmd->core_id); > + > + /* Skip PMD thread with no rxqs mapping. */ why skip these pmds? we should still watch them, and then we can correlated interesting events (for instance... when an rxq gets added whats the change in utilization, etc). > + if (OVS_UNLIKELY(!hmap_count(&pmd->poll_list))) { > + /* rxq mapping changes due to reconfiguration, > + * if there are no rxqs mapped to PMD, unregister it. */ > + ka_unregister_thread(tid, true); > + continue; > + } > + > + ka_register_thread(tid, true); > + VLOG_INFO("Registered PMD thread [%d] on Core [%d] to KA > framework", > + tid, pmd->core_id); > + } > + } > +} > + > static void > dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[], > void *aux) > @@ -3625,6 +3683,9 @@ reconfigure_datapath(struct dp_netdev *dp) > > /* Reload affected pmd threads. */ > reload_affected_pmds(dp); > + > + /* Register datapath threads to KA monitoring. */ > + ka_register_datapath_threads(dp); > } > > /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */ > @@ -3824,6 +3885,8 @@ pmd_thread_main(void *f_) > > poll_list = NULL; > > + ka_store_pmd_id(pmd->core_id); > + > /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */ > ovsthread_setspecific(pmd->dp->per_pmd_key, pmd); > ovs_numa_thread_setaffinity_core(pmd->core_id); > @@ -3859,6 +3922,9 @@ reload: > : PMD_CYCLES_IDLE); > } > > + /* Mark PMD thread alive. */ > + ka_mark_pmd_thread_alive(pmd->core_id); > + > if (lc++ > 1024) { > bool reload; > > @@ -3892,6 +3958,10 @@ reload: > } > > emc_cache_uninit(&pmd->flow_cache); > + > + int tid = ka_get_pmd_tid(pmd->core_id); > + ka_unregister_thread(tid, true); > + > free(poll_list); > pmd_free_cached_ports(pmd); > return NULL; > diff --git a/lib/keepalive.c b/lib/keepalive.c > index ac73a42..dfafaeb 100644 > --- a/lib/keepalive.c > +++ b/lib/keepalive.c > @@ -23,6 +23,7 @@ > #include "keepalive.h" > #include "lib/vswitch-idl.h" > #include "openvswitch/vlog.h" > +#include "process.h" > #include "timeval.h" > > VLOG_DEFINE_THIS_MODULE(keepalive); > @@ -48,6 +49,134 @@ ka_get_pmd_tid(unsigned core_idx) > return -EINVAL; > } > > +/* Return the Keepalive timer interval. */ > +inline uint32_t Same as before w.r.t. inline in c files. > +get_ka_interval(void) > +{ > + return keepalive_timer_interval; > +} > + > +int > +get_ka_init_status(void) > +{ > + return ka_init_status; > +} > + > +void > +ka_store_pmd_id(unsigned core_idx) > +{ > + int tid = gettid(); > + > + if (ka_is_enabled()) { > + ovs_assert(tid > 0); If someone changes pid_max to be a larger number than a 32-bit integer, won't this assert trigger? > + ka_info->thread_id[core_idx] = tid; > + } > +} > + > +/* Register thread to KA framework. */ > +void > +ka_register_thread(int tid, bool thread_is_pmd) > +{ > + if (ka_is_enabled()) { > + struct ka_process_info *ka_pinfo; > + int core_num = -1; > + char proc_name[18] = "UNDEFINED"; > + > + struct process_info pinfo; > + int success = get_process_info(tid, &pinfo); > + if (success) { > + core_num = pinfo.core_id; > + ovs_strlcpy(proc_name, pinfo.name, sizeof proc_name); > + } > + > + ovs_assert(core_num >= 0); > + > + uint32_t hash = hash_int(tid, 0); > + ovs_mutex_lock(&ka_info->proclist_mutex); > + HMAP_FOR_EACH_WITH_HASH (ka_pinfo, node, > + hash, &ka_info->process_list) { > + /* PMD thread is already registered. */ > + if (ka_pinfo->tid == tid) { > + goto out; > + } > + } > + > + ka_pinfo = xmalloc(sizeof *ka_pinfo); > + ka_pinfo->tid = tid; > + ka_pinfo->heartbeats = true; > + ka_pinfo->core_id = core_num; > + ovs_strlcpy(ka_pinfo->name, proc_name, sizeof ka_pinfo->name); > + > + hmap_insert(&ka_info->process_list, &ka_pinfo->node, hash); > + > + if (thread_is_pmd) { Why treat these differently? Get rid of the thread_is_pmd. > + ka_info->active_cores[core_num] = KA_STATE_ALIVE; > + ka_info->last_alive[core_num] = time_wall_msec(); As noted in the previous patch, there is no way to properly infer core usage from this, so get rid of all the "core based" tracking. > + ka_info->pmd_cnt++; /* Increment PMD count. */ > + } else { > + ka_info->nonpmd_cnt++; /* Increment non-pmd thread count. */ > + } > +out: > + ovs_mutex_unlock(&ka_info->proclist_mutex); > + } > +} > + > +/* Unregister thread from KA framework. */ > +void > +ka_unregister_thread(int tid, bool thread_is_pmd) > +{ > + if (ka_is_enabled()) { > + struct ka_process_info *ka_pinfo; > + > + int core_num = -1; > + struct process_info pinfo; > + if (get_process_info(tid, &pinfo)) { > + core_num = pinfo.core_id; > + } > + > + ovs_assert(core_num >= 0); > + > + ovs_mutex_lock(&ka_info->proclist_mutex); > + HMAP_FOR_EACH_WITH_HASH (ka_pinfo, node, hash_int(tid, 0), > + &ka_info->process_list) { > + /* If PMD thread is registered, remove it from the list */ > + if (ka_pinfo->tid == tid) { > + hmap_remove(&ka_info->process_list, &ka_pinfo->node); > + free(ka_pinfo); > + > + if (thread_is_pmd) { > + ka_info->active_cores[core_num] = KA_STATE_UNUSED; > + ka_info->pmd_cnt--; /* Decrement PMD count. */ > + } else { > + ka_info->nonpmd_cnt--; /* Decrement non-pmd thread cnt. > */ > + } > + > + break; > + } > + } > + > + ovs_mutex_unlock(&ka_info->proclist_mutex); > + } > +} > + > +/* Mark packet processing core alive. */ > +void > +ka_mark_pmd_thread_alive(unsigned core_id) > +{ > + if (ka_is_enabled()) { > + ka_info->state_flags[core_id] = KA_STATE_ALIVE; shouldn't this be done by tid not coreid? > + } > +} > + > +/* Mark packet processing core as idle. */ > +inline void And again. > +ka_mark_pmd_thread_sleep(unsigned core_id) > +{ > + if (ka_is_enabled()) { > + ka_info->state_flags[core_id] = KA_STATE_DOZING; > + } > +} > + > void > ka_set_pmd_state_ts(unsigned core_id, enum keepalive_state state, > uint64_t last_alive) > @@ -166,18 +295,20 @@ ka_init(const struct smap *ovs_other_config) This whole diff section is inappropriate. Please don't add code in one patch and then remove it in the next. > void > ka_destroy(void) > { > - if (ka_info) { > - struct ka_process_info *pinfo; > - > - ovs_mutex_lock(&ka_info->proclist_mutex); > - HMAP_FOR_EACH_POP (pinfo, node, &ka_info->process_list) { > - free(pinfo); > - } > - ovs_mutex_unlock(&ka_info->proclist_mutex); > + if (!ka_info) { > + return; > + } > > - hmap_destroy(&ka_info->process_list); > - ovs_mutex_destroy(&ka_info->proclist_mutex); > + struct ka_process_info *pinfo; > > - free(ka_info); > + ovs_mutex_lock(&ka_info->proclist_mutex); > + HMAP_FOR_EACH_POP (pinfo, node, &ka_info->process_list) { > + free(pinfo); > } > + ovs_mutex_unlock(&ka_info->proclist_mutex); > + > + hmap_destroy(&ka_info->process_list); > + ovs_mutex_destroy(&ka_info->proclist_mutex); > + > + free(ka_info); > } > diff --git a/lib/keepalive.h b/lib/keepalive.h > index d133398..607ee3b 100644 > --- a/lib/keepalive.h > +++ b/lib/keepalive.h > @@ -38,8 +38,10 @@ enum keepalive_state { > typedef void (*ka_relay_cb)(int, enum keepalive_state, uint64_t, void *); > > struct ka_process_info { > + char name[16]; > int tid; > int core_id; > + bool heartbeats; > enum keepalive_state core_state; > uint64_t core_last_seen_times; > struct hmap_node node; > @@ -52,6 +54,10 @@ struct keepalive_info { > /* List of process/threads monitored by KA framework. */ > struct hmap process_list OVS_GUARDED; > > + /* count of threads registered to KA framework. */ > + uint32_t pmd_cnt; > + uint32_t nonpmd_cnt; > + What are these counts for? > /* Store Datapath threads 'tid'. > * In case of DPDK there can be max of KA_DP_MAXCORES threads. */ > pid_t thread_id[KA_DP_MAXCORES]; > @@ -84,4 +90,15 @@ void ka_update_core_state(const int, const enum > keepalive_state, > > int ka_get_pmd_tid(unsigned core); > bool ka_is_enabled(void); > +void ka_register_thread(int, bool); > +void ka_unregister_thread(int, bool); > +void ka_mark_pmd_thread_alive(unsigned); > +void ka_mark_pmd_thread_sleep(unsigned); > + > +void ka_store_pmd_id(unsigned core); > +uint32_t get_ka_interval(void); > +int get_ka_init_status(void); > +int ka_alloc_portstats(unsigned, int); > +void ka_destroy_portstats(void); > + > #endif /* keepalive.h */ > diff --git a/lib/util.c b/lib/util.c > index 36e3731..83f7e0a 100644 > --- a/lib/util.c > +++ b/lib/util.c > @@ -26,6 +26,9 @@ > #include <stdlib.h> > #include <string.h> > #include <sys/stat.h> > +#ifdef __linux__ > +#include <sys/syscall.h> > +#endif > #include <unistd.h> > #include "bitmap.h" > #include "byte-order.h" > @@ -568,6 +571,16 @@ get_page_size(void) > return cached; > } > > +int > +gettid(void) > +{ > +#ifdef __linux__ > + return (int)syscall(SYS_gettid); This returns a pid_t type. Don't cast it to int, please. Also, call it ovs_gettid() > +#endif > + > + return -EINVAL; > +} > + > /* Returns the time at which the system booted, as the number of milliseconds > * since the epoch, or 0 if the time of boot cannot be determined. */ > long long int > @@ -2197,6 +2210,16 @@ xsleep(unsigned int seconds) > ovsrcu_quiesce_end(); > } > > +void > +xusleep(unsigned int microseconds) > +{ > + ovsrcu_quiesce_start(); > +#ifdef __linux__ > + usleep(microseconds); I commented before on this not being put in a generic place since there's no equivalent. Further, according to the manpages: POSIX.1-2001 declares this function obsolete; use nanosleep(2) instead. POSIX.1-2008 removes the specification of usleep(). Please don't expose this through utils - ONLY linux will provide it, and it's possible at some future point that glibc removes it. > +#endif > + ovsrcu_quiesce_end(); > +} > + > /* Determine whether standard output is a tty or not. This is useful to > decide > * whether to use color output or not when --color option for utilities is > set > * to `auto`. > diff --git a/lib/util.h b/lib/util.h > index 764e0a0..f3cb61a 100644 > --- a/lib/util.h > +++ b/lib/util.h > @@ -120,6 +120,7 @@ const char *get_subprogram_name(void); > > unsigned int get_page_size(void); > long long int get_boot_time(void); > +int gettid(void); > > void ovs_print_version(uint8_t min_ofp, uint8_t max_ofp); > > @@ -489,6 +490,7 @@ ovs_u128_and(const ovs_u128 a, const ovs_u128 b) > } > > void xsleep(unsigned int seconds); > +void xusleep(unsigned int microseconds); > > bool is_stdout_a_tty(void); _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev