>Bhanuprakash Bodireddy <bhanuprakash.bodire...@intel.com> writes:
>
>> This commit registers the packet processing PMD cores to keepalive
>> framework. Only PMDs that have rxqs mapped will be registered and
>> actively monitored by KA framework.
>>
>> This commit spawns a keepalive thread that will dispatch heartbeats to
>> PMD cores. The pmd threads respond to heartbeats by marking themselves
>> alive. As long as PMD responds to heartbeats it is considered 'healthy'.
>>
>> Signed-off-by: Bhanuprakash Bodireddy
>> <bhanuprakash.bodire...@intel.com>
>> ---
>
>I'm really confused with this patch.  I've stopped reviewing the series.
>
>It seems like there's a mix of 'track by core id' and 'track by thread id'.
>
>I don't think it's possible to do anything by core id.  We can never know what
>else has been scheduled on those cores, and we cannot be sure that a taskset
>or other scheduler provisioning call will move the threads.

[BHANU] I have already answered this in other thread. 
one can't correlate threads with cores and we shouldn't be tracking by cores. 
However with PMD threads 
there is 1:1 mapping of PMD and the core-id and it's safe to temporarily write 
PMD liveness info in to an array indexed
by core id before updating this in to HMAP. 

However as already mentioned, we are using tid for all other purposes as it is 
unique across the system.

>
>>  lib/dpif-netdev.c |  70 +++++++++++++++++++++++++
>>  lib/keepalive.c   | 153
>++++++++++++++++++++++++++++++++++++++++++++++++++----
>>  lib/keepalive.h   |  17 ++++++
>>  lib/util.c        |  23 ++++++++
>>  lib/util.h        |   2 +
>>  5 files changed, 254 insertions(+), 11 deletions(-)
>>
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index
>> e2cd931..84c7ffd 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -49,6 +49,7 @@
>>  #include "flow.h"
>>  #include "hmapx.h"
>>  #include "id-pool.h"
>> +#include "keepalive.h"
>>  #include "latch.h"
>>  #include "netdev.h"
>>  #include "netdev-vport.h"
>> @@ -978,6 +979,63 @@ sorted_poll_thread_list(struct dp_netdev *dp,
>>      *n = k;
>>  }
>>
>> +static void *
>> +ovs_keepalive(void *f_ OVS_UNUSED)
>> +{
>> +    pthread_detach(pthread_self());
>> +
>> +    for (;;) {
>> +        xusleep(get_ka_interval() * 1000);
>> +    }
>> +
>> +    return NULL;
>> +}
>> +
>> +static void
>> +ka_thread_start(struct dp_netdev *dp) {
>> +    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
>> +
>> +    if (ovsthread_once_start(&once)) {
>> +        ovs_thread_create("ovs_keepalive", ovs_keepalive, dp);
>> +
>> +        ovsthread_once_done(&once);
>> +    }
>> +}
>> +
>> +static void
>> +ka_register_datapath_threads(struct dp_netdev *dp) {
>> +    int ka_init = get_ka_init_status();
>> +    VLOG_DBG("Keepalive: Was initialization successful? [%s]",
>> +                ka_init ? "Success" : "Failure");
>> +    if (!ka_init) {
>> +        return;
>> +    }
>> +
>> +    ka_thread_start(dp);
>> +
>> +    struct dp_netdev_pmd_thread *pmd;
>> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>> +        /*  Register only PMD threads. */
>> +        if (pmd->core_id != NON_PMD_CORE_ID) {
>> +            int tid = ka_get_pmd_tid(pmd->core_id);
>> +
>> +            /* Skip PMD thread with no rxqs mapping. */
>
>why skip these pmds?  we should still watch them, and then we can
>correlated interesting events (for instance... when an rxq gets added whats
>the change in utilization, etc).

[BHANU]  We shoud skip the PMDs that has no rxqs mapped. This would happen in 
cases
where there are more PMD threads than the number of rxqs. 

More importantly a PMD thread with no mapped rxq will not even enter the 
receive loop and
will be in sleep state as below. 

$ ps -eLo tid,psr,comm,state | grep pmd
 51727   3 pmd61           R
 51747   0 pmd62           S
 51749   1 pmd63           S
 51750   2 pmd64           R
 51756   6 pmd65           S
 51758   7 pmd66           R
 51759   4 pmd67           R
 51760   5 pmd68           S

When an rxq gets added to a sleeping PMD thread, the datapath reconfiguration 
happens,
this time threads get registered to KA framework as below.

CP:  reconfigure_datapath() -> ka_register_datapath_threads() -> 
ka_register_thread().

>
>> +            if (OVS_UNLIKELY(!hmap_count(&pmd->poll_list))) {
>> +                /* rxq mapping changes due to reconfiguration,
>> +                 * if there are no rxqs mapped to PMD, unregister it. */
>> +                ka_unregister_thread(tid, true);
>> +                continue;
>> +            }
>> +
>> +            ka_register_thread(tid, true);
>> +            VLOG_INFO("Registered PMD thread [%d] on Core [%d] to KA
>framework",
>> +                      tid, pmd->core_id);
>> +        }
>> +    }
>> +}
>> +
>>  static void
>>  dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char
>*argv[],
>>                       void *aux)
>> @@ -3625,6 +3683,9 @@ reconfigure_datapath(struct dp_netdev *dp)
>>
>>      /* Reload affected pmd threads. */
>>      reload_affected_pmds(dp);
>> +
>> +    /* Register datapath threads to KA monitoring. */
>> +    ka_register_datapath_threads(dp);
>>  }
>>
>>  /* Returns true if one of the netdevs in 'dp' requires a
>> reconfiguration */ @@ -3824,6 +3885,8 @@ pmd_thread_main(void *f_)
>>
>>      poll_list = NULL;
>>
>> +    ka_store_pmd_id(pmd->core_id);
>> +
>>      /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
>>      ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
>>      ovs_numa_thread_setaffinity_core(pmd->core_id);
>> @@ -3859,6 +3922,9 @@ reload:
>>                                                        : PMD_CYCLES_IDLE);
>>          }
>>
>> +        /* Mark PMD thread alive. */
>> +        ka_mark_pmd_thread_alive(pmd->core_id);
>> +
>>          if (lc++ > 1024) {
>>              bool reload;
>>
>> @@ -3892,6 +3958,10 @@ reload:
>>      }
>>
>>      emc_cache_uninit(&pmd->flow_cache);
>> +
>> +    int tid = ka_get_pmd_tid(pmd->core_id);
>> +    ka_unregister_thread(tid, true);
>> +
>>      free(poll_list);
>>      pmd_free_cached_ports(pmd);
>>      return NULL;
>> diff --git a/lib/keepalive.c b/lib/keepalive.c index ac73a42..dfafaeb
>> 100644
>> --- a/lib/keepalive.c
>> +++ b/lib/keepalive.c
>> @@ -23,6 +23,7 @@
>>  #include "keepalive.h"
>>  #include "lib/vswitch-idl.h"
>>  #include "openvswitch/vlog.h"
>> +#include "process.h"
>>  #include "timeval.h"
>>
>>  VLOG_DEFINE_THIS_MODULE(keepalive);
>> @@ -48,6 +49,134 @@ ka_get_pmd_tid(unsigned core_idx)
>>      return -EINVAL;
>>  }
>>
>> +/* Return the Keepalive timer interval. */ inline uint32_t
>
>Same as before w.r.t. inline in c files.

[BHANU]  OK.

>
>> +get_ka_interval(void)
>> +{
>> +    return keepalive_timer_interval;
>> +}
>> +
>> +int
>> +get_ka_init_status(void)
>> +{
>> +    return ka_init_status;
>> +}
>> +
>> +void
>> +ka_store_pmd_id(unsigned core_idx)
>> +{
>> +    int tid = gettid();
>> +
>> +    if (ka_is_enabled()) {
>> +        ovs_assert(tid > 0);
>
>If someone changes pid_max to be a larger number than a 32-bit integer,
>won't this assert trigger?

[BHANU]  With my search skills, I see that PID_MAX_LIMIX can be set to 4 
million, so this shouldn't happen.

-----------------------"include/linux/threads.h"--------------------------------------
#define PID_MAX_DEFAULT (CONFIG_BASE_SMALL ? 0x1000 : 0x8000)

/*
 * A maximum of 4 million PIDs should be enough for a while.
 * [NOTE: PID/TIDs are limited to 2^29 ~= 500+ million, see futex.h.]
 */
#define PID_MAX_LIMIT (CONFIG_BASE_SMALL ? PAGE_SIZE * 8 : \
        (sizeof(long) > 4 ? 4 * 1024 * 1024 : PID_MAX_DEFAULT))
----------------------------------------------------------------------------------------------

Man page also has some reference to PIX_MAX_LIMIT and confirms the above 
finding:
http://man7.org/linux/man-pages/man5/proc.5.html


>
>> +        ka_info->thread_id[core_idx] = tid;
>> +    }
>> +}
>> +
>> +/* Register thread to KA framework. */ void ka_register_thread(int
>> +tid, bool thread_is_pmd) {
>> +    if (ka_is_enabled()) {
>> +        struct ka_process_info *ka_pinfo;
>> +        int core_num = -1;
>> +        char proc_name[18] = "UNDEFINED";
>> +
>> +        struct process_info pinfo;
>> +        int success = get_process_info(tid, &pinfo);
>> +        if (success) {
>> +            core_num = pinfo.core_id;
>> +            ovs_strlcpy(proc_name, pinfo.name, sizeof proc_name);
>> +        }
>> +
>> +        ovs_assert(core_num >= 0);
>> +
>> +        uint32_t hash = hash_int(tid, 0);
>> +        ovs_mutex_lock(&ka_info->proclist_mutex);
>> +        HMAP_FOR_EACH_WITH_HASH (ka_pinfo, node,
>> +                                 hash, &ka_info->process_list) {
>> +            /* PMD thread is already registered. */
>> +            if (ka_pinfo->tid == tid) {
>> +                goto out;
>> +            }
>> +        }
>> +
>> +        ka_pinfo = xmalloc(sizeof *ka_pinfo);
>> +        ka_pinfo->tid = tid;
>> +        ka_pinfo->heartbeats = true;
>> +        ka_pinfo->core_id = core_num;
>> +        ovs_strlcpy(ka_pinfo->name, proc_name, sizeof
>> + ka_pinfo->name);
>> +
>> +        hmap_insert(&ka_info->process_list, &ka_pinfo->node, hash);
>> +
>> +        if (thread_is_pmd) {
>
>Why treat these differently?  Get rid of the thread_is_pmd.

[BHANU]
  The API is named as ka_register_thread(), meaning it can register any thread 
be it PMD or non-PMD.
   In case of PMD thread, thread_is_pmd is set, and uses core tracking and 
increments pmd_cnt for bookkeeping. 

   In the future, if we implement registering of non-pmd threads this API can 
be called with thread_is_pmd == false.

>
>> +            ka_info->active_cores[core_num] = KA_STATE_ALIVE;
>> +            ka_info->last_alive[core_num] = time_wall_msec();
>
>As noted in the previous patch, there is no way to properly infer core usage
>from this, so get rid of all the "core based" tracking.
>
>> +            ka_info->pmd_cnt++;  /* Increment PMD count. */
>> +        } else {
>> +            ka_info->nonpmd_cnt++;  /* Increment non-pmd thread count. */
>> +        }
>> +out:
>> +        ovs_mutex_unlock(&ka_info->proclist_mutex);
>> +    }
>> +}
>> +
>> +/* Unregister thread from KA framework. */ void
>> +ka_unregister_thread(int tid, bool thread_is_pmd) {
>> +    if (ka_is_enabled()) {
>> +        struct ka_process_info *ka_pinfo;
>> +
>> +        int core_num = -1;
>> +        struct process_info pinfo;
>> +        if (get_process_info(tid, &pinfo)) {
>> +            core_num = pinfo.core_id;
>> +        }
>> +
>> +        ovs_assert(core_num >= 0);
>> +
>> +        ovs_mutex_lock(&ka_info->proclist_mutex);
>> +        HMAP_FOR_EACH_WITH_HASH (ka_pinfo, node, hash_int(tid, 0),
>> +                                 &ka_info->process_list) {
>> +            /* If PMD thread is registered, remove it from the list */
>> +            if (ka_pinfo->tid == tid) {
>> +                hmap_remove(&ka_info->process_list, &ka_pinfo->node);
>> +                free(ka_pinfo);
>> +
>> +                if (thread_is_pmd) {
>> +                    ka_info->active_cores[core_num] = KA_STATE_UNUSED;
>> +                    ka_info->pmd_cnt--;  /* Decrement PMD count. */
>> +                } else {
>> +                    ka_info->nonpmd_cnt--;  /* Decrement non-pmd thread 
>> cnt. */
>> +                }
>> +
>> +                break;
>> +            }
>> +        }
>> +
>> +        ovs_mutex_unlock(&ka_info->proclist_mutex);
>> +    }
>> +}
>> +
>> +/* Mark packet processing core alive. */ void
>> +ka_mark_pmd_thread_alive(unsigned core_id) {
>> +    if (ka_is_enabled()) {
>> +        ka_info->state_flags[core_id] = KA_STATE_ALIVE;
>
>shouldn't this be done by tid not coreid?

[BHANU]  The API is used by PMD threads and PMD thread pinned to *core_id* is 
marking itself ALIVE in packet processing loop.

>
>> +    }
>> +}
>> +
>> +/* Mark packet processing core as idle. */ inline void
>
>And again.
>
>> +ka_mark_pmd_thread_sleep(unsigned core_id) {
>> +    if (ka_is_enabled()) {
>> +        ka_info->state_flags[core_id] = KA_STATE_DOZING;
>> +    }
>> +}
>> +
>>  void
>>  ka_set_pmd_state_ts(unsigned core_id, enum keepalive_state state,
>>                      uint64_t last_alive) @@ -166,18 +295,20 @@
>> ka_init(const struct smap *ovs_other_config)
>
>This whole diff section is inappropriate.  Please don't add code in one patch
>and then remove it in the next.

[BHANU]  Its bit of realignment. I will fix it anyways.

>
>>  void
>>  ka_destroy(void)
>>  {
>> -    if (ka_info) {
>> -        struct ka_process_info *pinfo;
>> -
>> -        ovs_mutex_lock(&ka_info->proclist_mutex);
>> -        HMAP_FOR_EACH_POP (pinfo, node, &ka_info->process_list) {
>> -            free(pinfo);
>> -        }
>> -        ovs_mutex_unlock(&ka_info->proclist_mutex);
>> +    if (!ka_info) {
>> +        return;
>> +    }
>>
>> -        hmap_destroy(&ka_info->process_list);
>> -        ovs_mutex_destroy(&ka_info->proclist_mutex);
>> +    struct ka_process_info *pinfo;
>>
>> -        free(ka_info);
>> +    ovs_mutex_lock(&ka_info->proclist_mutex);
>> +    HMAP_FOR_EACH_POP (pinfo, node, &ka_info->process_list) {
>> +        free(pinfo);
>>      }
>> +    ovs_mutex_unlock(&ka_info->proclist_mutex);
>> +
>> +    hmap_destroy(&ka_info->process_list);
>> +    ovs_mutex_destroy(&ka_info->proclist_mutex);
>> +
>> +    free(ka_info);
>>  }
>> diff --git a/lib/keepalive.h b/lib/keepalive.h index d133398..607ee3b
>> 100644
>> --- a/lib/keepalive.h
>> +++ b/lib/keepalive.h
>> @@ -38,8 +38,10 @@ enum keepalive_state {  typedef void
>> (*ka_relay_cb)(int, enum keepalive_state, uint64_t, void *);
>>
>>  struct ka_process_info {
>> +    char name[16];
>>      int tid;
>>      int core_id;
>> +    bool heartbeats;
>>      enum keepalive_state core_state;
>>      uint64_t core_last_seen_times;
>>      struct hmap_node node;
>> @@ -52,6 +54,10 @@ struct keepalive_info {
>>      /* List of process/threads monitored by KA framework. */
>>      struct hmap process_list OVS_GUARDED;
>>
>> +    /* count of threads registered to KA framework. */
>> +    uint32_t pmd_cnt;
>> +    uint32_t nonpmd_cnt;
>> +
>
>What are these counts for?

[BHANU]  This is to track the number of PMD threads and non PMD threads on 
compute and displayed with 'Keepalive/pmd-health-show'

>
>>      /* Store Datapath threads 'tid'.
>>       * In case of DPDK there can be max of KA_DP_MAXCORES threads. */
>>      pid_t thread_id[KA_DP_MAXCORES];
>> @@ -84,4 +90,15 @@ void ka_update_core_state(const int, const enum
>> keepalive_state,
>>
>>  int ka_get_pmd_tid(unsigned core);
>>  bool ka_is_enabled(void);
>> +void ka_register_thread(int, bool);
>> +void ka_unregister_thread(int, bool); void
>> +ka_mark_pmd_thread_alive(unsigned);
>> +void ka_mark_pmd_thread_sleep(unsigned);
>> +
>> +void ka_store_pmd_id(unsigned core);
>> +uint32_t get_ka_interval(void);
>> +int get_ka_init_status(void);
>> +int ka_alloc_portstats(unsigned, int); void
>> +ka_destroy_portstats(void);
>> +
>>  #endif /* keepalive.h */
>> diff --git a/lib/util.c b/lib/util.c
>> index 36e3731..83f7e0a 100644
>> --- a/lib/util.c
>> +++ b/lib/util.c
>> @@ -26,6 +26,9 @@
>>  #include <stdlib.h>
>>  #include <string.h>
>>  #include <sys/stat.h>
>> +#ifdef __linux__
>> +#include <sys/syscall.h>
>> +#endif
>>  #include <unistd.h>
>>  #include "bitmap.h"
>>  #include "byte-order.h"
>> @@ -568,6 +571,16 @@ get_page_size(void)
>>      return cached;
>>  }
>>
>> +int
>> +gettid(void)
>> +{
>> +#ifdef __linux__
>> +    return (int)syscall(SYS_gettid);
>
>This returns a pid_t type.  Don't cast it to int, please.
>
>Also, call it ovs_gettid()

[BHANU] OK.

>
>> +#endif
>> +
>> +    return -EINVAL;
>> +}
>> +
>>  /* Returns the time at which the system booted, as the number of
>milliseconds
>>   * since the epoch, or 0 if the time of boot cannot be determined. */
>> long long int @@ -2197,6 +2210,16 @@ xsleep(unsigned int seconds)
>>      ovsrcu_quiesce_end();
>>  }
>>
>> +void
>> +xusleep(unsigned int microseconds)
>> +{
>> +    ovsrcu_quiesce_start();
>> +#ifdef __linux__
>> +    usleep(microseconds);
>
>I commented before on this not being put in a generic place since there's no
>equivalent.
>
>Further, according to the manpages:
>
>POSIX.1-2001  declares this function obsolete; use nanosleep(2) instead.
>POSIX.1-2008 removes  the  specification  of usleep().
>
>Please don't expose this through utils - ONLY linux will provide it, and it's
>possible at some future point that glibc removes it.
>
[BHANU] Can you suggest suitable place for this API?


>> +#endif
>> +    ovsrcu_quiesce_end();
>> +}
>> +
>>  /* Determine whether standard output is a tty or not. This is useful to
>decide
>>   * whether to use color output or not when --color option for utilities is 
>> set
>>   * to `auto`.
>> diff --git a/lib/util.h b/lib/util.h
>> index 764e0a0..f3cb61a 100644
>> --- a/lib/util.h
>> +++ b/lib/util.h
>> @@ -120,6 +120,7 @@ const char *get_subprogram_name(void);
>>
>>  unsigned int get_page_size(void);
>>  long long int get_boot_time(void);
>> +int gettid(void);
>>
>>  void ovs_print_version(uint8_t min_ofp, uint8_t max_ofp);
>>
>> @@ -489,6 +490,7 @@ ovs_u128_and(const ovs_u128 a, const ovs_u128 b)
>> }
>>
>>  void xsleep(unsigned int seconds);
>> +void xusleep(unsigned int microseconds);
>>
>>  bool is_stdout_a_tty(void);
_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to