On 02.12.2019 17:03, David Marchand wrote:
> Most DPDK components make the assumption that rte_lcore_id() returns
> a valid lcore_id in the [0, RTE_MAX_LCORE) range (with the exception
> of the LCORE_ID_ANY special value).

Do you think that makes practical sense?  In real virtualization
environments users usually use the CPUs with lower numbers, and it's
most likely possible to change the NUMA layout so that CPUs from all
the nodes are enumerated starting from the low CPU numbers.

It's also uncommon to use all the CPUs for just OVS on a big system
with a huge number of cores.

Another thing: can we really do this at the DPDK level?  Maybe it
could be a step toward dynamic registering/unregistering of non-EAL
threads?  Since you're wiping out the meaning of an lcore as a CPU
core, it becomes just a thread id from the DPDK point of view.  So,
maybe the application could call a new API function instead of
managing these strange mappings by itself?
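
To make that idea concrete, here is a rough sketch of what such an EAL
API could look like (the names rte_thread_register()/unregister() are
hypothetical, nothing like this exists in DPDK today):

    /* Called from a non-EAL thread: picks a free lcore id, sets up the
     * per-thread state (_lcore_id, _cpuset, NUMA information) and
     * returns a negative value if no free id is left. */
    int rte_thread_register(void);

    /* Releases the lcore id held by the calling thread. */
    void rte_thread_unregister(void);

With something like that, pmd_thread_main() would just register and
unregister itself instead of OVS maintaining its own pool of lcore ids.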

One comment inline (not a full review).

Best regards, Ilya Maximets.

> 
> OVS does not currently check which value is set in
> RTE_PER_LCORE(_lcore_id), which exposes us to potential crashes on
> the DPDK side.
> 
> Introduce an lcore allocator in OVS for PMD threads and map them to
> unused lcores from DPDK à la --lcores.
> 
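For readers not familiar with it: an EAL --lcores argument such as
"0@8,1@10" pins lcore 0 to physical CPU 8 and lcore 1 to CPU 10.  The
auto-determine path in the patch builds a minimal mapping of exactly
this form:

    /* Run lcore 0 on the CPU the main thread is pinned to. */
    svec_add(&args, "--lcores");
    svec_add_nocopy(&args, xasprintf("0@%d", cpu));
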
> The physical cores on which the PMD threads are running still
> constitute important information when debugging, so keep those in
> the PMD thread names, but add a new debug log when starting them.
> 
> Synchronize DPDK internals on NUMA and cpuset for the PMD threads by
> registering them via the rte_thread_set_affinity() helper.
> 
> Signed-off-by: David Marchand <david.march...@redhat.com>
> ---
>  lib/dpdk-stub.c   |  8 +++++-
>  lib/dpdk.c        | 69 +++++++++++++++++++++++++++++++++++++++++++----
>  lib/dpdk.h        |  3 ++-
>  lib/dpif-netdev.c |  3 ++-
>  4 files changed, 75 insertions(+), 8 deletions(-)
> 
> diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c
> index c332c217c..90473bc8e 100644
> --- a/lib/dpdk-stub.c
> +++ b/lib/dpdk-stub.c
> @@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config)
>  }
>  
>  void
> -dpdk_set_lcore_id(unsigned cpu OVS_UNUSED)
> +dpdk_init_thread_context(unsigned cpu OVS_UNUSED)
> +{
> +    /* Nothing */
> +}
> +
> +void
> +dpdk_uninit_thread_context(void)
>  {
>      /* Nothing */
>  }
> diff --git a/lib/dpdk.c b/lib/dpdk.c
> index 21dd47e80..771baa413 100644
> --- a/lib/dpdk.c
> +++ b/lib/dpdk.c
> @@ -33,6 +33,7 @@
>  
>  #include "dirs.h"
>  #include "fatal-signal.h"
> +#include "id-pool.h"
>  #include "netdev-dpdk.h"
>  #include "netdev-offload-provider.h"
>  #include "openvswitch/dynamic-string.h"
> @@ -55,6 +56,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization
>                                         * of DPDK. */
>  static bool per_port_memory = false; /* Status of per port memory support */
>  
> +static struct id_pool *lcore_id_pool;
> +static struct ovs_mutex lcore_id_pool_mutex = OVS_MUTEX_INITIALIZER;
> +
>  static int
>  process_vhost_flags(char *flag, const char *default_val, int size,
>                      const struct smap *ovs_other_config,
> @@ -346,7 +350,8 @@ dpdk_init__(const struct smap *ovs_other_config)
>          }
>      }
>  
> -    if (args_contains(&args, "-c") || args_contains(&args, "-l")) {
> +    if (args_contains(&args, "-c") || args_contains(&args, "-l") ||
> +        args_contains(&args, "--lcores")) {
>          auto_determine = false;
>      }
>  
> @@ -372,8 +377,8 @@ dpdk_init__(const struct smap *ovs_other_config)
>               * thread affinity - default to core #0 */
>              VLOG_ERR("Thread getaffinity failed. Using core #0");
>          }
> -        svec_add(&args, "-l");
> -        svec_add_nocopy(&args, xasprintf("%d", cpu));
> +        svec_add(&args, "--lcores");
> +        svec_add_nocopy(&args, xasprintf("0@%d", cpu));
>      }
>  
>      svec_terminate(&args);
> @@ -429,6 +434,23 @@ dpdk_init__(const struct smap *ovs_other_config)
>          }
>      }
>  
> +    ovs_mutex_lock(&lcore_id_pool_mutex);
> +    lcore_id_pool = id_pool_create(0, RTE_MAX_LCORE);
> +    /* Empty the whole pool... */
> +    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
> +        uint32_t lcore_id;
> +
> +        id_pool_alloc_id(lcore_id_pool, &lcore_id);
> +    }
> +    /* ...and release the unused spots. */
> +    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
> +        if (rte_eal_lcore_role(lcore) != ROLE_OFF) {
> +            continue;
> +        }
> +        id_pool_free_id(lcore_id_pool, lcore);
> +    }
> +    ovs_mutex_unlock(&lcore_id_pool_mutex);
> +
>      /* We are called from the main thread here */
>      RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
>  
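
A side note on the two loops above: id_pool_alloc_id() always hands
out the lowest free id, so the patch drains the pool and then frees
the ids of the lcores EAL doesn't use.  Assuming id_pool_add() marks
a specific id as taken (worth double-checking in lib/id-pool.c), a
single loop could express the same thing:

    lcore_id_pool = id_pool_create(0, RTE_MAX_LCORE);
    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
        if (rte_eal_lcore_role(lcore) != ROLE_OFF) {
            /* Mark ids already used by EAL threads as taken. */
            id_pool_add(lcore_id_pool, lcore);
        }
    }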
> @@ -522,11 +544,48 @@ dpdk_available(void)
>  }
>  
>  void
> -dpdk_set_lcore_id(unsigned cpu)
> +dpdk_init_thread_context(unsigned cpu)
>  {
> +    cpu_set_t cpuset;
> +    unsigned lcore;
> +    int err;
> +
>      /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
>      ovs_assert(cpu != NON_PMD_CORE_ID);
> -    RTE_PER_LCORE(_lcore_id) = cpu;
> +
> +    ovs_mutex_lock(&lcore_id_pool_mutex);
> +    if (lcore_id_pool == NULL || !id_pool_alloc_id(lcore_id_pool, &lcore)) {
> +        lcore = NON_PMD_CORE_ID;

If a PMD thread gets NON_PMD_CORE_ID, it will smash the non-PMD
mempool caches, since it doesn't take 'non_pmd_mutex'.
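
One possible direction, as a sketch only (whether aborting is the
right policy is debatable, but silently reusing NON_PMD_CORE_ID is
not safe):

    ovs_mutex_lock(&lcore_id_pool_mutex);
    if (lcore_id_pool == NULL) {
        /* DPDK is not initialised; this thread will not touch DPDK
         * mempools, so NON_PMD_CORE_ID is fine here. */
        lcore = NON_PMD_CORE_ID;
    } else if (!id_pool_alloc_id(lcore_id_pool, &lcore)) {
        ovs_mutex_unlock(&lcore_id_pool_mutex);
        /* Pool exhausted: refuse to start rather than corrupt the
         * non-PMD mempool caches. */
        VLOG_ABORT("No free DPDK lcore id for pmd thread on core %u",
                   cpu);
    }
    ovs_mutex_unlock(&lcore_id_pool_mutex);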

> +    }
> +    ovs_mutex_unlock(&lcore_id_pool_mutex);
> +
> +    RTE_PER_LCORE(_lcore_id) = lcore;
> +
> +    /* DPDK is not initialised, nothing more to do. */
> +    if (lcore == NON_PMD_CORE_ID) {
> +        return;
> +    }
> +
> +    CPU_ZERO(&cpuset);
> +    err = pthread_getaffinity_np(pthread_self(), sizeof cpuset, &cpuset);
> +    if (err) {
> +        VLOG_ABORT("Thread getaffinity error: %s", ovs_strerror(err));
> +    }
> +
> +    rte_thread_set_affinity(&cpuset);
> +    VLOG_INFO("Initialised lcore %u for core %u", lcore, cpu);
> +}
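
While at it: rte_thread_set_affinity() returns an int and can fail,
but the call above ignores the result.  A defensive variant, mirroring
the pthread_getaffinity_np() check a few lines earlier:

    if (rte_thread_set_affinity(&cpuset) < 0) {
        VLOG_ABORT("Failed to set DPDK thread affinity");
    }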
> +
> +void
> +dpdk_uninit_thread_context(void)
> +{
> +    if (RTE_PER_LCORE(_lcore_id) == NON_PMD_CORE_ID) {
> +        return;
> +    }
> +
> +    ovs_mutex_lock(&lcore_id_pool_mutex);
> +    id_pool_free_id(lcore_id_pool, RTE_PER_LCORE(_lcore_id));
> +    ovs_mutex_unlock(&lcore_id_pool_mutex);
>  }
>  
>  void
> diff --git a/lib/dpdk.h b/lib/dpdk.h
> index 736a64279..404ac1a4b 100644
> --- a/lib/dpdk.h
> +++ b/lib/dpdk.h
> @@ -36,7 +36,8 @@ struct smap;
>  struct ovsrec_open_vswitch;
>  
>  void dpdk_init(const struct smap *ovs_other_config);
> -void dpdk_set_lcore_id(unsigned cpu);
> +void dpdk_init_thread_context(unsigned cpu);
> +void dpdk_uninit_thread_context(void);
>  const char *dpdk_get_vhost_sock_dir(void);
>  bool dpdk_vhost_iommu_enabled(void);
>  bool dpdk_vhost_postcopy_enabled(void);
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 5142bad1d..c40031a78 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -5472,7 +5472,7 @@ pmd_thread_main(void *f_)
>      /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
>      ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
>      ovs_numa_thread_setaffinity_core(pmd->core_id);
> -    dpdk_set_lcore_id(pmd->core_id);
> +    dpdk_init_thread_context(pmd->core_id);
>      poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
>      dfc_cache_init(&pmd->flow_cache);
>      pmd_alloc_static_tx_qid(pmd);
> @@ -5592,6 +5592,7 @@ reload:
>      dfc_cache_uninit(&pmd->flow_cache);
>      free(poll_list);
>      pmd_free_cached_ports(pmd);
> +    dpdk_uninit_thread_context();
>      return NULL;
>  }
>  
> 