Re: [ovs-dev] [PATCH v6 1/1] ovs-numa: Support non-contiguous numa nodes and offline CPU cores

2021-06-21 Thread Kevin Traynor
On 18/06/2021 00:43, David Wilder wrote:
> This change removes the assumption that numa nodes and cores are numbered
> contiguously in linux.  This change is required to support some Power
> systems.
> 
> A check has been added to verify that cores are online, since offline
> cores result in non-contiguously numbered cores.
> 
> DPDK EAL option generation is updated to work with non-contiguous numa nodes.
> These options can be seen in ovs-vswitchd.log.  For example:
> a system containing only numa nodes 0 and 8 will generate the following:
> 
> EAL ARGS: ovs-vswitchd --socket-mem 1024,0,0,0,0,0,0,0,1024 \
>  --socket-limit 1024,0,0,0,0,0,0,0,1024 -l 0
> 
> Tests for pmd and dpif-netdev have been updated to validate non-contiguous
> numbered nodes.
> 

Hi David,

Just a minor comment below. Aside from that it LGTM, but it's review-based,
as I don't have a system with non-contiguous numas to test it on.
I don't see a regression in generating default EAL memory parameters on
my system with 2 contiguous numas.

checkpatch ok, unit tests ok, github actions ok.

Kevin.

> Signed-off-by: David Wilder 
> ---
>  lib/dpdk.c   | 57 +++--
>  lib/ovs-numa.c   | 51 
>  lib/ovs-numa.h   |  2 ++
>  tests/dpif-netdev.at |  2 +-
>  tests/pmd.at | 61 
>  5 files changed, 142 insertions(+), 31 deletions(-)
> 
> diff --git a/lib/dpdk.c b/lib/dpdk.c
> index 319540394..7f6f1d164 100644
> --- a/lib/dpdk.c
> +++ b/lib/dpdk.c
> @@ -129,22 +129,63 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
>  }
>  }
>  
> +static int
> +compare_numa_node_list(const void *a_, const void *b_)
> +{
> +size_t a = *(const size_t *) a_;
> +size_t b = *(const size_t *) b_;
> +

These can be ints (see comment below)
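
For illustration, an int-based comparator could look like this (a sketch
only; same qsort() callback shape as the patch's version above):

    static int
    compare_numa_node_list(const void *a_, const void *b_)
    {
        int a = *(const int *) a_;
        int b = *(const int *) b_;

        return a < b ? -1 : a > b;
    }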

> +if (a < b) {
> +return -1;
> +}
> +if (a > b) {
> +return 1;
> +}
> +return 0;
> +}
> +
>  static char *
>  construct_dpdk_socket_mem(void)
>  {
>  const char *def_value = "1024";
> -int numa, numa_nodes = ovs_numa_get_n_numas();
> +struct ovs_numa_dump *dump;
> +const struct ovs_numa_info_numa *node;
> +size_t k = 0, last_node = 0, n_numa_nodes, *numa_node_list;

numa_node_list[] will be assigned numa->numa_id, which is type int, so
it can be:
int *numa_node_list;

>  struct ds dpdk_socket_mem = DS_EMPTY_INITIALIZER;
>  
> -if (numa_nodes == 0 || numa_nodes == OVS_NUMA_UNSPEC) {
> -numa_nodes = 1;
> -}
> +/* Build a list of all numa nodes with at least one core */
> +dump = ovs_numa_dump_n_cores_per_numa(1);
> +n_numa_nodes = hmap_count(&dump->numas);
> +numa_node_list = xcalloc(n_numa_nodes, sizeof numa_node_list);
>  

It should dereference numa_node_list to get the element type:
numa_node_list = xcalloc(n_numa_nodes, sizeof *numa_node_list);
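
To spell out the difference (a sketch, assuming the int * declaration
suggested above): sizeof numa_node_list is just the size of a pointer,
while sizeof *numa_node_list is sizeof(int), the per-element size that
xcalloc() needs:

    int *numa_node_list;

    /* Allocates n_numa_nodes elements of sizeof(int) each, not
     * n_numa_nodes pointer-sized elements. */
    numa_node_list = xcalloc(n_numa_nodes, sizeof *numa_node_list);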

> -ds_put_cstr(&dpdk_socket_mem, def_value);
> -for (numa = 1; numa < numa_nodes; ++numa) {
> -ds_put_format(&dpdk_socket_mem, ",%s", def_value);
> +FOR_EACH_NUMA_ON_DUMP(node, dump) {
> +if (k >= n_numa_nodes) {
> +break;
> +}
> +numa_node_list[k++] = node->numa_id;
>  }
> -
> +qsort(numa_node_list, k, sizeof numa_node_list, compare_numa_node_list);
> +
> +for (size_t i = 0; i < n_numa_nodes; i++) {
> +while (numa_node_list[i] > last_node &&
> +   numa_node_list[i] != OVS_NUMA_UNSPEC &&
> +   numa_node_list[i] <= MAX_NUMA_NODES){
> +if (last_node == 0) {
> +ds_put_format(&dpdk_socket_mem, "%s", "0");
> +} else {
> +ds_put_format(&dpdk_socket_mem, ",%s", "0");
> +}
> +last_node++;
> +}
> +if (numa_node_list[i] == 0) {
> +ds_put_format(&dpdk_socket_mem, "%s", def_value);
> +} else {
> +ds_put_format(&dpdk_socket_mem, ",%s", def_value);
> +}
> +last_node++;
> +}
> +free(numa_node_list);
> +ovs_numa_dump_destroy(dump);
>  return ds_cstr(&dpdk_socket_mem);
>  }
>  
> diff --git a/lib/ovs-numa.c b/lib/ovs-numa.c
> index 6d0a68522..b825ecbdd 100644
> --- a/lib/ovs-numa.c
> +++ b/lib/ovs-numa.c
> @@ -42,21 +42,22 @@ VLOG_DEFINE_THIS_MODULE(ovs_numa);
>   * This module stores the affinity information of numa nodes and cpu cores.
>   * It also provides functions to bookkeep the pin of threads on cpu cores.
>   *
> - * It is assumed that the numa node ids and cpu core ids all start from 0 and
> - * range continuously.  So, for example, if 'ovs_numa_get_n_cores()' returns N,
> - * user can assume core ids from 0 to N-1 are all valid and there is a
> - * 'struct cpu_core' for each id.
> + * It is assumed that the numa node ids and cpu core ids all start from 0.
> + * There is no guarantee that node and cpu ids are numbered consecutively
> + * (this is a change from earlier version of the code). So, for example,
> + * if two nodes exist with ids 0 and 8, 'ovs_numa_get_n_nodes()' will
> + * return 2, no assumption of node numbering should be made.

[ovs-dev] [PATCH v6 1/1] ovs-numa: Support non-contiguous numa nodes and offline CPU cores

2021-06-17 Thread David Wilder
This change removes the assumption that numa nodes and cores are numbered
contiguously in linux.  This change is required to support some Power
systems.

A check has been added to verify that cores are online, since offline
cores result in non-contiguously numbered cores.
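
For reference, one way such an online check can be done on Linux is to
parse the range list in /sys/devices/system/cpu/online (e.g. "0-3,8-11").
The sketch below is illustrative only and is not necessarily how this
patch implements the check:

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative sketch, not the patch's code: returns true if 'core'
     * appears in the kernel's online-CPU range list. */
    static bool
    core_is_online(int core)
    {
        FILE *f = fopen("/sys/devices/system/cpu/online", "r");
        bool online = false;
        int lo, hi;

        if (!f) {
            return false;
        }
        while (fscanf(f, "%d", &lo) == 1) {
            hi = lo;
            if (fgetc(f) == '-') {
                if (fscanf(f, "%d", &hi) != 1) {
                    break;
                }
                fgetc(f);   /* Consume the separator after a range. */
            }
            if (core >= lo && core <= hi) {
                online = true;
                break;
            }
        }
        fclose(f);
        return online;
    }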

DPDK EAL option generation is updated to work with non-contiguous numa nodes.
These options can be seen in ovs-vswitchd.log.  For example:
a system containing only numa nodes 0 and 8 will generate the following:

EAL ARGS: ovs-vswitchd --socket-mem 1024,0,0,0,0,0,0,0,1024 \
 --socket-limit 1024,0,0,0,0,0,0,0,1024 -l 0
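
To make the gap filling explicit, the idea can be sketched standalone as
follows (a simplified illustration, not the code in the patch itself):
emit the default size for every numa node that has cores, and "0" for
every missing socket id in between.

    #include <stdio.h>
    #include <string.h>

    /* Illustration only: build a --socket-mem style string for a sorted
     * list of numa node ids, padding gaps with "0". */
    static void
    sketch_socket_mem(const int *nodes, size_t n, char *buf, size_t len)
    {
        int next = 0;   /* Next socket index expected. */

        buf[0] = '\0';
        for (size_t i = 0; i < n; i++) {
            for (; next < nodes[i]; next++) {
                strncat(buf, next ? ",0" : "0", len - strlen(buf) - 1);
            }
            strncat(buf, next ? ",1024" : "1024", len - strlen(buf) - 1);
            next++;
        }
    }

    int
    main(void)
    {
        int nodes[] = { 0, 8 };
        char buf[128];

        sketch_socket_mem(nodes, 2, buf, sizeof buf);
        printf("%s\n", buf);   /* Prints: 1024,0,0,0,0,0,0,0,1024 */
        return 0;
    }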

Tests for pmd and dpif-netdev have been updated to validate non-contiguous
numbered nodes.

Signed-off-by: David Wilder 
---
 lib/dpdk.c   | 57 +++--
 lib/ovs-numa.c   | 51 
 lib/ovs-numa.h   |  2 ++
 tests/dpif-netdev.at |  2 +-
 tests/pmd.at | 61 
 5 files changed, 142 insertions(+), 31 deletions(-)

diff --git a/lib/dpdk.c b/lib/dpdk.c
index 319540394..7f6f1d164 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -129,22 +129,63 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
 }
 }
 
+static int
+compare_numa_node_list(const void *a_, const void *b_)
+{
+size_t a = *(const size_t *) a_;
+size_t b = *(const size_t *) b_;
+
+if (a < b) {
+return -1;
+}
+if (a > b) {
+return 1;
+}
+return 0;
+}
+
 static char *
 construct_dpdk_socket_mem(void)
 {
 const char *def_value = "1024";
-int numa, numa_nodes = ovs_numa_get_n_numas();
+struct ovs_numa_dump *dump;
+const struct ovs_numa_info_numa *node;
+size_t k = 0, last_node = 0, n_numa_nodes, *numa_node_list;
 struct ds dpdk_socket_mem = DS_EMPTY_INITIALIZER;
 
-if (numa_nodes == 0 || numa_nodes == OVS_NUMA_UNSPEC) {
-numa_nodes = 1;
-}
+/* Build a list of all numa nodes with at least one core */
+dump = ovs_numa_dump_n_cores_per_numa(1);
+n_numa_nodes = hmap_count(&dump->numas);
+numa_node_list = xcalloc(n_numa_nodes, sizeof numa_node_list);
 
-ds_put_cstr(&dpdk_socket_mem, def_value);
-for (numa = 1; numa < numa_nodes; ++numa) {
-ds_put_format(&dpdk_socket_mem, ",%s", def_value);
+FOR_EACH_NUMA_ON_DUMP(node, dump) {
+if (k >= n_numa_nodes) {
+break;
+}
+numa_node_list[k++] = node->numa_id;
 }
-
+qsort(numa_node_list, k, sizeof numa_node_list, compare_numa_node_list);
+
+for (size_t i = 0; i < n_numa_nodes; i++) {
+while (numa_node_list[i] > last_node &&
+   numa_node_list[i] != OVS_NUMA_UNSPEC &&
+   numa_node_list[i] <= MAX_NUMA_NODES){
+if (last_node == 0) {
+ds_put_format(&dpdk_socket_mem, "%s", "0");
+} else {
+ds_put_format(&dpdk_socket_mem, ",%s", "0");
+}
+last_node++;
+}
+if (numa_node_list[i] == 0) {
+ds_put_format(&dpdk_socket_mem, "%s", def_value);
+} else {
+ds_put_format(&dpdk_socket_mem, ",%s", def_value);
+}
+last_node++;
+}
+free(numa_node_list);
+ovs_numa_dump_destroy(dump);
 return ds_cstr(&dpdk_socket_mem);
 }
 
diff --git a/lib/ovs-numa.c b/lib/ovs-numa.c
index 6d0a68522..b825ecbdd 100644
--- a/lib/ovs-numa.c
+++ b/lib/ovs-numa.c
@@ -42,21 +42,22 @@ VLOG_DEFINE_THIS_MODULE(ovs_numa);
  * This module stores the affinity information of numa nodes and cpu cores.
  * It also provides functions to bookkeep the pin of threads on cpu cores.
  *
- * It is assumed that the numa node ids and cpu core ids all start from 0 and
- * range continuously.  So, for example, if 'ovs_numa_get_n_cores()' returns N,
- * user can assume core ids from 0 to N-1 are all valid and there is a
- * 'struct cpu_core' for each id.
+ * It is assumed that the numa node ids and cpu core ids all start from 0.
+ * There is no guarantee that node and cpu ids are numbered consecutively
+ * (this is a change from earlier version of the code). So, for example,
+ * if two nodes exist with ids 0 and 8, 'ovs_numa_get_n_nodes()' will
+ * return 2, no assumption of node numbering should be made.
  *
  * NOTE, this module should only be used by the main thread.
  *
- * NOTE, the assumption above will fail when cpu hotplug is used.  In that
- * case ovs-numa will not function correctly.  For now, add a TODO entry
- * for addressing it in the future.
+ * NOTE, if cpu hotplug is used 'all_numa_nodes' and 'all_cpu_cores' must be
 + * invalidated whenever the system topology changes. Support for detecting
+ * topology changes has not been included. For now, add a TODO entry for
+ * addressing it in the future.
  *
  * TODO: Fix ovs-numa when cpu hotplug is used.
  */
 
-#define MAX_NUMA_NODES 128
 
 /* numa node. */
 struct numa_node {
@@ -130,15 +131,14 @@ insert_new_cpu_core(struct numa_node *n,
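
A usage note on the changed assumption above: with non-contiguous ids,
callers should enumerate the nodes that actually exist instead of
indexing 0..N-1. A minimal sketch using the same dump API the patch
already uses in lib/dpdk.c (illustrative only, error handling omitted):

    #include <stdio.h>
    #include "ovs-numa.h"

    /* Illustration: walk the numa nodes that have at least one core. */
    static void
    print_numa_nodes(void)
    {
        struct ovs_numa_dump *dump = ovs_numa_dump_n_cores_per_numa(1);
        const struct ovs_numa_info_numa *node;

        FOR_EACH_NUMA_ON_DUMP (node, dump) {
            printf("numa node %d has at least one core\n", node->numa_id);
        }
        ovs_numa_dump_destroy(dump);
    }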