This change removes the assumption that numa nodes and cores are numbered contiguously in Linux. It is required to support some Power systems.
A check has been added to verify that cores are online; offline cores result in non-contiguously numbered cores. DPDK EAL option generation is updated to work with non-contiguous numa nodes. These options can be seen in the ovs-vswitchd.log. For example: a system containing only numa nodes 0 and 8 will generate the following: EAL ARGS: ovs-vswitchd --socket-mem 1024,0,0,0,0,0,0,0,1024 \ --socket-limit 1024,0,0,0,0,0,0,0,1024 -l 0 Tests for pmd and dpif-netdev have been updated to validate non-contiguously numbered nodes. Signed-off-by: David Wilder <dwil...@us.ibm.com> --- lib/dpdk.c | 57 +++++++++++++++++++++++++++++++++++------ lib/ovs-numa.c | 51 ++++++++++++++++++++++++------------ lib/ovs-numa.h | 2 ++ tests/dpif-netdev.at | 2 +- tests/pmd.at | 61 ++++++++++++++++++++++++++++++++++++++++---- 5 files changed, 142 insertions(+), 31 deletions(-) diff --git a/lib/dpdk.c b/lib/dpdk.c index 319540394..7f6f1d164 100644 --- a/lib/dpdk.c +++ b/lib/dpdk.c @@ -129,22 +129,63 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args) } } +static int +compare_numa_node_list(const void *a_, const void *b_) +{ + size_t a = *(const size_t *) a_; + size_t b = *(const size_t *) b_; + + if (a < b) { + return -1; + } + if (a > b) { + return 1; + } + return 0; +} + static char * construct_dpdk_socket_mem(void) { const char *def_value = "1024"; - int numa, numa_nodes = ovs_numa_get_n_numas(); + struct ovs_numa_dump *dump; + const struct ovs_numa_info_numa *node; + size_t k = 0, last_node = 0, n_numa_nodes, *numa_node_list; struct ds dpdk_socket_mem = DS_EMPTY_INITIALIZER; - if (numa_nodes == 0 || numa_nodes == OVS_NUMA_UNSPEC) { - numa_nodes = 1; - } + /* Build a list of all numa nodes with at least one core */ + dump = ovs_numa_dump_n_cores_per_numa(1); + n_numa_nodes = hmap_count(&dump->numas); + numa_node_list = xcalloc(n_numa_nodes, sizeof *numa_node_list); - ds_put_cstr(&dpdk_socket_mem, def_value); - for (numa = 1; numa < numa_nodes; ++numa) { - 
ds_put_format(&dpdk_socket_mem, ",%s", def_value); + FOR_EACH_NUMA_ON_DUMP(node, dump) { + if (k >= n_numa_nodes) { + break; + } + numa_node_list[k++] = node->numa_id; } - + qsort(numa_node_list, k, sizeof *numa_node_list, compare_numa_node_list); + + for (size_t i = 0; i < n_numa_nodes; i++) { + while (numa_node_list[i] > last_node && + numa_node_list[i] != OVS_NUMA_UNSPEC && + numa_node_list[i] <= MAX_NUMA_NODES) { + if (last_node == 0) { + ds_put_format(&dpdk_socket_mem, "%s", "0"); + } else { + ds_put_format(&dpdk_socket_mem, ",%s", "0"); + } + last_node++; + } + if (numa_node_list[i] == 0) { + ds_put_format(&dpdk_socket_mem, "%s", def_value); + } else { + ds_put_format(&dpdk_socket_mem, ",%s", def_value); + } + last_node++; + } + free(numa_node_list); + ovs_numa_dump_destroy(dump); return ds_cstr(&dpdk_socket_mem); } diff --git a/lib/ovs-numa.c b/lib/ovs-numa.c index 6d0a68522..b825ecbdd 100644 --- a/lib/ovs-numa.c +++ b/lib/ovs-numa.c @@ -42,21 +42,22 @@ VLOG_DEFINE_THIS_MODULE(ovs_numa); * This module stores the affinity information of numa nodes and cpu cores. * It also provides functions to bookkeep the pin of threads on cpu cores. * - * It is assumed that the numa node ids and cpu core ids all start from 0 and - * range continuously. So, for example, if 'ovs_numa_get_n_cores()' returns N, - * user can assume core ids from 0 to N-1 are all valid and there is a - * 'struct cpu_core' for each id. + * It is assumed that the numa node ids and cpu core ids all start from 0. + * There is no guarantee that node and cpu ids are numbered consecutively + * (this is a change from earlier versions of the code). So, for example, + * if two nodes exist with ids 0 and 8, 'ovs_numa_get_n_nodes()' will + * return 2; no assumption of node numbering should be made. * * NOTE, this module should only be used by the main thread. * - * NOTE, the assumption above will fail when cpu hotplug is used. In that - * case ovs-numa will not function correctly. 
For now, add a TODO entry - * for addressing it in the future. + * NOTE, if cpu hotplug is used 'all_numa_nodes' and 'all_cpu_cores' must be + * invalidated whenever the system topology changes. Support for detecting + * topology changes has not been included. For now, add a TODO entry for + * addressing it in the future. * * TODO: Fix ovs-numa when cpu hotplug is used. */ -#define MAX_NUMA_NODES 128 /* numa node. */ struct numa_node { @@ -130,15 +131,14 @@ insert_new_cpu_core(struct numa_node *n, unsigned core_id) * - "0,0,0,0": four cores on numa socket 0. * - "0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1": 16 cores on two numa sockets. * - "0,0,0,0,1,1,1,1": 8 cores on two numa sockets. - * - * The different numa ids must be consecutives or the function will abort. */ + * - "0,0,0,0,8,8,8,8": 8 cores on two numa sockets, non-contiguous. + */ static void discover_numa_and_core_dummy(void) { char *conf = xstrdup(dummy_config); char *id, *saveptr = NULL; unsigned i = 0; - long max_numa_id = 0; for (id = strtok_r(conf, ",", &saveptr); id; id = strtok_r(NULL, ",", &saveptr)) { @@ -152,8 +152,6 @@ discover_numa_and_core_dummy(void) continue; } - max_numa_id = MAX(max_numa_id, numa_id); - hnode = hmap_first_with_hash(&all_numa_nodes, hash_int(numa_id, 0)); if (hnode) { @@ -169,10 +167,27 @@ discover_numa_and_core_dummy(void) free(conf); - if (max_numa_id + 1 != hmap_count(&all_numa_nodes)) { - ovs_fatal(0, "dummy numa contains non consecutive numa ids"); +} + +#ifdef __linux__ +/* Check if a cpu is detected and online */ +static int +cpu_detected(unsigned int core_id) +{ + char path[PATH_MAX]; + int len = snprintf(path, sizeof(path), + "/sys/devices/system/cpu/cpu%u/topology/core_id", + core_id); + if (len <= 0 || (unsigned) len >= sizeof(path)) { + return 0; + } + if (access(path, F_OK) != 0) { + return 0; } + + return 1; } +#endif /* __linux__ */ /* Discovers all numa nodes and the corresponding cpu cores. * Constructs the 'struct numa_node' and 'struct cpu_core'. 
*/ @@ -219,7 +234,9 @@ discover_numa_and_core(void) unsigned core_id; core_id = strtoul(subdir->d_name + 3, NULL, 10); - insert_new_cpu_core(n, core_id); + if (cpu_detected(core_id)) { + insert_new_cpu_core(n, core_id); + } } } closedir(dir); @@ -229,7 +246,7 @@ discover_numa_and_core(void) } free(path); - if (!dir || !numa_supported) { + if (!numa_supported) { break; } } diff --git a/lib/ovs-numa.h b/lib/ovs-numa.h index 8f2ea3430..ecc251a7f 100644 --- a/lib/ovs-numa.h +++ b/lib/ovs-numa.h @@ -26,6 +26,8 @@ #define OVS_CORE_UNSPEC INT_MAX #define OVS_NUMA_UNSPEC INT_MAX +#define MAX_NUMA_NODES 128 + /* Dump of a list of 'struct ovs_numa_info'. */ struct ovs_numa_dump { struct hmap cores; diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at index 57cae383f..f076dd3c3 100644 --- a/tests/dpif-netdev.at +++ b/tests/dpif-netdev.at @@ -98,7 +98,7 @@ m4_define([DPIF_NETDEV_DUMMY_IFACE], fail-mode=secure -- \ add-port br1 p2 -- set interface p2 type=$1 options:stream=unix:$OVS_RUNDIR/p0.sock ofport_request=2 -- \ add-port br1 p8 -- set interface p8 ofport_request=8 type=$1 --], [], [], - [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])]) + [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,8,8,8,8"], [])]) AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg]) AT_CHECK([ovs-ofctl add-flow br0 action=normal]) diff --git a/tests/pmd.at b/tests/pmd.at index cc5371d5a..faf02f158 100644 --- a/tests/pmd.at +++ b/tests/pmd.at @@ -361,8 +361,8 @@ AT_SETUP([PMD - change numa node]) OVS_VSWITCHD_START( [add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=2 -- \ add-port br0 p2 -- set Interface p2 type=dummy-pmd ofport_request=2 options:n_rxq=2 -- \ - set Open_vSwitch . other_config:pmd-cpu-mask=3 -], [], [], [--dummy-numa 0,1]) + set Open_vSwitch . 
other_config:pmd-cpu-mask=7 +], [], [], [--dummy-numa 0,1,8]) AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg]) AT_CHECK([ovs-ofctl add-flow br0 action=controller]) @@ -432,6 +432,40 @@ NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=2 (via action) data_l icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc ]) +AT_CHECK([ovs-vsctl set Interface p1 options:numa_id=8]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl +p1 0 8 2 +p1 1 8 2 +p2 0 1 1 +p2 1 1 1 +]) + +AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log]) + +AT_CHECK([ovs-appctl netdev-dummy/receive p1 --qid 1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + +OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2]) +OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit]) + +AT_CHECK([cat ofctl_monitor.log], [0], [dnl +NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=1 (via action) data_len=106 (unbuffered) +icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc +]) + +AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log]) + +AT_CHECK([ovs-appctl netdev-dummy/receive p2 --qid 0 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + +OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2]) +OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit]) + +AT_CHECK([cat ofctl_monitor.log], [0], [dnl +NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=2 (via action) data_len=106 (unbuffered) 
+icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc +]) + + OVS_VSWITCHD_STOP AT_CLEANUP @@ -584,7 +618,7 @@ AT_CLEANUP AT_SETUP([PMD - rxq affinity - NUMA]) OVS_VSWITCHD_START( - [], [], [], [--dummy-numa 0,0,0,1,1]) + [], [], [], [--dummy-numa 0,0,0,1,1,8,8]) AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg]) AT_CHECK([ovs-ofctl add-flow br0 actions=controller]) @@ -601,21 +635,38 @@ p1 1 0 2 AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:4"]) -dnl We moved the queues to different numa node. Expecting threads on +dnl We moved the queues to different contiguous numa node. Expecting threads on dnl NUMA node 1 to be created. AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl p1 0 1 3 p1 1 1 4 ]) +AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:5,1:6"]) + +dnl We moved the queues to different non-contiguous numa node. Expecting threads on +dnl NUMA node 8 to be created. +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl +p1 0 8 5 +p1 1 8 6 +]) + AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:1"]) -dnl Queues splitted between NUMA nodes. +dnl Queues splitted between contiguous NUMA nodes. AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl p1 0 1 3 p1 1 0 1 ]) +AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:5,1:1"]) + +dnl Queues splitted between non-contiguous NUMA nodes. +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl +p1 0 8 5 +p1 1 0 1 +]) + AT_CHECK([ovs-vsctl remove Interface p1 other_config pmd-rxq-affinity]) dnl We removed the rxq-affinity request. dpif-netdev should assign queues -- 2.27.0 _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev