Hi Maciej,
On 10/29/24 6:00 AM, Maciej Wieczor-Retman wrote:
> Sub-NUMA Cluster divides CPUs sharing an L3 cache into separate NUMA
> nodes. Systems may support splitting into either two, three or four
> nodes.
>
> When SNC mode is enabled the effective amount of L3 cache available
> for allocation is divided by the number of nodes per L3.
>
> Detect which SNC mode is active by comparing the number of CPUs
> that share a cache with CPU0, with the number of CPUs on node0.
>
> Co-developed-by: Tony Luck
> Signed-off-by: Tony Luck
> Signed-off-by: Maciej Wieczor-Retman
> ---
...
> diff --git a/tools/testing/selftests/resctrl/resctrl.h
> b/tools/testing/selftests/resctrl/resctrl.h
> index 2dda56084588..851b37c9c38a 100644
> --- a/tools/testing/selftests/resctrl/resctrl.h
> +++ b/tools/testing/selftests/resctrl/resctrl.h
> @@ -11,6 +11,7 @@
> #include
> #include
> #include
> +#include
> #include
> #include
> #include
> @@ -43,6 +44,8 @@
>
> #define DEFAULT_SPAN (250 * MB)
>
> +#define MAX_SNC 4
> +
> /*
> * user_params: User supplied parameters
> * @cpu: CPU number to which the benchmark will be bound to
> @@ -120,6 +123,7 @@ extern volatile int *value_sink;
>
> extern char llc_occup_path[1024];
>
> +int snc_nodes_per_l3_cache(void);
> int get_vendor(void);
> bool check_resctrlfs_support(void);
> int filter_dmesg(void);
> diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c
> b/tools/testing/selftests/resctrl/resctrl_tests.c
> index ecbb7605a981..4b84d6199a36 100644
> --- a/tools/testing/selftests/resctrl/resctrl_tests.c
> +++ b/tools/testing/selftests/resctrl/resctrl_tests.c
> @@ -118,11 +118,17 @@ static bool test_vendor_specific_check(const struct
> resctrl_test *test)
>
> static void run_single_test(const struct resctrl_test *test, const struct
> user_params *uparams)
> {
> - int ret;
> + int ret, snc_mode;
>
> if (test->disabled)
> return;
>
> + snc_mode = snc_nodes_per_l3_cache();
> + if (snc_mode > 1)
> + ksft_print_msg("SNC-%d mode discovered.\n", snc_mode);
> + else if (snc_unreliable)
> + ksft_print_msg("SNC detection unreliable due to offline CPUs.
> Test results may not be accurate if SNC enabled.\n");
As I understand none of the tests can be considered reliable if SNC detection
is unreliable
so skipping the test can be done here in a central spot without duplicating it
in each test.
> +
> if (!test_vendor_specific_check(test)) {
> ksft_test_result_skip("Hardware does not support %s\n",
> test->name);
> return;
> diff --git a/tools/testing/selftests/resctrl/resctrlfs.c
> b/tools/testing/selftests/resctrl/resctrlfs.c
> index 250c320349a7..d6384f404d95 100644
> --- a/tools/testing/selftests/resctrl/resctrlfs.c
> +++ b/tools/testing/selftests/resctrl/resctrlfs.c
> @@ -13,6 +13,8 @@
>
> #include "resctrl.h"
>
> +int snc_unreliable;
> +
> static int find_resctrl_mount(char *buffer)
> {
> FILE *mounts;
> @@ -156,6 +158,93 @@ int get_domain_id(const char *resource, int cpu_no, int
> *domain_id)
> return 0;
> }
>
> +/*
> + * Count number of CPUs in a /sys bitmap
> + */
> +static unsigned int count_sys_bitmap_bits(char *name)
> +{
> + FILE *fp = fopen(name, "r");
> + int count = 0, c;
> +
> + if (!fp)
> + return 0;
> +
> + while ((c = fgetc(fp)) != EOF) {
> + if (!isxdigit(c))
> + continue;
> + switch (c) {
> + case 'f':
> + count++;
> + case '7': case 'b': case 'd': case 'e':
> + count++;
> + case '3': case '5': case '6': case '9': case 'a': case 'c':
> + count++;
> + case '1': case '2': case '4': case '8':
> + count++;
> + }
> + }
> + fclose(fp);
> +
> + return count;
> +}
> +
> +static bool cpus_offline_empty(void)
> +{
> + char offline_cpus_str[64];
> + FILE *fp;
> +
> + fp = fopen("/sys/devices/system/cpu/offline", "r");
> + if (fscanf(fp, "%s", offline_cpus_str) < 0) {
> + if (!errno) {
> + fclose(fp);
> + return 1;
> + }
> + ksft_perror("Could not read /sys/devices/system/cpu/offline");
> + }
> +
> + fclose(fp);
> +
> + return 0;
> +}
> +
> +/*
> + * Detect SNC by comparing #CPUs in node0 with #CPUs sharing LLC with CPU0.
> + * If some CPUs are offline the numbers may not be exact multiples of each
> + * other. Any offline CPUs on node0 will be also gone from shared_cpu_map of
> + * CPU0 but offline CPUs from other nodes will only make the cache_cpus value
> + * lower. Still try to get the ratio right by preventing the second
> possibility.
Similar to v4 this "try to get the ratio right" is unnecessary because of the
explicit
check for offline CPU