Re: [PATCH v5 1/2] selftests/resctrl: Adjust effective L3 cache size with SNC enabled

2024-11-05 Thread Reinette Chatre
Hi Maciej,

On 10/29/24 6:00 AM, Maciej Wieczor-Retman wrote:
> diff --git a/tools/testing/selftests/resctrl/resctrl.h 
> b/tools/testing/selftests/resctrl/resctrl.h
> index 2dda56084588..851b37c9c38a 100644
> --- a/tools/testing/selftests/resctrl/resctrl.h
> +++ b/tools/testing/selftests/resctrl/resctrl.h
> @@ -11,6 +11,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -43,6 +44,8 @@
>  
>  #define DEFAULT_SPAN (250 * MB)
>  
> +#define MAX_SNC  4
> +

fyi, it seems that 6 is the new max:
https://lore.kernel.org/all/20241031220213.17991-1-tony.l...@intel.com/

Reinette




Re: [PATCH v5 1/2] selftests/resctrl: Adjust effective L3 cache size with SNC enabled

2024-11-05 Thread Reinette Chatre
Hi Maciej,

On 10/29/24 6:00 AM, Maciej Wieczor-Retman wrote:
> Sub-NUMA Cluster divides CPUs sharing an L3 cache into separate NUMA
> nodes. Systems may support splitting into either two, three or four
> nodes.
> 
> When SNC mode is enabled the effective amount of L3 cache available
> for allocation is divided by the number of nodes per L3.
> 
> Detect which SNC mode is active by comparing the number of CPUs
> that share a cache with CPU0, with the number of CPUs on node0.
> 
> Co-developed-by: Tony Luck 
> Signed-off-by: Tony Luck 
> Signed-off-by: Maciej Wieczor-Retman 
> ---

...

> diff --git a/tools/testing/selftests/resctrl/resctrl.h 
> b/tools/testing/selftests/resctrl/resctrl.h
> index 2dda56084588..851b37c9c38a 100644
> --- a/tools/testing/selftests/resctrl/resctrl.h
> +++ b/tools/testing/selftests/resctrl/resctrl.h
> @@ -11,6 +11,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -43,6 +44,8 @@
>  
>  #define DEFAULT_SPAN (250 * MB)
>  
> +#define MAX_SNC  4
> +
>  /*
>   * user_params:  User supplied parameters
>   * @cpu: CPU number to which the benchmark will be bound to
> @@ -120,6 +123,7 @@ extern volatile int *value_sink;
>  
>  extern char llc_occup_path[1024];
>  
> +int snc_nodes_per_l3_cache(void);
>  int get_vendor(void);
>  bool check_resctrlfs_support(void);
>  int filter_dmesg(void);
> diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c 
> b/tools/testing/selftests/resctrl/resctrl_tests.c
> index ecbb7605a981..4b84d6199a36 100644
> --- a/tools/testing/selftests/resctrl/resctrl_tests.c
> +++ b/tools/testing/selftests/resctrl/resctrl_tests.c
> @@ -118,11 +118,17 @@ static bool test_vendor_specific_check(const struct 
> resctrl_test *test)
>  
>  static void run_single_test(const struct resctrl_test *test, const struct 
> user_params *uparams)
>  {
> - int ret;
> + int ret, snc_mode;
>  
>   if (test->disabled)
>   return;
>  
> + snc_mode = snc_nodes_per_l3_cache();
> + if (snc_mode > 1)
> + ksft_print_msg("SNC-%d mode discovered.\n", snc_mode);
> + else if (snc_unreliable)
> + ksft_print_msg("SNC detection unreliable due to offline CPUs. 
> Test results may not be accurate if SNC enabled.\n");

As I understand none of the tests can be considered reliable if SNC detection 
is unreliable
so skipping the test can be done here in a central spot without duplicating it 
in each test.

> +
>   if (!test_vendor_specific_check(test)) {
>   ksft_test_result_skip("Hardware does not support %s\n", 
> test->name);
>   return;
> diff --git a/tools/testing/selftests/resctrl/resctrlfs.c 
> b/tools/testing/selftests/resctrl/resctrlfs.c
> index 250c320349a7..d6384f404d95 100644
> --- a/tools/testing/selftests/resctrl/resctrlfs.c
> +++ b/tools/testing/selftests/resctrl/resctrlfs.c
> @@ -13,6 +13,8 @@
>  
>  #include "resctrl.h"
>  
> +int snc_unreliable;
> +
>  static int find_resctrl_mount(char *buffer)
>  {
>   FILE *mounts;
> @@ -156,6 +158,93 @@ int get_domain_id(const char *resource, int cpu_no, int 
> *domain_id)
>   return 0;
>  }
>  
> +/*
> + * Count number of CPUs in a /sys bitmap
> + */
> +static unsigned int count_sys_bitmap_bits(char *name)
> +{
> + FILE *fp = fopen(name, "r");
> + int count = 0, c;
> +
> + if (!fp)
> + return 0;
> +
> + while ((c = fgetc(fp)) != EOF) {
> + if (!isxdigit(c))
> + continue;
> + switch (c) {
> + case 'f':
> + count++;
> + case '7': case 'b': case 'd': case 'e':
> + count++;
> + case '3': case '5': case '6': case '9': case 'a': case 'c':
> + count++;
> + case '1': case '2': case '4': case '8':
> + count++;
> + }
> + }
> + fclose(fp);
> +
> + return count;
> +}
> +
> +static bool cpus_offline_empty(void)
> +{
> + char offline_cpus_str[64];
> + FILE *fp;
> +
> + fp = fopen("/sys/devices/system/cpu/offline", "r");
> + if (fscanf(fp, "%s", offline_cpus_str) < 0) {
> + if (!errno) {
> + fclose(fp);
> + return 1;
> + }
> + ksft_perror("Could not read /sys/devices/system/cpu/offline");
> + }
> +
> + fclose(fp);
> +
> + return 0;
> +}
> +
> +/*
> + * Detect SNC by comparing #CPUs in node0 with #CPUs sharing LLC with CPU0.
> + * If some CPUs are offline the numbers may not be exact multiples of each
> + * other. Any offline CPUs on node0 will be also gone from shared_cpu_map of
> + * CPU0 but offline CPUs from other nodes will only make the cache_cpus value
> + * lower. Still try to get the ratio right by preventing the second 
> possibility.

Similar to v4 this "try to get the ratio right" is unnecessary because of the 
explicit
check for offline CPU