[PATCH v4] tools/power turbostat: Fix RAPL summary collection on AMD processors

2021-03-30 Thread Terry Bowman
Turbostat fails to correctly collect and display RAPL summary information
on Family 17h and 19h AMD processors. Running turbostat on these processors
returns immediately. When turbostat is working correctly, RAPL summary
data is displayed until the user-provided command completes. If a command
is not provided by the user, turbostat is designed to continuously
display RAPL information until interrupted.

The issue is due to offset_to_idx() and idx_to_offset() missing support for
AMD MSR addresses/offsets. offset_to_idx()'s switch statement is missing
cases for AMD MSRs and idx_to_offset() does not include a path to return
AMD MSR(s) for any idx.

The solution is to add AMD MSR support to offset_to_idx() and idx_to_offset().
These functions are split out and renamed along vendor lines to support
both AMD and Intel MSRs.
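
For readers skimming the diff below, here is a compilable toy sketch (not
turbostat source) of the per-vendor split described above; IDX_PKG_ENERGY and
the offset values are placeholders rather than turbostat's real constants:

#include <stdio.h>

#define IDX_PKG_ENERGY 0

static int idx_to_offset_intel(int idx)
{
	return idx == IDX_PKG_ENERGY ? 0x611 : -1;   /* placeholder Intel offset */
}

static int idx_to_offset_amd(int idx)
{
	return idx == IDX_PKG_ENERGY ? 0x29b : -1;   /* placeholder AMD offset */
}

/* Default to the Intel helper; process_cpuid() repoints this for AMD/Hygon. */
static int (*idx_to_offset)(int idx) = idx_to_offset_intel;

int main(void)
{
	int authentic_amd = 1, hygon_genuine = 0;    /* pretend CPUID reported AMD */

	if (authentic_amd || hygon_genuine)
		idx_to_offset = idx_to_offset_amd;

	printf("IDX_PKG_ENERGY -> offset 0x%x\n", idx_to_offset(IDX_PKG_ENERGY));
	return 0;
}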

Fixes: 9972d5d84d76 ("tools/power turbostat: Enable accumulate RAPL display")
Signed-off-by: Terry Bowman 
Cc: Len Brown 
Cc: linux...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 tools/power/x86/turbostat/turbostat.c | 61 ---
 1 file changed, 56 insertions(+), 5 deletions(-)

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index a7c4f0772e53..24c7f380485f 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -291,7 +291,7 @@ struct msr_sum_array {
 /* The percpu MSR sum array.*/
 struct msr_sum_array *per_cpu_msr_sum;
 
-int idx_to_offset(int idx)
+int idx_to_offset_intel(int idx)
 {
int offset;
 
@@ -320,7 +320,7 @@ int idx_to_offset(int idx)
return offset;
 }
 
-int offset_to_idx(int offset)
+int offset_to_idx_intel(int offset)
 {
int idx;
 
@@ -349,7 +349,7 @@ int offset_to_idx(int offset)
return idx;
 }
 
-int idx_valid(int idx)
+int idx_valid_intel(int idx)
 {
switch (idx) {
case IDX_PKG_ENERGY:
@@ -368,6 +368,51 @@ int idx_valid(int idx)
return 0;
}
 }
+
+int (*idx_to_offset)(int idx) = idx_to_offset_intel;
+int (*offset_to_idx)(int offset) = offset_to_idx_intel;
+int (*idx_valid)(int idx) = idx_valid_intel;
+
+int idx_to_offset_amd(int idx)
+{
+   int offset;
+
+   switch (idx) {
+   case IDX_PKG_ENERGY:
+   offset = MSR_PKG_ENERGY_STAT;
+   break;
+   default:
+   offset = -1;
+   }
+
+   return offset;
+}
+
+int offset_to_idx_amd(int offset)
+{
+   int idx;
+
+   switch (offset) {
+   case MSR_PKG_ENERGY_STAT:
+   idx = IDX_PKG_ENERGY;
+   break;
+   default:
+   idx = -1;
+   }
+
+   return idx;
+}
+
+int idx_valid_amd(int idx)
+{
+   switch (idx) {
+   case IDX_PKG_ENERGY:
+   return do_rapl & MSR_PKG_ENERGY_STAT;
+   default:
+   return 0;
+   }
+}
+
 struct sys_counters {
unsigned int added_thread_counters;
unsigned int added_core_counters;
@@ -3249,7 +3294,7 @@ int get_msr_sum(int cpu, off_t offset, unsigned long long *msr)
return 1;
 
idx = offset_to_idx(offset);
-   if (idx < 0)
+   if (idx == -1)
return idx;
/* get_msr_sum() = sum + (get_msr() - last) */
ret = get_msr(cpu, offset, &msr_cur);
@@ -3277,7 +3322,7 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg
if (!idx_valid(i))
continue;
offset = idx_to_offset(i);
-   if (offset < 0)
+   if (offset == -1)
continue;
ret = get_msr(cpu, offset, &msr_cur);
if (ret) {
@@ -5348,6 +5393,12 @@ void process_cpuid()
if (!quiet)
decode_misc_feature_control();
 
+   if (authentic_amd || hygon_genuine) {
+   idx_to_offset = idx_to_offset_amd;
+   offset_to_idx = offset_to_idx_amd;
+   idx_valid = idx_valid_amd;
+   }
+
return;
 }
 
-- 
2.25.1



[PATCH v1 1/1] tools/power turbostat: Fix RAPL summary collection on AMD processors

2021-03-31 Thread Terry Bowman
Turbostat fails to correctly collect and display RAPL summary information
on Family 17h and 19h AMD processors. Running turbostat on these processors
returns immediately. When turbostat is working correctly, RAPL summary
data is displayed until the user-provided command completes. If a command
is not provided by the user, turbostat is designed to continuously
display RAPL information until interrupted.

The issue is due to offset_to_idx() and idx_to_offset() missing support for
AMD MSR addresses/offsets. offset_to_idx()'s switch statement is missing
cases for AMD MSRs and idx_to_offset() does not include a path to return
AMD MSR(s) for any idx.

The solution is to add AMD MSR support to offset_to_idx() and idx_to_offset().
These functions are split out and renamed along vendor lines to support
both AMD and Intel MSRs.

Fixes: 9972d5d84d76 ("tools/power turbostat: Enable accumulate RAPL display")
Signed-off-by: Terry Bowman 
Cc: Len Brown 
Cc: linux...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 tools/power/x86/turbostat/turbostat.c | 61 ---
 1 file changed, 56 insertions(+), 5 deletions(-)

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index a7c4f0772e53..24c7f380485f 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -291,7 +291,7 @@ struct msr_sum_array {
 /* The percpu MSR sum array.*/
 struct msr_sum_array *per_cpu_msr_sum;
 
-int idx_to_offset(int idx)
+int idx_to_offset_intel(int idx)
 {
int offset;
 
@@ -320,7 +320,7 @@ int idx_to_offset(int idx)
return offset;
 }
 
-int offset_to_idx(int offset)
+int offset_to_idx_intel(int offset)
 {
int idx;
 
@@ -349,7 +349,7 @@ int offset_to_idx(int offset)
return idx;
 }
 
-int idx_valid(int idx)
+int idx_valid_intel(int idx)
 {
switch (idx) {
case IDX_PKG_ENERGY:
@@ -368,6 +368,51 @@ int idx_valid(int idx)
return 0;
}
 }
+
+int (*idx_to_offset)(int idx) = idx_to_offset_intel;
+int (*offset_to_idx)(int offset) = offset_to_idx_intel;
+int (*idx_valid)(int idx) = idx_valid_intel;
+
+int idx_to_offset_amd(int idx)
+{
+   int offset;
+
+   switch (idx) {
+   case IDX_PKG_ENERGY:
+   offset = MSR_PKG_ENERGY_STAT;
+   break;
+   default:
+   offset = -1;
+   }
+
+   return offset;
+}
+
+int offset_to_idx_amd(int offset)
+{
+   int idx;
+
+   switch (offset) {
+   case MSR_PKG_ENERGY_STAT:
+   idx = IDX_PKG_ENERGY;
+   break;
+   default:
+   idx = -1;
+   }
+
+   return idx;
+}
+
+int idx_valid_amd(int idx)
+{
+   switch (idx) {
+   case IDX_PKG_ENERGY:
+   return do_rapl & MSR_PKG_ENERGY_STAT;
+   default:
+   return 0;
+   }
+}
+
 struct sys_counters {
unsigned int added_thread_counters;
unsigned int added_core_counters;
@@ -3249,7 +3294,7 @@ int get_msr_sum(int cpu, off_t offset, unsigned long long *msr)
return 1;
 
idx = offset_to_idx(offset);
-   if (idx < 0)
+   if (idx == -1)
return idx;
/* get_msr_sum() = sum + (get_msr() - last) */
ret = get_msr(cpu, offset, &msr_cur);
@@ -3277,7 +3322,7 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg
if (!idx_valid(i))
continue;
offset = idx_to_offset(i);
-   if (offset < 0)
+   if (offset == -1)
continue;
ret = get_msr(cpu, offset, &msr_cur);
if (ret) {
@@ -5348,6 +5393,12 @@ void process_cpuid()
if (!quiet)
decode_misc_feature_control();
 
+   if (authentic_amd || hygon_genuine) {
+   idx_to_offset = idx_to_offset_amd;
+   offset_to_idx = offset_to_idx_amd;
+   idx_valid = idx_valid_amd;
+   }
+
return;
 }
 
-- 
2.25.1



Re: [PATCH v4] tools/power turbostat: Fix RAPL summary collection on AMD processors

2021-04-16 Thread Terry Bowman



Hi Calvin,

Thanks for the feedback. I'll begin making the change and testing. I'll
respond with the V2 patch in this thread.


Regards,
Terry


On 4/14/21 9:13 PM, Calvin Walton wrote:

On Tue, 2021-03-30 at 21:38 +0000, Terry Bowman wrote:

+int idx_valid_amd(int idx)
+{
+   switch (idx) {
+   case IDX_PKG_ENERGY:
+   return do_rapl & MSR_PKG_ENERGY_STAT;

This isn't correct - MSR_PKG_ENERGY_STAT is the MSR offset, not a bit
mask for the do_rapl bit field.

The presence of MSR_PKG_ENERGY_STAT (along with MSR_RAPL_PWR_UNIT and
MSR_CORE_ENERGY_STAT) is indicated by the RAPL_AMD_F17H bit in do_rapl,
and can be checked with:
do_rapl & RAPL_AMD_F17H
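
To make the distinction concrete, here is a small standalone sketch; the bit
position and the MSR address below are assumed values for illustration, not
taken from turbostat.c:

#include <stdio.h>

#define RAPL_AMD_F17H        (1u << 28)    /* assumed capability bit */
#define MSR_PKG_ENERGY_STAT  0xc001029b    /* an MSR address, not a bit mask */

static unsigned int do_rapl = RAPL_AMD_F17H;

int main(void)
{
	/* The posted patch tests the capability bit field against an MSR address. */
	printf("do_rapl & MSR_PKG_ENERGY_STAT = %d (wrong test)\n",
	       !!(do_rapl & MSR_PKG_ENERGY_STAT));
	/* The review suggests testing the capability bit itself. */
	printf("do_rapl & RAPL_AMD_F17H       = %d (correct test)\n",
	       !!(do_rapl & RAPL_AMD_F17H));
	return 0;
}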




[PATCH v2] tools/power turbostat: Fix RAPL summary collection on AMD processors

2021-04-19 Thread Terry Bowman
Turbostat fails to correctly collect and display RAPL summary information
on Family 17h and 19h AMD processors. Running turbostat on these processors
returns immediately. When turbostat is working correctly, RAPL summary
data is displayed until the user-provided command completes. If a command
is not provided by the user, turbostat is designed to continuously
display RAPL information until interrupted.

The issue is due to offset_to_idx() and idx_to_offset() missing support for
AMD MSR addresses/offsets. offset_to_idx()'s switch statement is missing
cases for AMD MSRs and idx_to_offset() does not include a path to return
AMD MSR(s) for any idx.

The solution is to add AMD MSR support to offset_to_idx() and idx_to_offset().
These functions are split out and renamed along vendor lines to support
both AMD and Intel MSRs.

Fixes: 9972d5d84d76 ("tools/power turbostat: Enable accumulate RAPL display")
Signed-off-by: Terry Bowman 
---
Changes in V2:
  - Set patch title to v2. The first patch submission was mistakenly titled as
  v4 when it should have been v1.
  - Change offset variables from 'int' to 'off_t' type. This is needed
  to prevent sign extension when casting int->off_t, which is currently a
  problem for AMD MSRs using a base of 0xC000_ (see the sketch after this list)
  - Update idx_valid_amd() capability masking to use RAPL_AMD_F17H
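
A small standalone demonstration of the sign-extension issue mentioned in the
second item above; the MSR address used here is illustrative:

#include <stdio.h>
#include <sys/types.h>

int main(void)
{
	int as_int = (int)0xc001029b;       /* an AMD-range MSR address does not fit a signed int */
	off_t widened = as_int;             /* widening sign-extends on a 64-bit off_t */
	off_t as_off_t = (off_t)0xc001029b; /* stays positive when off_t is used throughout */

	printf("int -> off_t : 0x%llx\n", (unsigned long long)widened);
	printf("off_t        : 0x%llx\n", (unsigned long long)as_off_t);
	return 0;
}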

 tools/power/x86/turbostat/turbostat.c | 63 ---
 1 file changed, 57 insertions(+), 6 deletions(-)

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index a7c4f0772e53..5aacdbd28fa8 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -291,9 +291,9 @@ struct msr_sum_array {
 /* The percpu MSR sum array.*/
 struct msr_sum_array *per_cpu_msr_sum;
 
-int idx_to_offset(int idx)
+off_t idx_to_offset_intel(int idx)
 {
-   int offset;
+   off_t offset;
 
switch (idx) {
case IDX_PKG_ENERGY:
@@ -320,7 +320,7 @@ int idx_to_offset(int idx)
return offset;
 }
 
-int offset_to_idx(int offset)
+int offset_to_idx_intel(off_t offset)
 {
int idx;
 
@@ -349,7 +349,7 @@ int offset_to_idx(int offset)
return idx;
 }
 
-int idx_valid(int idx)
+int idx_valid_intel(int idx)
 {
switch (idx) {
case IDX_PKG_ENERGY:
@@ -368,6 +368,51 @@ int idx_valid(int idx)
return 0;
}
 }
+
+off_t (*idx_to_offset)(int idx) = idx_to_offset_intel;
+int (*offset_to_idx)(off_t offset) = offset_to_idx_intel;
+int (*idx_valid)(int idx) = idx_valid_intel;
+
+off_t idx_to_offset_amd(int idx)
+{
+   off_t offset;
+
+   switch (idx) {
+   case IDX_PKG_ENERGY:
+   offset = MSR_PKG_ENERGY_STAT;
+   break;
+   default:
+   offset = -1;
+   }
+
+   return offset;
+}
+
+int offset_to_idx_amd(off_t offset)
+{
+   int idx;
+
+   switch (offset) {
+   case MSR_PKG_ENERGY_STAT:
+   idx = IDX_PKG_ENERGY;
+   break;
+   default:
+   idx = -1;
+   }
+
+   return idx;
+}
+
+int idx_valid_amd(int idx)
+{
+   switch (idx) {
+   case IDX_PKG_ENERGY:
+   return do_rapl & RAPL_AMD_F17H;
+   default:
+   return 0;
+   }
+}
+
 struct sys_counters {
unsigned int added_thread_counters;
unsigned int added_core_counters;
@@ -3272,7 +3317,7 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg
 
for (i = IDX_PKG_ENERGY; i < IDX_COUNT; i++) {
unsigned long long msr_cur, msr_last;
-   int offset;
+   off_t offset;
 
if (!idx_valid(i))
continue;
@@ -3281,7 +3326,7 @@ static int update_msr_sum(struct thread_data *t, struct core_data *c, struct pkg
continue;
ret = get_msr(cpu, offset, &msr_cur);
if (ret) {
-   fprintf(outf, "Can not update msr(0x%x)\n", offset);
+   fprintf(outf, "Can not update msr(0x%lx)\n", offset);
continue;
}
 
@@ -5348,6 +5393,12 @@ void process_cpuid()
if (!quiet)
decode_misc_feature_control();
 
+   if (authentic_amd || hygon_genuine) {
+   idx_to_offset = idx_to_offset_amd;
+   offset_to_idx = offset_to_idx_amd;
+   idx_valid = idx_valid_amd;
+   }
+
return;
 }
 
-- 
2.25.1



[PATCH] ACPI / APEI: Add is_ghes_type() to identify GHES sources

2021-01-22 Thread Terry Bowman
From: Yazen Ghannam 

Refactor duplicated GHES identity logic into is_ghes_type().

Signed-off-by: Yazen Ghannam 
Reviewed-by: Robert Richter 
Signed-off-by: Terry Bowman 
---
 drivers/acpi/apei/hest.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c
index 6e980fe16772..bd702e0ef339 100644
--- a/drivers/acpi/apei/hest.c
+++ b/drivers/acpi/apei/hest.c
@@ -49,6 +49,12 @@ static const int hest_esrc_len_tab[ACPI_HEST_TYPE_RESERVED] = {
[ACPI_HEST_TYPE_IA32_DEFERRED_CHECK] = -1,
 };
 
+static inline bool is_ghes_type(struct acpi_hest_header *hest_hdr)
+{
+   return hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR ||
+  hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2;
+}
+
 static int hest_esrc_len(struct acpi_hest_header *hest_hdr)
 {
u16 hest_type = hest_hdr->type;
@@ -141,8 +147,7 @@ static int __init hest_parse_ghes_count(struct acpi_hest_header *hest_hdr, void
 {
int *count = data;
 
-   if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR ||
-   hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2)
+   if (is_ghes_type(hest_hdr))
(*count)++;
return 0;
 }
@@ -153,8 +158,7 @@ static int __init hest_parse_ghes(struct acpi_hest_header *hest_hdr, void *data)
struct ghes_arr *ghes_arr = data;
int rc, i;
 
-   if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR &&
-   hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR_V2)
+   if (!is_ghes_type(hest_hdr))
return 0;
 
if (!((struct acpi_hest_generic *)hest_hdr)->enabled)
-- 
2.27.0



[PATCH v2] ACPI / APEI: Add is_generic_error() to identify GHES sources

2021-01-26 Thread Terry Bowman
From: Yazen Ghannam 

Refactor duplicated GHES identity logic into is_generic_error().

Signed-off-by: Yazen Ghannam 
Reviewed-by: Robert Richter 
Co-developed-by: Terry Bowman 
Signed-off-by: Terry Bowman 
---
Changes in v2:
  - Rename is_ghes_type() to is_generic_error()
  - Add co-developed-by
  
 drivers/acpi/apei/hest.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c
index 6e980fe16772..f220bb00e91b 100644
--- a/drivers/acpi/apei/hest.c
+++ b/drivers/acpi/apei/hest.c
@@ -49,6 +49,12 @@ static const int hest_esrc_len_tab[ACPI_HEST_TYPE_RESERVED] = {
[ACPI_HEST_TYPE_IA32_DEFERRED_CHECK] = -1,
 };
 
+static inline bool is_generic_error(struct acpi_hest_header *hest_hdr)
+{
+   return hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR ||
+  hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2;
+}
+
 static int hest_esrc_len(struct acpi_hest_header *hest_hdr)
 {
u16 hest_type = hest_hdr->type;
@@ -141,8 +147,7 @@ static int __init hest_parse_ghes_count(struct acpi_hest_header *hest_hdr, void
 {
int *count = data;
 
-   if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR ||
-   hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2)
+   if (is_generic_error(hest_hdr))
(*count)++;
return 0;
 }
@@ -153,9 +158,7 @@ static int __init hest_parse_ghes(struct acpi_hest_header *hest_hdr, void *data)
struct ghes_arr *ghes_arr = data;
int rc, i;
 
-   if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR &&
-   hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR_V2)
+   if (!is_generic_error(hest_hdr))
return 0;
 
if (!((struct acpi_hest_generic *)hest_hdr)->enabled)
--
2.27.0



Re: [PATCH] ACPI / APEI: Add is_ghes_type() to identify GHES sources

2021-01-25 Thread Terry Bowman

On 1/25/21 11:14 AM, Borislav Petkov wrote:

On Mon, Jan 25, 2021 at 05:41:04PM +0100, Rafael J. Wysocki wrote:

On Fri, Jan 22, 2021 at 7:05 PM Terry Bowman  wrote:

From: Yazen Ghannam

Refactor duplicated GHES identity logic into is_ghes_type().

Signed-off-by: Yazen Ghannam
Reviewed-by: Robert Richter
Signed-off-by: Terry Bowman

If Terry was a co-author of the patch, please add a Co-developed-by:
tag for him in addition to the s-o-b.  Otherwise the meaning of his
s-o-b is unclear.

I will add "co-developed-by" in V2.

Boris, James, any objections to the changes below?


---
  drivers/acpi/apei/hest.c | 12 
  1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c
index 6e980fe16772..bd702e0ef339 100644
--- a/drivers/acpi/apei/hest.c
+++ b/drivers/acpi/apei/hest.c
@@ -49,6 +49,12 @@ static const int hest_esrc_len_tab[ACPI_HEST_TYPE_RESERVED] = {
 [ACPI_HEST_TYPE_IA32_DEFERRED_CHECK] = -1,
  };

+static inline bool is_ghes_type(struct acpi_hest_header *hest_hdr)
+{
+   return hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR ||
+  hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2;

I realize that this is supposed to test whether the table points to a
generic hardware error source, but everything in our code pertaining to
GHES is called, well, "ghes".

So I'd prefer to call that is_generic_error() or so.

Sure, I'll rename is_ghes_type() to is_generic_error() in v2.

Thx.



[PATCH v3 4/4] cxl/dax: Delay consumption of SOFT RESERVE resources

2025-04-05 Thread Terry Bowman
From: Nathan Fontenot 

The dax hmem device initialization will consume any iomem
SOFT RESERVE resources prior to CXL region creation. To allow
the CXL driver to complete region creation and trim any
SOFT RESERVE resources before the dax driver consumes them,
we need to delay the dax driver's search for SOFT RESERVEs.

To do this, the dax hmem device initialization code skips
the walk of the iomem resource tree if the CXL ACPI driver
is enabled. This allows the CXL driver to complete region
creation and trim any SOFT RESERVES. Once that is done, the
CXL driver registers any remaining SOFT RESERVE resources
with the dax hmem driver.

Signed-off-by: Nathan Fontenot 
Signed-off-by: Terry Bowman 
---
 drivers/cxl/core/region.c | 10 +
 drivers/dax/hmem/device.c | 43 ---
 drivers/dax/hmem/hmem.c   |  3 ++-
 include/linux/dax.h   |  6 ++
 4 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 25d70175f204..bf4a4371d98b 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include "core.h"
@@ -3444,6 +3445,11 @@ int cxl_add_to_region(struct cxl_port *root, struct cxl_endpoint_decoder *cxled)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_add_to_region, "CXL");
 
+static int cxl_srmem_register(struct resource *res, void *unused)
+{
+   return hmem_register_device(phys_to_target_node(res->start), res);
+}
+
 int cxl_region_srmem_update(void)
 {
struct device *dev = NULL;
@@ -3461,6 +3467,10 @@ int cxl_region_srmem_update(void)
put_device(dev);
} while (dev);
 
+   /* Now register any remaining SOFT RESERVES with dax */
+   walk_iomem_res_desc(IORES_DESC_SOFT_RESERVED, IORESOURCE_MEM,
+   0, -1, NULL, cxl_srmem_register);
+
return 0;
 }
 EXPORT_SYMBOL_NS_GPL(cxl_region_srmem_update, "CXL");
diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c
index 59ad44761191..cc1ed7bbdb1a 100644
--- a/drivers/dax/hmem/device.c
+++ b/drivers/dax/hmem/device.c
@@ -8,7 +8,6 @@
 static bool nohmem;
 module_param_named(disable, nohmem, bool, 0444);
 
-static bool platform_initialized;
 static DEFINE_MUTEX(hmem_resource_lock);
 static struct resource hmem_active = {
.name = "HMEM devices",
@@ -35,9 +34,7 @@ EXPORT_SYMBOL_GPL(walk_hmem_resources);
 
 static void __hmem_register_resource(int target_nid, struct resource *res)
 {
-   struct platform_device *pdev;
struct resource *new;
-   int rc;
 
new = __request_region(&hmem_active, res->start, resource_size(res), "",
   0);
@@ -47,21 +44,6 @@ static void __hmem_register_resource(int target_nid, struct resource *res)
}
 
new->desc = target_nid;
-
-   if (platform_initialized)
-   return;
-
-   pdev = platform_device_alloc("hmem_platform", 0);
-   if (!pdev) {
-   pr_err_once("failed to register device-dax hmem_platform device\n");
-   return;
-   }
-
-   rc = platform_device_add(pdev);
-   if (rc)
-   platform_device_put(pdev);
-   else
-   platform_initialized = true;
 }
 
 void hmem_register_resource(int target_nid, struct resource *res)
@@ -83,9 +65,28 @@ static __init int hmem_register_one(struct resource *res, void *data)
 
 static __init int hmem_init(void)
 {
-   walk_iomem_res_desc(IORES_DESC_SOFT_RESERVED,
-   IORESOURCE_MEM, 0, -1, NULL, hmem_register_one);
-   return 0;
+   struct platform_device *pdev;
+   int rc;
+
+   if (!IS_ENABLED(CONFIG_CXL_ACPI)) {
+   walk_iomem_res_desc(IORES_DESC_SOFT_RESERVED,
+   IORESOURCE_MEM, 0, -1, NULL,
+   hmem_register_one);
+   }
+
+   pdev = platform_device_alloc("hmem_platform", 0);
+   if (!pdev) {
+   pr_err("failed to register device-dax hmem_platform device\n");
+   return -1;
+   }
+
+   rc = platform_device_add(pdev);
+   if (rc) {
+   pr_err("failed to add device-dax hmem_platform device\n");
+   platform_device_put(pdev);
+   }
+
+   return rc;
 }
 
 /*
diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
index 3aedef5f1be1..a206b9b383e4 100644
--- a/drivers/dax/hmem/hmem.c
+++ b/drivers/dax/hmem/hmem.c
@@ -61,7 +61,7 @@ static void release_hmem(void *pdev)
platform_device_unregister(pdev);
 }
 
-static int hmem_register_device(int target_nid, const struct resource *res)
+int hmem_register_device(int target_nid, const struct resource *res)
 {
struct device *host = &dax_hmem_pdev->dev;

[PATCH v3 0/4] Add managed SOFT RESERVE resource handling

2025-04-04 Thread Terry Bowman
Add the ability to manage SOFT RESERVE iomem resources prior to them being
added to the iomem resource tree. This allows drivers, such as CXL, to
remove any pieces of the SOFT RESERVE resource that intersect with created
CXL regions.

The current approach of leaving the SOFT RESERVE resources as is can cause
failures during hotplug of devices, such as CXL, because the resource is
not available for reuse after teardown of the device.

The approach is to add SOFT RESERVE resources to a separate tree during
boot. This allows any drivers to update the SOFT RESERVE resources before
they are merged into the iomem resource tree. In addition, a notifier chain
is added so that drivers can be notified when these SOFT RESERVE resources
are added to the iomem resource tree.

The CXL driver is modified to use a worker thread that waits for the CXL
PCI and CXL mem drivers to be loaded and for their probe routine to
complete. Then the driver walks through any created CXL regions to trim any
intersections with SOFT RESERVE resources in the iomem tree.

The dax driver uses the new soft reserve notifier chain so it can consume
any remaining SOFT RESERVES once they're added to the iomem tree.

V3 updates:
 - Remove the srmem resource tree from kernel/resource.c; this is no longer
   needed in the current implementation. All SOFT RESERVE resources are now
   put on the iomem resource tree.
 - Remove the no longer needed SOFT_RESERVED_MANAGED kernel config option.
 - Add the 'nid' parameter back to hmem_register_resource();
 - Remove the no longer used soft reserve notification chain (introduced
   in v2). The dax driver is now notified of SOFT RESERVED resources by
   the CXL driver.

v2 updates:
 - Add config option SOFT_RESERVE_MANAGED to control use of the
   separate srmem resource tree at boot.
 - Only add SOFT RESERVE resources to the soft reserve tree during
   boot, they go to the iomem resource tree after boot.
 - Remove the resource trimming code in the previous patch to re-use
   the existing code in kernel/resource.c
 - Add functionality for the cxl acpi driver to wait for the cxl PCI
   and mem drivers to load.

Nathan Fontenot (4):
  kernel/resource: Provide mem region release for SOFT RESERVES
  cxl: Update Soft Reserved resources upon region creation
  dax/hmem: Save the dax hmem platform device pointer
  cxl/dax: Delay consumption of SOFT RESERVE resources

 drivers/cxl/Kconfig|  4 ---
 drivers/cxl/acpi.c | 28 +++
 drivers/cxl/core/Makefile  |  2 +-
 drivers/cxl/core/region.c  | 34 ++-
 drivers/cxl/core/suspend.c | 41 
 drivers/cxl/cxl.h  |  3 +++
 drivers/cxl/cxlmem.h   |  9 ---
 drivers/cxl/cxlpci.h   |  1 +
 drivers/cxl/pci.c  |  2 ++
 drivers/dax/hmem/device.c  | 47 
 drivers/dax/hmem/hmem.c| 10 ---
 include/linux/dax.h| 11 +---
 include/linux/ioport.h |  3 +++
 include/linux/pm.h |  7 -
 kernel/resource.c  | 55 +++---
 15 files changed, 202 insertions(+), 55 deletions(-)


base-commit: aae0594a7053c60b82621136257c8b648c67b512
-- 
2.34.1




[PATCH v3 2/4] cxl: Update Soft Reserved resources upon region creation

2025-04-03 Thread Terry Bowman
From: Nathan Fontenot 

Update handling of SOFT RESERVE iomem resources that intersect with
CXL region resources to remove intersections from the SOFT RESERVE
resources. The current approach of leaving SOFT RESERVE resources as
is can cause failures during hotplug replace of CXL devices because
the resource is not available for reuse after teardown of the CXL device.

To accomplish this, the cxl acpi driver creates a worker thread at the
end of cxl_acpi_probe(). This worker thread first waits for the CXL PCI
and CXL mem drivers to load. The cxl core/suspend.c code is updated to
add a pci_loaded variable, in addition to the mem_active variable, that
is updated when the pci driver loads. Remove the CONFIG_CXL_SUSPEND Kconfig
option as it is no longer needed. A new cxl_wait_for_pci_mem() routine uses
a waitqueue to wait for both of these drivers to load. This additional
waitqueue is needed to ensure the CXL PCI and CXL mem drivers have loaded
before we wait for their probe; without it, the cxl acpi probe worker thread
calls wait_for_device_probe() before these drivers are loaded.
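
Since the suspend.c hunks are not shown in full below, here is a hedged sketch
of the waitqueue arrangement described above. The pci_loaded, mem_active and
cxl_wait_for_pci_mem() names come from this description; mark_cxl_pci_loaded()
and the exact bodies are assumptions, not the actual patch:

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(cxl_wait_queue);
static bool pci_loaded;    /* set when the CXL PCI driver loads (per the text) */
static bool mem_active;    /* set when the CXL mem driver loads (per the text) */

/* Hypothetical hook called from the CXL PCI driver's module init. */
void mark_cxl_pci_loaded(void)
{
	pci_loaded = true;
	wake_up(&cxl_wait_queue);
}

/* Blocks the cxl_acpi worker until both drivers have loaded. */
void cxl_wait_for_pci_mem(void)
{
	wait_event(cxl_wait_queue, pci_loaded && mem_active);
}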

After the CXL PCI and CXL mem drivers load the cxl acpi worker thread
uses wait_for_device_probe() to ensure device probe routines have
completed.

Once probe completes and regions have been created, find all cxl
regions that have been created and trim any SOFT RESERVE resources
that intersect with the region.

Update cxl_acpi_exit() to cancel pending waitqueue work.

Signed-off-by: Nathan Fontenot 
Signed-off-by: Terry Bowman 
---
 drivers/cxl/Kconfig|  4 
 drivers/cxl/acpi.c | 28 ++
 drivers/cxl/core/Makefile  |  2 +-
 drivers/cxl/core/region.c  | 24 +-
 drivers/cxl/core/suspend.c | 41 ++
 drivers/cxl/cxl.h  |  3 +++
 drivers/cxl/cxlmem.h   |  9 -
 drivers/cxl/cxlpci.h   |  1 +
 drivers/cxl/pci.c  |  2 ++
 include/linux/pm.h |  7 ---
 10 files changed, 99 insertions(+), 22 deletions(-)

diff --git a/drivers/cxl/Kconfig b/drivers/cxl/Kconfig
index 205547e5543a..c7377956c1d5 100644
--- a/drivers/cxl/Kconfig
+++ b/drivers/cxl/Kconfig
@@ -117,10 +117,6 @@ config CXL_PORT
default CXL_BUS
tristate
 
-config CXL_SUSPEND
-   def_bool y
-   depends on SUSPEND && CXL_MEM
-
 config CXL_REGION
bool "CXL: Region Support"
default CXL_BUS
diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
index cb14829bb9be..94f2d649bb30 100644
--- a/drivers/cxl/acpi.c
+++ b/drivers/cxl/acpi.c
@@ -7,6 +7,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include "cxlpci.h"
 #include "cxl.h"
@@ -813,6 +815,27 @@ static int pair_cxl_resource(struct device *dev, void *data)
return 0;
 }
 
+static void cxl_srmem_work_fn(struct work_struct *work)
+{
+   /* Wait for CXL PCI and mem drivers to load */
+   cxl_wait_for_pci_mem();
+
+   /*
+* Once the CXL PCI and mem drivers have loaded wait
+* for the driver probe routines to complete.
+*/
+   wait_for_device_probe();
+
+   cxl_region_srmem_update();
+}
+
+DECLARE_WORK(cxl_sr_work, cxl_srmem_work_fn);
+
+static void cxl_srmem_update(void)
+{
+   schedule_work(&cxl_sr_work);
+}
+
 static int cxl_acpi_probe(struct platform_device *pdev)
 {
int rc;
@@ -887,6 +910,10 @@ static int cxl_acpi_probe(struct platform_device *pdev)
 
/* In case PCI is scanned before ACPI re-trigger memdev attach */
cxl_bus_rescan();
+
+   /* Update SOFT RESERVED resources that intersect with CXL regions */
+   cxl_srmem_update();
+
return 0;
 }
 
@@ -918,6 +945,7 @@ static int __init cxl_acpi_init(void)
 
 static void __exit cxl_acpi_exit(void)
 {
+   cancel_work_sync(&cxl_sr_work);
platform_driver_unregister(&cxl_acpi_driver);
cxl_bus_drain();
 }
diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile
index 086df97a0fcf..035864db8a32 100644
--- a/drivers/cxl/core/Makefile
+++ b/drivers/cxl/core/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_CXL_BUS) += cxl_core.o
-obj-$(CONFIG_CXL_SUSPEND) += suspend.o
+obj-y += suspend.o
 
 ccflags-y += -I$(srctree)/drivers/cxl
 CFLAGS_trace.o = -DTRACE_INCLUDE_PATH=. -I$(src)
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index c3f4dc244df7..25d70175f204 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include "core.h"
@@ -2333,7 +2334,7 @@ const struct device_type cxl_region_type = {
 
 bool is_cxl_region(struct device *dev)
 {
-   return dev->type == &cxl_region_type;
+   return dev && dev->type == &cxl_region_type;
 }
 EXPORT_SYMBOL_NS_GPL(is_cxl_region, "CXL");
 
@@ -3443,6 +3444,27 @@ int cxl_add_to_region(struct cxl_port *root, str

[PATCH v3 3/4] dax/hmem: Save the dax hmem platform device pointer

2025-04-03 Thread Terry Bowman
From: Nathan Fontenot 

In order to handle registering hmem devices for SOFT RESERVE
resources after the dax hmem device initialization occurs, we
need to save a reference to the dax hmem platform device that
will be used in a following patch.

Saving the platform device pointer also allows us to clean
up the walk_hmem_resources() routine so it no longer requires
the struct device argument.

There should be no functional changes.

Signed-off-by: Nathan Fontenot 
Signed-off-by: Terry Bowman 
---
 drivers/dax/hmem/device.c | 4 ++--
 drivers/dax/hmem/hmem.c   | 9 ++---
 include/linux/dax.h   | 5 ++---
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c
index f9e1a76a04a9..59ad44761191 100644
--- a/drivers/dax/hmem/device.c
+++ b/drivers/dax/hmem/device.c
@@ -17,14 +17,14 @@ static struct resource hmem_active = {
.flags = IORESOURCE_MEM,
 };
 
-int walk_hmem_resources(struct device *host, walk_hmem_fn fn)
+int walk_hmem_resources(walk_hmem_fn fn)
 {
struct resource *res;
int rc = 0;
 
mutex_lock(&hmem_resource_lock);
for (res = hmem_active.child; res; res = res->sibling) {
-   rc = fn(host, (int) res->desc, res);
+   rc = fn((int) res->desc, res);
if (rc)
break;
}
diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
index 5e7c53f18491..3aedef5f1be1 100644
--- a/drivers/dax/hmem/hmem.c
+++ b/drivers/dax/hmem/hmem.c
@@ -9,6 +9,8 @@
 static bool region_idle;
 module_param_named(region_idle, region_idle, bool, 0644);
 
+static struct platform_device *dax_hmem_pdev;
+
 static int dax_hmem_probe(struct platform_device *pdev)
 {
unsigned long flags = IORESOURCE_DAX_KMEM;
@@ -59,9 +61,9 @@ static void release_hmem(void *pdev)
platform_device_unregister(pdev);
 }
 
-static int hmem_register_device(struct device *host, int target_nid,
-   const struct resource *res)
+static int hmem_register_device(int target_nid, const struct resource *res)
 {
+   struct device *host = &dax_hmem_pdev->dev;
struct platform_device *pdev;
struct memregion_info info;
long id;
@@ -125,7 +127,8 @@ static int hmem_register_device(struct device *host, int target_nid,
 
 static int dax_hmem_platform_probe(struct platform_device *pdev)
 {
-   return walk_hmem_resources(&pdev->dev, hmem_register_device);
+   dax_hmem_pdev = pdev;
+   return walk_hmem_resources(hmem_register_device);
 }
 
 static struct platform_driver dax_hmem_platform_driver = {
diff --git a/include/linux/dax.h b/include/linux/dax.h
index df41a0017b31..4b4d16f94898 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -277,7 +277,6 @@ static inline void hmem_register_resource(int target_nid, struct resource *r)
 }
 #endif
 
-typedef int (*walk_hmem_fn)(struct device *dev, int target_nid,
-   const struct resource *res);
-int walk_hmem_resources(struct device *dev, walk_hmem_fn fn);
+typedef int (*walk_hmem_fn)(int target_nid, const struct resource *res);
+int walk_hmem_resources(walk_hmem_fn fn);
 #endif
-- 
2.34.1




[PATCH v3 1/4] kernel/resource: Provide mem region release for SOFT RESERVES

2025-04-03 Thread Terry Bowman
From: Nathan Fontenot 

Add a release_srmem_region_adjustable() interface to allow for
removing SOFT RESERVE memory resources. This extracts the code to
remove a mem region into a common __release_mem_region_adjustable()
routine, which takes additional parameters: an IORES descriptor type
used to check against IORES_DESC_* and a flag that controls whether
IORESOURCE_BUSY is checked.

The existing release_mem_region_adjustable() is a front end to the
common code and a new release_srmem_region_adjustable() is added to
release SOFT RESERVE resources.
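
As a usage illustration (the caller lands in patch 2/4's region.c, which is
not shown in full in this thread), a consumer could trim the Soft Reserved
overlap for a CXL region roughly as below; the wrapper name is hypothetical:

#include <linux/ioport.h>

/* Hypothetical wrapper, not an actual hunk from this series. */
static void trim_soft_reserved_overlap(struct resource *region_res)
{
	/* Drop IORES_DESC_SOFT_RESERVED pieces overlapping the region so the
	 * range can be reused after the CXL device is torn down.
	 */
	release_srmem_region_adjustable(region_res->start,
					resource_size(region_res));
}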

Signed-off-by: Nathan Fontenot 
Signed-off-by: Terry Bowman 
---
 include/linux/ioport.h |  3 +++
 kernel/resource.c  | 55 +++---
 2 files changed, 54 insertions(+), 4 deletions(-)

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 5385349f0b8a..718360c9c724 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -357,6 +357,9 @@ extern void __release_region(struct resource *, resource_size_t,
 #ifdef CONFIG_MEMORY_HOTREMOVE
 extern void release_mem_region_adjustable(resource_size_t, resource_size_t);
 #endif
+#ifdef CONFIG_CXL_REGION
+extern void release_srmem_region_adjustable(resource_size_t, resource_size_t);
+#endif
 #ifdef CONFIG_MEMORY_HOTPLUG
 extern void merge_system_ram_resource(struct resource *res);
 #endif
diff --git a/kernel/resource.c b/kernel/resource.c
index 12004452d999..0195b31064b0 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1387,7 +1387,7 @@ void __release_region(struct resource *parent, resource_size_t start,
 }
 EXPORT_SYMBOL(__release_region);
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
+#if defined(CONFIG_MEMORY_HOTREMOVE) || defined(CONFIG_CXL_REGION)
 /**
  * release_mem_region_adjustable - release a previously reserved memory region
  * @start: resource start address
@@ -1407,7 +1407,10 @@ EXPORT_SYMBOL(__release_region);
  *   assumes that all children remain in the lower address entry for
  *   simplicity.  Enhance this logic when necessary.
  */
-void release_mem_region_adjustable(resource_size_t start, resource_size_t size)
+static void __release_mem_region_adjustable(resource_size_t start,
+   resource_size_t size,
+   bool busy_check,
+   int res_desc)
 {
struct resource *parent = &iomem_resource;
struct resource *new_res = NULL;
@@ -1446,7 +1449,12 @@ void release_mem_region_adjustable(resource_size_t start, resource_size_t size)
if (!(res->flags & IORESOURCE_MEM))
break;
 
-   if (!(res->flags & IORESOURCE_BUSY)) {
+   if (busy_check && !(res->flags & IORESOURCE_BUSY)) {
+   p = &res->child;
+   continue;
+   }
+
+   if (res_desc != IORES_DESC_NONE && res->desc != res_desc) {
p = &res->child;
continue;
}
@@ -1496,7 +1504,46 @@ void release_mem_region_adjustable(resource_size_t start, resource_size_t size)
write_unlock(&resource_lock);
free_resource(new_res);
 }
-#endif /* CONFIG_MEMORY_HOTREMOVE */
+#endif
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/**
+ * release_mem_region_adjustable - release a previously reserved memory region
+ * @start: resource start address
+ * @size: resource region size
+ *
+ * This interface is intended for memory hot-delete.  The requested region
+ * is released from a currently busy memory resource.  The requested region
+ * must either match exactly or fit into a single busy resource entry.  In
+ * the latter case, the remaining resource is adjusted accordingly.
+ * Existing children of the busy memory resource must be immutable in the
+ * request.
+ *
+ * Note:
+ * - Additional release conditions, such as overlapping region, can be
+ *   supported after they are confirmed as valid cases.
+ * - When a busy memory resource gets split into two entries, the code
+ *   assumes that all children remain in the lower address entry for
+ *   simplicity.  Enhance this logic when necessary.
+ */
+void release_mem_region_adjustable(resource_size_t start, resource_size_t size)
+{
+   return __release_mem_region_adjustable(start, size,
+  true, IORES_DESC_NONE);
+}
+EXPORT_SYMBOL(release_mem_region_adjustable);
+#endif
+
+#ifdef CONFIG_CXL_REGION
+void release_srmem_region_adjustable(resource_size_t start,
+resource_size_t size)
+{
+   return __release_mem_region_adjustable(start, size,
+  false, IORES_DESC_SOFT_RESERVED);
+}
+EXPORT_SYMBOL(release_srmem_region_adjustable);
+#endif
+
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 static bool system_ram_resources_mergeable(struct resource *r1,
-- 
2.34.1