Re: [PATCH v2 4/4] powerpc/pseries/cpuhp: remove obsolete comment from pseries_cpu_die

2021-09-28 Thread Michael Ellerman
Daniel Henrique Barboza  writes:
> On 9/27/21 17:19, Nathan Lynch wrote:
>> This comment likely refers to the obsolete DLPAR workflow where some
>> resource state transitions were driven more directly from user space
>> utilities, but it also seems to contradict itself: "Change isolate state to
>> Isolate [...]" is at odds with the preceding sentences, and it does not
>> relate at all to the code that follows.
>
> This comment was added by commit 413f7c405a34, a 2006 commit where Mike
> Ellerman moved code from platform/pseries/smp.c into hotplug-cpu.c.
>
> I checked the original code back in smp.c and this comment was added there
> by commit 1da177e4c3f41, which is Linus's initial git commit, where he 
> mentions
> that he didn't bother with full history (although it is available somewhere,
> allegedly).
>
> This is enough to say that we can't easily see the history behind this 
> comment.
> I also believe that we're better off without it since it doesn't make sense
> with the current codebase.

It was added by the original CPU hotplug commit for ppc64::

https://github.com/mpe/linux-fullhistory/commit/0e9fd9441cd2113b67b14e739267c9e69761489b


The code was fairly similar:

void __cpu_die(unsigned int cpu)
{
int tries;
int cpu_status;
unsigned int pcpu = get_hard_smp_processor_id(cpu);

for (tries = 0; tries < 5; tries++) {
cpu_status = query_cpu_stopped(pcpu);

if (cpu_status == 0)
break;
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(HZ);
}
if (cpu_status != 0) {
printk("Querying DEAD? cpu %i (%i) shows %i\n",
   cpu, pcpu, cpu_status);
}

/* Isolation and deallocation are definatly done by
 * drslot_chrp_cpu.  If they were not they would be
 * done here.  Change isolate state to Isolate and
 * change allocation-state to Unusable.
 */
paca[cpu].xProcStart = 0;

/* So we can recognize if it fails to come up next time. */
cpu_callin_map[cpu] = 0;
}


drslot_chrp_cpu() still exists in drmgr:

  
https://github.com/ibm-power-utilities/powerpc-utils/blob/e798c4a09fbf0fa0f421e624cfa366a6c405c9fe/src/drmgr/drslot_chrp_cpu.c#L406


I agree the comment is no longer meaningful and can be removed.

It might be good to then add a comment explaining why we need to set
cpu_start = 0.

It's not immediately clear why we need to. When we bring a CPU back
online in smp_pSeries_kick_cpu() we ask RTAS to start it and then
immediately set cpu_start = 1, ie. there isn't a separate step that sets
cpu_start = 1 for hotplugged CPUs.

cheers


[PATCH v2 2/2] powerpc:85xx: fix timebase sync issue when CONFIG_HOTPLUG_CPU=n

2021-09-28 Thread Xiaoming Ni
When CONFIG_SMP=y, timebase synchronization is required when the second
 kernel is started.
arch/powerpc/kernel/smp.c:
int __cpu_up(unsigned int cpu, struct task_struct *tidle)
{
...
if (smp_ops->give_timebase)
smp_ops->give_timebase();
...
}

void start_secondary(void *unused)
{
...
if (smp_ops->take_timebase)
smp_ops->take_timebase();
...
}

When CONFIG_HOTPLUG_CPU=n and CONFIG_KEXEC_CORE=n,
 smp_85xx_ops.give_timebase is NULL,
 smp_85xx_ops.take_timebase is NULL,
As a result, the timebase is not synchronized.

Timebase  synchronization does not depend on CONFIG_HOTPLUG_CPU.

Fixes: 56f1ba280719 ("powerpc/mpc85xx: refactor the PM operations")
Cc: sta...@vger.kernel.org #v4.6
Signed-off-by: Xiaoming Ni 
---
 arch/powerpc/platforms/85xx/Makefile |  4 +++-
 arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c |  4 
 arch/powerpc/platforms/85xx/smp.c| 12 ++--
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/85xx/Makefile 
b/arch/powerpc/platforms/85xx/Makefile
index 60e4e97a929d..260fbad7967b 100644
--- a/arch/powerpc/platforms/85xx/Makefile
+++ b/arch/powerpc/platforms/85xx/Makefile
@@ -3,7 +3,9 @@
 # Makefile for the PowerPC 85xx linux kernel.
 #
 obj-$(CONFIG_SMP) += smp.o
-obj-$(CONFIG_FSL_PMC)+= mpc85xx_pm_ops.o
+ifneq ($(CONFIG_FSL_CORENET_RCPM),y)
+obj-$(CONFIG_SMP) += mpc85xx_pm_ops.o
+endif
 
 obj-y += common.o
 
diff --git a/arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c 
b/arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c
index ffa8a7a6a2db..4a8af80011a6 100644
--- a/arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c
+++ b/arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c
@@ -17,6 +17,7 @@
 
 static struct ccsr_guts __iomem *guts;
 
+#ifdef CONFIG_FSL_PMC
 static void mpc85xx_irq_mask(int cpu)
 {
 
@@ -49,6 +50,7 @@ static void mpc85xx_cpu_up_prepare(int cpu)
 {
 
 }
+#endif
 
 static void mpc85xx_freeze_time_base(bool freeze)
 {
@@ -76,10 +78,12 @@ static const struct of_device_id mpc85xx_smp_guts_ids[] = {
 
 static const struct fsl_pm_ops mpc85xx_pm_ops = {
.freeze_time_base = mpc85xx_freeze_time_base,
+#ifdef CONFIG_FSL_PMC
.irq_mask = mpc85xx_irq_mask,
.irq_unmask = mpc85xx_irq_unmask,
.cpu_die = mpc85xx_cpu_die,
.cpu_up_prepare = mpc85xx_cpu_up_prepare,
+#endif
 };
 
 int __init mpc85xx_setup_pmc(void)
diff --git a/arch/powerpc/platforms/85xx/smp.c 
b/arch/powerpc/platforms/85xx/smp.c
index c6df294054fe..83f4a6389a28 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -40,7 +40,6 @@ struct epapr_spin_table {
u32 pir;
 };
 
-#ifdef CONFIG_HOTPLUG_CPU
 static u64 timebase;
 static int tb_req;
 static int tb_valid;
@@ -112,6 +111,7 @@ static void mpc85xx_take_timebase(void)
local_irq_restore(flags);
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
 static void smp_85xx_cpu_offline_self(void)
 {
unsigned int cpu = smp_processor_id();
@@ -495,21 +495,21 @@ void __init mpc85xx_smp_init(void)
smp_85xx_ops.probe = NULL;
}
 
-#ifdef CONFIG_HOTPLUG_CPU
 #ifdef CONFIG_FSL_CORENET_RCPM
+   /* Assign a value to qoriq_pm_ops on PPC_E500MC */
fsl_rcpm_init();
-#endif
-
-#ifdef CONFIG_FSL_PMC
+#else
+   /* Assign a value to qoriq_pm_ops on !PPC_E500MC */
mpc85xx_setup_pmc();
 #endif
if (qoriq_pm_ops) {
smp_85xx_ops.give_timebase = mpc85xx_give_timebase;
smp_85xx_ops.take_timebase = mpc85xx_take_timebase;
+#ifdef CONFIG_HOTPLUG_CPU
smp_85xx_ops.cpu_offline_self = smp_85xx_cpu_offline_self;
smp_85xx_ops.cpu_die = qoriq_cpu_kill;
-   }
 #endif
+   }
smp_ops = &smp_85xx_ops;
 
 #ifdef CONFIG_KEXEC_CORE
-- 
2.27.0



[PATCH v2 1/2] powerpc:85xx:Fix oops when mpc85xx_smp_guts_ids node cannot be found

2021-09-28 Thread Xiaoming Ni
When the field described in mpc85xx_smp_guts_ids[] is not configured in
 dtb, the mpc85xx_setup_pmc() does not assign a value to the "guts"
 variable. As a result, the oops is triggered when
 mpc85xx_freeze_time_base() is executed.

Fixes: 56f1ba280719 ("powerpc/mpc85xx: refactor the PM operations")
Cc: sta...@vger.kernel.org #v4.6
Signed-off-by: Xiaoming Ni 
---
 arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c 
b/arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c
index 7c0133f558d0..ffa8a7a6a2db 100644
--- a/arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c
+++ b/arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c
@@ -94,9 +94,8 @@ int __init mpc85xx_setup_pmc(void)
pr_err("Could not map guts node address\n");
return -ENOMEM;
}
+   qoriq_pm_ops = &mpc85xx_pm_ops;
}
 
-   qoriq_pm_ops = &mpc85xx_pm_ops;
-
return 0;
 }
-- 
2.27.0



[PATCH v2 0/2] powerpc:85xx: fix timebase sync issue when CONFIG_HOTPLUG_CPU=n

2021-09-28 Thread Xiaoming Ni
When CONFIG_SMP=y, timebase synchronization is required for mpc8572 when
 the second kernel is started
arch/powerpc/kernel/smp.c:
int __cpu_up(unsigned int cpu, struct task_struct *tidle)
{
...
if (smp_ops->give_timebase)
smp_ops->give_timebase();
...
}

void start_secondary(void *unused)
{
...
if (smp_ops->take_timebase)
smp_ops->take_timebase();
...
}

When CONFIG_HOTPLUG_CPU=n and CONFIG_KEXEC_CORE=n,
 smp_85xx_ops.give_timebase is NULL,
 smp_85xx_ops.take_timebase is NULL,
As a result, the timebase is not synchronized.

test code:
for i in $(seq 1 3); do taskset 1 date; taskset 2 date; sleep 1; 
echo;done
log:
Sat Sep 25 18:50:00 CST 2021
Sat Sep 25 19:07:47 CST 2021

Sat Sep 25 18:50:01 CST 2021
Sat Sep 25 19:07:48 CST 2021

Sat Sep 25 18:50:02 CST 2021
Sat Sep 25 19:07:49 CST 2021

Code snippet about give_timebase and take_timebase assignments:
arch/powerpc/platforms/85xx/smp.c:
#ifdef CONFIG_HOTPLUG_CPU
#ifdef CONFIG_FSL_CORENET_RCPM
fsl_rcpm_init();
#endif
#ifdef CONFIG_FSL_PMC
mpc85xx_setup_pmc();
#endif
if (qoriq_pm_ops) {
smp_85xx_ops.give_timebase = mpc85xx_give_timebase;
smp_85xx_ops.take_timebase = mpc85xx_take_timebase;

config dependency:
FSL_CORENET_RCPM depends on the PPC_E500MC.
FSL_PMC depends on SUSPEND.
SUSPEND depends on ARCH_SUSPEND_POSSIBLE.
ARCH_SUSPEND_POSSIBLE depends on !PPC_E500MC.

CONFIG_HOTPLUG_CPU and CONFIG_FSL_PMC require the timebase function, but
the timebase should not depend on CONFIG_HOTPLUG_CPU and CONFIG_FSL_PMC.
Therefore, adjust the macro control range. Ensure that the corresponding
 timebase hook function is not empty when the dtsi node is configured.

-
changes in v2:
 1. add new patch: "powerpc:85xx:Fix oops when mpc85xx_smp_guts_ids node
  cannot be found"
 2. Using !CONFIG_FSL_CORENET_RCPM to manage the timebase code of !PPC_E500MC

v1:
 https://lore.kernel.org/lkml/20210926025144.55674-1-nixiaom...@huawei.com
--

Xiaoming Ni (2):
  powerpc:85xx:Fix oops when mpc85xx_smp_guts_ids node cannot be found
  powerpc:85xx: fix timebase sync issue when CONFIG_HOTPLUG_CPU=n

 arch/powerpc/platforms/85xx/Makefile |  4 +++-
 arch/powerpc/platforms/85xx/mpc85xx_pm_ops.c |  7 +--
 arch/powerpc/platforms/85xx/smp.c| 12 ++--
 3 files changed, 14 insertions(+), 9 deletions(-)

-- 
2.27.0



Re: [PATCH v2 2/4] mm: Make generic arch_is_kernel_initmem_freed() do what it says

2021-09-28 Thread Andrew Morton
On Tue, 28 Sep 2021 09:15:35 +0200 Christophe Leroy 
 wrote:

> Commit 7a5da02de8d6 ("locking/lockdep: check for freed initmem in
> static_obj()") added arch_is_kernel_initmem_freed() which is supposed
> to report whether an object is part of already freed init memory.
> 
> For the time being, the generic version of arch_is_kernel_initmem_freed()
> always reports 'false', although free_initmem() is generically called
> on all architectures.
> 
> Therefore, change the generic version of arch_is_kernel_initmem_freed()
> to check whether free_initmem() has been called. If so, then check
> if a given address falls into init memory.
> 
> In order to use function init_section_contains(), the function is
> moved to the end of asm-generic/sections.h

i386 allmodconfig:

In file included from arch/x86/platform/intel-quark/imr.c:28:
./include/asm-generic/sections.h: In function 'arch_is_kernel_initmem_freed':
./include/asm-generic/sections.h:171:6: error: 'system_state' undeclared (first 
use in this function)
  171 |  if (system_state < SYSTEM_FREEING_INITMEM)
  |  ^~~~
./include/asm-generic/sections.h:171:6: note: each undeclared identifier is 
reported only once for each function it appears in
./include/asm-generic/sections.h:171:21: error: 'SYSTEM_FREEING_INITMEM' 
undeclared (first use in this function)
  171 |  if (system_state < SYSTEM_FREEING_INITMEM)
  | ^~


I don't think it would be a good idea to include kernel.h from
sections.h - it's unclear to me which is the "innermost" of those two. 
It would be better to uninline arch_is_kernel_initmem_freed().  Surely
there's no real reason for inlining it?

Anyway, I'll drop the series for now.



Re: [PATCH v2 9/9] iommu/vt-d: Use pci core's DVSEC functionality

2021-09-28 Thread Lu Baolu

Hi Dan,

On 9/29/21 1:54 AM, Dan Williams wrote:

On Thu, Sep 23, 2021 at 10:27 AM Ben Widawsky  wrote:


Reduce maintenance burden of DVSEC query implementation by using the
centralized PCI core implementation.

Cc: io...@lists.linux-foundation.org
Cc: David Woodhouse 
Cc: Lu Baolu 
Signed-off-by: Ben Widawsky 
---
  drivers/iommu/intel/iommu.c | 15 +--
  1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index d75f59ae28e6..30c97181f0ae 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -5398,20 +5398,7 @@ static int intel_iommu_disable_sva(struct device *dev)
   */
  static int siov_find_pci_dvsec(struct pci_dev *pdev)
  {
-   int pos;
-   u16 vendor, id;
-
-   pos = pci_find_next_ext_capability(pdev, 0, 0x23);
-   while (pos) {
-   pci_read_config_word(pdev, pos + 4, &vendor);
-   pci_read_config_word(pdev, pos + 8, &id);
-   if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
-   return pos;
-
-   pos = pci_find_next_ext_capability(pdev, pos, 0x23);
-   }
-
-   return 0;
+   return pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_INTEL, 5);
  }


Same comments as the CXL patch, siov_find_pci_dvsec() doesn't seem to
have a reason to exist anymore. What is 5?


"5" is DVSEC ID for Scalable IOV.

Anyway, the siov_find_pci_dvsec() has been dead code since commit
262948f8ba57 ("iommu: Delete iommu_dev_has_feature()"). I have a patch
to clean it up. No need to care about it in this series.

Best regards,
baolu


Re: [PATCH v3 9/9] powerpc/mm: Use is_kernel_text() and is_kernel_inittext() helper

2021-09-28 Thread Kefeng Wang




On 2021/9/29 1:51, Christophe Leroy wrote:



Le 26/09/2021 à 09:20, Kefeng Wang a écrit :

Use is_kernel_text() and is_kernel_inittext() helper to simplify code,
also drop etext, _stext, _sinittext, _einittext declaration which
already declared in section.h.

Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Kefeng Wang 
---
  arch/powerpc/mm/pgtable_32.c | 7 ++-
  1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index dcf5ecca19d9..13c798308c2e 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -33,8 +33,6 @@
  #include 
-extern char etext[], _stext[], _sinittext[], _einittext[];
-
  static u8 early_fixmap_pagetable[FIXMAP_PTE_SIZE] __page_aligned_data;
  notrace void __init early_ioremap_init(void)
@@ -104,14 +102,13 @@ static void __init __mapin_ram_chunk(unsigned 
long offset, unsigned long top)

  {
  unsigned long v, s;
  phys_addr_t p;
-    int ktext;
+    bool ktext;
  s = offset;
  v = PAGE_OFFSET + s;
  p = memstart_addr + s;
  for (; s < top; s += PAGE_SIZE) {
-    ktext = ((char *)v >= _stext && (char *)v < etext) ||
-    ((char *)v >= _sinittext && (char *)v < _einittext);
+    ktext = (is_kernel_text(v) || is_kernel_inittext(v));


I think we could use core_kernel_text() instead.

Indead. oops, sorry for the build error, will update, thanks.



Build failure on mpc885_ads_defconfig

arch/powerpc/mm/pgtable_32.c: In function '__mapin_ram_chunk':
arch/powerpc/mm/pgtable_32.c:111:26: error: implicit declaration of 
function 'is_kernel_text'; did you mean 'is_kernel_inittext'? 
[-Werror=implicit-function-declaration]
   111 | ktext = (is_kernel_text(v) || 
is_kernel_inittext(v));

   |  ^~
   |  is_kernel_inittext
cc1: all warnings being treated as errors
make[2]: *** [scripts/Makefile.build:277: arch/powerpc/mm/pgtable_32.o] 
Error 1

make[1]: *** [scripts/Makefile.build:540: arch/powerpc/mm] Error 2
make: *** [Makefile:1868: arch/powerpc] Error 2


.


Re: [PATCH v2 4/4] powerpc/pseries/cpuhp: remove obsolete comment from pseries_cpu_die

2021-09-28 Thread Daniel Henrique Barboza




On 9/27/21 17:19, Nathan Lynch wrote:

This comment likely refers to the obsolete DLPAR workflow where some
resource state transitions were driven more directly from user space
utilities, but it also seems to contradict itself: "Change isolate state to
Isolate [...]" is at odds with the preceding sentences, and it does not
relate at all to the code that follows.



This comment was added by commit 413f7c405a34, a 2006 commit where Mike
Ellerman moved code from platform/pseries/smp.c into hotplug-cpu.c.

I checked the original code back in smp.c and this comment was added there
by commit 1da177e4c3f41, which is Linus's initial git commit, where he mentions
that he didn't bother with full history (although it is available somewhere,
allegedly).

This is enough to say that we can't easily see the history behind this comment.
I also believe that we're better off without it since it doesn't make sense
with the current codebase.


Reviewed-by: Daniel Henrique Barboza 



Remove it to prevent confusion.

Signed-off-by: Nathan Lynch 
---
  arch/powerpc/platforms/pseries/hotplug-cpu.c | 5 -
  1 file changed, 5 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c 
b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index b50f3e9aa259..5ab44600c8d3 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -137,11 +137,6 @@ static void pseries_cpu_die(unsigned int cpu)
cpu, pcpu);
}
  
-	/* Isolation and deallocation are definitely done by

-* drslot_chrp_cpu.  If they were not they would be
-* done here.  Change isolate state to Isolate and
-* change allocation-state to Unusable.
-*/
paca_ptrs[cpu]->cpu_start = 0;
  }
  



Re: [RFC PATCH 7/8] riscv: rely on core code to keep thread_info::cpu updated

2021-09-28 Thread Palmer Dabbelt

On Tue, 14 Sep 2021 05:10:35 PDT (-0700), a...@kernel.org wrote:

Now that the core code switched back to using thread_info::cpu to keep
a task's CPU number, we no longer need to keep it in sync explicitly. So
just drop the code that does this.

Signed-off-by: Ard Biesheuvel 
---
 arch/riscv/kernel/asm-offsets.c | 1 -
 arch/riscv/kernel/entry.S   | 5 -
 arch/riscv/kernel/head.S| 1 -
 3 files changed, 7 deletions(-)

diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index 90f8ce64fa6f..478d9f02dab5 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -33,7 +33,6 @@ void asm_offsets(void)
OFFSET(TASK_TI_PREEMPT_COUNT, task_struct, thread_info.preempt_count);
OFFSET(TASK_TI_KERNEL_SP, task_struct, thread_info.kernel_sp);
OFFSET(TASK_TI_USER_SP, task_struct, thread_info.user_sp);
-   OFFSET(TASK_TI_CPU, task_struct, thread_info.cpu);

OFFSET(TASK_THREAD_F0,  task_struct, thread.fstate.f[0]);
OFFSET(TASK_THREAD_F1,  task_struct, thread.fstate.f[1]);
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 98f502654edd..459eb1714353 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -544,11 +544,6 @@ ENTRY(__switch_to)
REG_L s9,  TASK_THREAD_S9_RA(a4)
REG_L s10, TASK_THREAD_S10_RA(a4)
REG_L s11, TASK_THREAD_S11_RA(a4)
-   /* Swap the CPU entry around. */
-   lw a3, TASK_TI_CPU(a0)
-   lw a4, TASK_TI_CPU(a1)
-   sw a3, TASK_TI_CPU(a1)
-   sw a4, TASK_TI_CPU(a0)
/* The offset of thread_info in task_struct is zero. */
move tp, a1
ret
diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
index fce5184b22c3..d5ec30ef6f5d 100644
--- a/arch/riscv/kernel/head.S
+++ b/arch/riscv/kernel/head.S
@@ -317,7 +317,6 @@ clear_bss_done:
call setup_trap_vector
/* Restore C environment */
la tp, init_task
-   sw zero, TASK_TI_CPU(tp)
la sp, init_thread_union + THREAD_SIZE

 #ifdef CONFIG_KASAN


Acked-by: Palmer Dabbelt 

Thanks!


[PATCH v2 0/2] powerpc/paravirt: vcpu_is_preempted() tweaks

2021-09-28 Thread Nathan Lynch
Minor changes arising from discovering that this code throws warnings with
DEBUG_PREEMPT kernels.

Changes since v1:
* Additional commentary to (1) distinguish hypervisor dispatch and preempt
  behavior from kernel scheduler preemption; and (2) more clearly justify
  the use of raw_smp_processor_id().
* Additional patch to update existing comments before making the functional
  change.

v1: 
https://lore.kernel.org/linuxppc-dev/20210921031213.2029824-1-nath...@linux.ibm.com/

Nathan Lynch (2):
  powerpc/paravirt: vcpu_is_preempted() commentary
  powerpc/paravirt: correct preempt debug splat in vcpu_is_preempted()

 arch/powerpc/include/asm/paravirt.h | 40 +
 1 file changed, 35 insertions(+), 5 deletions(-)

-- 
2.31.1



[PATCH v2 2/2] powerpc/paravirt: correct preempt debug splat in vcpu_is_preempted()

2021-09-28 Thread Nathan Lynch
vcpu_is_preempted() can be used outside of preempt-disabled critical
sections, yielding warnings such as:

BUG: using smp_processor_id() in preemptible [] code: systemd-udevd/185
caller is rwsem_spin_on_owner+0x1cc/0x2d0
CPU: 1 PID: 185 Comm: systemd-udevd Not tainted 5.15.0-rc2+ #33
Call Trace:
[c00012907ac0] [c0aa30a8] dump_stack_lvl+0xac/0x108 (unreliable)
[c00012907b00] [c1371f70] check_preemption_disabled+0x150/0x160
[c00012907b90] [c01e0e8c] rwsem_spin_on_owner+0x1cc/0x2d0
[c00012907be0] [c01e1408] rwsem_down_write_slowpath+0x478/0x9a0
[c00012907ca0] [c0576cf4] filename_create+0x94/0x1e0
[c00012907d10] [c057ac08] do_symlinkat+0x68/0x1a0
[c00012907d70] [c057ae18] sys_symlink+0x58/0x70
[c00012907da0] [c002e448] system_call_exception+0x198/0x3c0
[c00012907e10] [c000c54c] system_call_common+0xec/0x250

The result of vcpu_is_preempted() is always used speculatively, and the
function does not access per-cpu resources in a (Linux) preempt-unsafe way.
Use raw_smp_processor_id() to avoid such warnings, adding explanatory
comments.

Signed-off-by: Nathan Lynch 
Fixes: ca3f969dcb11 ("powerpc/paravirt: Use is_kvm_guest() in 
vcpu_is_preempted()")
---
 arch/powerpc/include/asm/paravirt.h | 18 +-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/paravirt.h 
b/arch/powerpc/include/asm/paravirt.h
index 39f173961f6a..eb7df559ae74 100644
--- a/arch/powerpc/include/asm/paravirt.h
+++ b/arch/powerpc/include/asm/paravirt.h
@@ -110,7 +110,23 @@ static inline bool vcpu_is_preempted(int cpu)
 
 #ifdef CONFIG_PPC_SPLPAR
if (!is_kvm_guest()) {
-   int first_cpu = cpu_first_thread_sibling(smp_processor_id());
+   int first_cpu;
+
+   /*
+* The result of vcpu_is_preempted() is used in a
+* speculative way, and is always subject to invalidation
+* by events internal and external to Linux. While we can
+* be called in preemptable context (in the Linux sense),
+* we're not accessing per-cpu resources in a way that can
+* race destructively with Linux scheduler preemption and
+* migration, and callers can tolerate the potential for
+* error introduced by sampling the CPU index without
+* pinning the task to it. So it is permissible to use
+* raw_smp_processor_id() here to defeat the preempt debug
+* warnings that can arise from using smp_processor_id()
+* in arbitrary contexts.
+*/
+   first_cpu = cpu_first_thread_sibling(raw_smp_processor_id());
 
/*
 * The PowerVM hypervisor dispatches VMs on a whole core
-- 
2.31.1



[PATCH v2 1/2] powerpc/paravirt: vcpu_is_preempted() commentary

2021-09-28 Thread Nathan Lynch
Add comments more clearly documenting that this function determines whether
hypervisor-level preemption of the VM has occurred.

Signed-off-by: Nathan Lynch 
---
 arch/powerpc/include/asm/paravirt.h | 22 ++
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/paravirt.h 
b/arch/powerpc/include/asm/paravirt.h
index bcb7b5f917be..39f173961f6a 100644
--- a/arch/powerpc/include/asm/paravirt.h
+++ b/arch/powerpc/include/asm/paravirt.h
@@ -21,7 +21,7 @@ static inline bool is_shared_processor(void)
return static_branch_unlikely(&shared_processor);
 }
 
-/* If bit 0 is set, the cpu has been preempted */
+/* If bit 0 is set, the cpu has been ceded, conferred, or preempted */
 static inline u32 yield_count_of(int cpu)
 {
__be32 yield_count = READ_ONCE(lppaca_of(cpu).yield_count);
@@ -92,6 +92,19 @@ static inline void prod_cpu(int cpu)
 #define vcpu_is_preempted vcpu_is_preempted
 static inline bool vcpu_is_preempted(int cpu)
 {
+   /*
+* The dispatch/yield bit alone is an imperfect indicator of
+* whether the hypervisor has dispatched @cpu to run on a physical
+* processor. When it is clear, @cpu is definitely not preempted.
+* But when it is set, it means only that it *might* be, subject to
+* other conditions. So we check other properties of the VM and
+* @cpu first, resorting to the yield count last.
+*/
+
+   /*
+* Hypervisor preemption isn't possible in dedicated processor
+* mode by definition.
+*/
if (!is_shared_processor())
return false;
 
@@ -100,9 +113,10 @@ static inline bool vcpu_is_preempted(int cpu)
int first_cpu = cpu_first_thread_sibling(smp_processor_id());
 
/*
-* Preemption can only happen at core granularity. This CPU
-* is not preempted if one of the CPU of this core is not
-* preempted.
+* The PowerVM hypervisor dispatches VMs on a whole core
+* basis. So we know that a thread sibling of the local CPU
+* cannot have been preempted by the hypervisor, even if it
+* has called H_CONFER, which will set the yield bit.
 */
if (cpu_first_thread_sibling(cpu) == first_cpu)
return false;
-- 
2.31.1



Re: [PATCH v4 0/8] Implement generic cc_platform_has() helper function

2021-09-28 Thread Borislav Petkov
On Tue, Sep 28, 2021 at 02:01:57PM -0700, Kuppuswamy, Sathyanarayanan wrote:
> Yes. But, since the check is related to TDX, I just want to confirm whether
> you are fine with naming the function as intel_*().

Why is this such a big of a deal?!

There's amd_cc_platform_has() and intel_cc_platform_has() will be the
corresponding Intel version.

> Since this patch is going to have dependency on TDX code, I will include
> this patch in TDX patch set.

Ok.

-- 
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette


Re: [PATCH v4 4/8] PCI: replace pci_dev::driver usage that gets the driver name

2021-09-28 Thread Simon Horman
On Mon, Sep 27, 2021 at 10:43:22PM +0200, Uwe Kleine-König wrote:
> From: Uwe Kleine-König 
> 
> struct pci_dev::driver holds (apart from a constant offset) the same
> data as struct pci_dev::dev->driver. With the goal to remove struct
> pci_dev::driver to get rid of data duplication replace getting the
> driver name by dev_driver_string() which implicitly makes use of struct
> pci_dev::dev->driver.
> 
> Signed-off-by: Uwe Kleine-König 

...

> diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c 
> b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
> index 0685ece1f155..23dfb599c828 100644
> --- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
> +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
> @@ -202,7 +202,7 @@ nfp_get_drvinfo(struct nfp_app *app, struct pci_dev *pdev,
>  {
>   char nsp_version[ETHTOOL_FWVERS_LEN] = {};
>  
> - strlcpy(drvinfo->driver, pdev->driver->name, sizeof(drvinfo->driver));
> + strlcpy(drvinfo->driver, dev_driver_string(&pdev->dev), 
> sizeof(drvinfo->driver));

I'd slightly prefer to maintain lines under 80 columns wide.
But not nearly strongly enough to engage in a long debate about it.

In any case, for the NFP portion of this patch.

Acked-by: Simon Horman 

>   nfp_net_get_nspinfo(app, nsp_version);
>   snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
>"%s %s %s %s", vnic_version, nsp_version,

...


Re: [PATCH v4 0/8] Implement generic cc_platform_has() helper function

2021-09-28 Thread Kuppuswamy, Sathyanarayanan




On 9/28/21 1:58 PM, Borislav Petkov wrote:

On Tue, Sep 28, 2021 at 01:48:46PM -0700, Kuppuswamy, Sathyanarayanan wrote:

Just read it. If you want to use cpuid_has_tdx_guest() directly in
cc_platform_has(), then you want to rename intel_cc_platform_has() to
tdx_cc_platform_has()?


Why?

You simply do:

if (cpuid_has_tdx_guest())
intel_cc_platform_has(...);

and lemme paste from that mail: " ...you should use
cpuid_has_tdx_guest() instead but cache its result so that you don't
call CPUID each time the kernel executes cc_platform_has()."

Makes sense?


Yes. But, since the check is related to TDX, I just want to confirm whether
you are fine with naming the function as intel_*().

Since this patch is going to have dependency on TDX code, I will include
this patch in TDX patch set.





--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer


Re: [PATCH v4 0/8] Implement generic cc_platform_has() helper function

2021-09-28 Thread Borislav Petkov
On Tue, Sep 28, 2021 at 01:48:46PM -0700, Kuppuswamy, Sathyanarayanan wrote:
> Just read it. If you want to use cpuid_has_tdx_guest() directly in
> cc_platform_has(), then you want to rename intel_cc_platform_has() to
> tdx_cc_platform_has()?

Why?

You simply do:

if (cpuid_has_tdx_guest())
intel_cc_platform_has(...);

and lemme paste from that mail: " ...you should use
cpuid_has_tdx_guest() instead but cache its result so that you don't
call CPUID each time the kernel executes cc_platform_has()."

Makes sense?

-- 
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette


Re: [PATCH v4 0/8] Implement generic cc_platform_has() helper function

2021-09-28 Thread Kuppuswamy, Sathyanarayanan




On 9/28/21 1:31 PM, Borislav Petkov wrote:

On Tue, Sep 28, 2021 at 12:19:49PM -0700, Kuppuswamy, Sathyanarayanan wrote:

Intel CC support patch is not included in this series. You want me
to address the issue raised by Joerg before merging it?


Did you not see my email to you today:

https://lkml.kernel.org/r/yvl4zughfsh1q...@zn.tnic


Just read it. If you want to use cpuid_has_tdx_guest() directly in
cc_platform_has(), then you want to rename intel_cc_platform_has() to
tdx_cc_platform_has()?



?



--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer


Re: [PATCH v4 0/8] Implement generic cc_platform_has() helper function

2021-09-28 Thread Borislav Petkov
On Tue, Sep 28, 2021 at 12:19:49PM -0700, Kuppuswamy, Sathyanarayanan wrote:
> Intel CC support patch is not included in this series. You want me
> to address the issue raised by Joerg before merging it?

Did you not see my email to you today:

https://lkml.kernel.org/r/yvl4zughfsh1q...@zn.tnic

?

-- 
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette


Re: [PATCH v4 4/8] PCI: replace pci_dev::driver usage that gets the driver name

2021-09-28 Thread Bjorn Helgaas
On Tue, Sep 28, 2021 at 09:29:36PM +0200, Uwe Kleine-König wrote:
> On Tue, Sep 28, 2021 at 12:17:59PM -0500, Bjorn Helgaas wrote:
> > [+to Oliver, Russell for eeh_driver_name() question below]
> > 
> > On Mon, Sep 27, 2021 at 10:43:22PM +0200, Uwe Kleine-König wrote:
> > > From: Uwe Kleine-König 
> > > 
> > > struct pci_dev::driver holds (apart from a constant offset) the same
> > > data as struct pci_dev::dev->driver. With the goal to remove struct
> > > pci_dev::driver to get rid of data duplication replace getting the
> > > driver name by dev_driver_string() which implicitly makes use of struct
> > > pci_dev::dev->driver.

> > Also, would you mind using "pci_dev.driver" instead of
> > "pci_dev::driver"?  AFAIK, the "::" operator is not actually part of
> > C, so I think it's more confusing than useful.
> 
> pci_dev.driver doesn't work either in C because pci_dev is a type and
> not a variable.

Sure, "pci_dev.driver" is not strictly acceptable C unless you have a
"struct pci_dev pci_dev", but it's pretty common.


Re: [PATCH 02/10] i2c: pasemi: Use io{read,write}32

2021-09-28 Thread Sven Peter
On Mon, Sep 27, 2021, at 09:39, Arnd Bergmann wrote:
> On Sun, Sep 26, 2021 at 12:00 PM Sven Peter  wrote:
>>
>> In preparation for splitting this driver up into a platform_driver
>> and a pci_driver, replace outl/inl usage with ioport_map and
>> ioread32/iowrite32.
>>
>> Signed-off-by: Sven Peter 
>>
>> +   smbus->ioaddr = ioport_map(smbus->base, smbus->size);
>> +   if (!smbus->ioaddr) {
>> +   error = -EBUSY;
>> +   goto out_release_region;
>> +   }
>
> While this works, I would suggest using the more regular pci_iomap()
> or pcim_iomap() helper to turn the port number into an __iomem token.

Thanks a lot for the review!

I'll replace it with pci_iomap here and then later in this series with
pcim_iomap when also switching the rest to devres.


Thanks,

Sven



Re: [PATCH v4 4/8] PCI: replace pci_dev::driver usage that gets the driver name

2021-09-28 Thread Uwe Kleine-König
Hello,

On Tue, Sep 28, 2021 at 12:17:59PM -0500, Bjorn Helgaas wrote:
> [+to Oliver, Russell for eeh_driver_name() question below]
> 
> On Mon, Sep 27, 2021 at 10:43:22PM +0200, Uwe Kleine-König wrote:
> > From: Uwe Kleine-König 
> > 
> > struct pci_dev::driver holds (apart from a constant offset) the same
> > data as struct pci_dev::dev->driver. With the goal to remove struct
> > pci_dev::driver to get rid of data duplication replace getting the
> > driver name by dev_driver_string() which implicitly makes use of struct
> > pci_dev::dev->driver.
> 
> When you repost to fix the build issue, can you capitalize the subject
> line to match the other?

Yes, sure.

> Also, would you mind using "pci_dev.driver" instead of
> "pci_dev::driver"?  AFAIK, the "::" operator is not actually part of
> C, so I think it's more confusing than useful.

pci_dev.driver doesn't work either in C because pci_dev is a type and
not a variable. This is probably subjective, but for me pci_dev.driver
looks definitely stranger than pci_dev::driver. And :: is at least not
unseen in the kernel commit logs. (git log --grep=::)
But if you insist I can change to .

> > diff --git a/arch/powerpc/include/asm/ppc-pci.h 
> > b/arch/powerpc/include/asm/ppc-pci.h
> > index 2b9edbf6e929..e8f1795a2acf 100644
> > --- a/arch/powerpc/include/asm/ppc-pci.h
> > +++ b/arch/powerpc/include/asm/ppc-pci.h
> > @@ -57,7 +57,14 @@ void eeh_sysfs_remove_device(struct pci_dev *pdev);
> >  
> >  static inline const char *eeh_driver_name(struct pci_dev *pdev)
> >  {
> > -   return (pdev && pdev->driver) ? pdev->driver->name : "";
> > +   if (pdev) {
> > +   const char *drvstr = dev_driver_string(&pdev->dev);
> > +
> > +   if (strcmp(drvstr, ""))
> > +   return drvstr;
> > +   }
> > +
> > +   return "";
> 
> Can we just do this?
> 
>   if (pdev)
> return dev_driver_string(&pdev->dev);
> 
>   return "";

Works for me, too. It behaves a bit differently than my suggestion
(which behaves nearly identically to the status quo), but only in some
degenerate cases.

> I think it's more complicated than it's worth to include a strcmp().
> It's possible this will change those error messages about "Might be
> infinite loop in %s driver", but that doesn't seem like a huge deal.
> 
> I moved Oliver to "to:" and added Russell in case they object.
> 
> >  }
> >  
> >  #endif /* CONFIG_EEH */
> > diff --git a/drivers/bcma/host_pci.c b/drivers/bcma/host_pci.c
> > index 69c10a7b7c61..0973022d4b13 100644
> > --- a/drivers/bcma/host_pci.c
> > +++ b/drivers/bcma/host_pci.c
> > @@ -175,9 +175,10 @@ static int bcma_host_pci_probe(struct pci_dev *dev,
> > if (err)
> > goto err_kfree_bus;
> >  
> > -   name = dev_name(&dev->dev);
> > -   if (dev->driver && dev->driver->name)
> > -   name = dev->driver->name;
> > +   name = dev_driver_string(&dev->dev);
> > +   if (!strcmp(name, ""))
> > +   name = dev_name(&dev->dev);
> > err = pci_request_regions(dev, name);
> 
> Again seems more complicated than it's worth to me.  This is in the
> driver's .probe() method, so really_probe() has already set
> "dev->driver = drv", which means dev->driver is always set to
> &bcma_pci_bridge_driver here, and bcma_pci_bridge_driver.name is
> always "bcma-pci-bridge".
> 
> Almost all callers of pci_request_regions() just hardcode the driver
> name or use a DRV_NAME #define
> 
> So I think we should just do:
> 
>   err = pci_request_regions(dev, "bcma-pci-bridge");

Yes, looks right. I'd put this in a separate patch.

> > if (err)
> > goto err_pci_disable;
> > [...]
> > diff --git a/drivers/ssb/pcihost_wrapper.c b/drivers/ssb/pcihost_wrapper.c
> > index 410215c16920..4938ed5cfae5 100644
> > --- a/drivers/ssb/pcihost_wrapper.c
> > +++ b/drivers/ssb/pcihost_wrapper.c
> > @@ -78,9 +78,11 @@ static int ssb_pcihost_probe(struct pci_dev *dev,
> > err = pci_enable_device(dev);
> > if (err)
> > goto err_kfree_ssb;
> > -   name = dev_name(&dev->dev);
> > -   if (dev->driver && dev->driver->name)
> > -   name = dev->driver->name;
> > +
> > +   name = dev_driver_string(&dev->dev);
> > +   if (*name == '\0')
> > +   name = dev_name(&dev->dev);
> > +
> > err = pci_request_regions(dev, name);
> 
> Also seems like more trouble than it's worth.  This one is a little
> strange but is always called for either b43_pci_bridge_driver or
> b44_pci_driver, both of which have .name set, so I think we should
> simply do:
> 
>   err = pci_request_regions(dev, dev_driver_string(&dev->dev));

yes, agreed, too.

Best regards
Uwe

-- 
Pengutronix e.K.   | Uwe Kleine-König|
Industrial Linux Solutions | https://www.pengutronix.de/ |


signature.asc
Description: PGP signature


Re: [PATCH v4 0/8] Implement generic cc_platform_has() helper function

2021-09-28 Thread Kuppuswamy, Sathyanarayanan




On 9/28/21 12:10 PM, Borislav Petkov wrote:

From: Borislav Petkov 

Hi all,

here's v4 of the cc_platform_has() patchset with feedback incorporated.

I'm going to route this through tip if there are no objections.


Intel CC support patch is not included in this series. You want me
to address the issue raised by Joerg before merging it?



Thx.

Tom Lendacky (8):
   x86/ioremap: Selectively build arch override encryption functions
   arch/cc: Introduce a function to check for confidential computing
 features
   x86/sev: Add an x86 version of cc_platform_has()
   powerpc/pseries/svm: Add a powerpc version of cc_platform_has()
   x86/sme: Replace occurrences of sme_active() with cc_platform_has()
   x86/sev: Replace occurrences of sev_active() with cc_platform_has()
   x86/sev: Replace occurrences of sev_es_active() with cc_platform_has()
   treewide: Replace the use of mem_encrypt_active() with
 cc_platform_has()

  arch/Kconfig |  3 +
  arch/powerpc/include/asm/mem_encrypt.h   |  5 --
  arch/powerpc/platforms/pseries/Kconfig   |  1 +
  arch/powerpc/platforms/pseries/Makefile  |  2 +
  arch/powerpc/platforms/pseries/cc_platform.c | 26 ++
  arch/powerpc/platforms/pseries/svm.c |  5 +-
  arch/s390/include/asm/mem_encrypt.h  |  2 -
  arch/x86/Kconfig |  1 +
  arch/x86/include/asm/io.h|  8 ++
  arch/x86/include/asm/kexec.h |  2 +-
  arch/x86/include/asm/mem_encrypt.h   | 12 +--
  arch/x86/kernel/Makefile |  6 ++
  arch/x86/kernel/cc_platform.c| 69 +++
  arch/x86/kernel/crash_dump_64.c  |  4 +-
  arch/x86/kernel/head64.c |  9 +-
  arch/x86/kernel/kvm.c|  3 +-
  arch/x86/kernel/kvmclock.c   |  4 +-
  arch/x86/kernel/machine_kexec_64.c   | 19 +++--
  arch/x86/kernel/pci-swiotlb.c|  9 +-
  arch/x86/kernel/relocate_kernel_64.S |  2 +-
  arch/x86/kernel/sev.c|  6 +-
  arch/x86/kvm/svm/svm.c   |  3 +-
  arch/x86/mm/ioremap.c| 18 ++--
  arch/x86/mm/mem_encrypt.c| 55 
  arch/x86/mm/mem_encrypt_identity.c   |  9 +-
  arch/x86/mm/pat/set_memory.c |  3 +-
  arch/x86/platform/efi/efi_64.c   |  9 +-
  arch/x86/realmode/init.c |  8 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  4 +-
  drivers/gpu/drm/drm_cache.c  |  4 +-
  drivers/gpu/drm/vmwgfx/vmwgfx_drv.c  |  4 +-
  drivers/gpu/drm/vmwgfx/vmwgfx_msg.c  |  6 +-
  drivers/iommu/amd/init.c |  7 +-
  drivers/iommu/amd/iommu.c|  3 +-
  drivers/iommu/amd/iommu_v2.c |  3 +-
  drivers/iommu/iommu.c|  3 +-
  fs/proc/vmcore.c |  6 +-
  include/linux/cc_platform.h  | 88 
  include/linux/mem_encrypt.h  |  4 -
  kernel/dma/swiotlb.c |  4 +-
  40 files changed, 310 insertions(+), 129 deletions(-)
  create mode 100644 arch/powerpc/platforms/pseries/cc_platform.c
  create mode 100644 arch/x86/kernel/cc_platform.c
  create mode 100644 include/linux/cc_platform.h



--
Sathyanarayanan Kuppuswamy
Linux Kernel Developer


[PATCH 8/8] treewide: Replace the use of mem_encrypt_active() with cc_platform_has()

2021-09-28 Thread Borislav Petkov
From: Tom Lendacky 

Replace uses of mem_encrypt_active() with calls to cc_platform_has() with
the CC_ATTR_MEM_ENCRYPT attribute.

Remove the implementation of mem_encrypt_active() across all arches.

For s390, since the default implementation of the cc_platform_has()
matches the s390 implementation of mem_encrypt_active(), cc_platform_has()
does not need to be implemented in s390 (the config option
ARCH_HAS_CC_PLATFORM is not set).

Signed-off-by: Tom Lendacky 
Signed-off-by: Borislav Petkov 
---
 arch/powerpc/include/asm/mem_encrypt.h  | 5 -
 arch/powerpc/platforms/pseries/svm.c| 5 +++--
 arch/s390/include/asm/mem_encrypt.h | 2 --
 arch/x86/include/asm/mem_encrypt.h  | 5 -
 arch/x86/kernel/head64.c| 9 +++--
 arch/x86/mm/ioremap.c   | 4 ++--
 arch/x86/mm/mem_encrypt.c   | 2 +-
 arch/x86/mm/pat/set_memory.c| 3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 +++-
 drivers/gpu/drm/drm_cache.c | 4 ++--
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.c | 4 ++--
 drivers/gpu/drm/vmwgfx/vmwgfx_msg.c | 6 +++---
 drivers/iommu/amd/iommu.c   | 3 ++-
 drivers/iommu/amd/iommu_v2.c| 3 ++-
 drivers/iommu/iommu.c   | 3 ++-
 fs/proc/vmcore.c| 6 +++---
 include/linux/mem_encrypt.h | 4 
 kernel/dma/swiotlb.c| 4 ++--
 18 files changed, 36 insertions(+), 40 deletions(-)

diff --git a/arch/powerpc/include/asm/mem_encrypt.h 
b/arch/powerpc/include/asm/mem_encrypt.h
index ba9dab07c1be..2f26b8fc8d29 100644
--- a/arch/powerpc/include/asm/mem_encrypt.h
+++ b/arch/powerpc/include/asm/mem_encrypt.h
@@ -10,11 +10,6 @@
 
 #include 
 
-static inline bool mem_encrypt_active(void)
-{
-   return is_secure_guest();
-}
-
 static inline bool force_dma_unencrypted(struct device *dev)
 {
return is_secure_guest();
diff --git a/arch/powerpc/platforms/pseries/svm.c 
b/arch/powerpc/platforms/pseries/svm.c
index 87f001b4c4e4..c083ecbbae4d 100644
--- a/arch/powerpc/platforms/pseries/svm.c
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -8,6 +8,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -63,7 +64,7 @@ void __init svm_swiotlb_init(void)
 
 int set_memory_encrypted(unsigned long addr, int numpages)
 {
-   if (!mem_encrypt_active())
+   if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT))
return 0;
 
if (!PAGE_ALIGNED(addr))
@@ -76,7 +77,7 @@ int set_memory_encrypted(unsigned long addr, int numpages)
 
 int set_memory_decrypted(unsigned long addr, int numpages)
 {
-   if (!mem_encrypt_active())
+   if (!cc_platform_has(CC_ATTR_MEM_ENCRYPT))
return 0;
 
if (!PAGE_ALIGNED(addr))
diff --git a/arch/s390/include/asm/mem_encrypt.h 
b/arch/s390/include/asm/mem_encrypt.h
index 2542cbf7e2d1..08a8b96606d7 100644
--- a/arch/s390/include/asm/mem_encrypt.h
+++ b/arch/s390/include/asm/mem_encrypt.h
@@ -4,8 +4,6 @@
 
 #ifndef __ASSEMBLY__
 
-static inline bool mem_encrypt_active(void) { return false; }
-
 int set_memory_encrypted(unsigned long addr, int numpages);
 int set_memory_decrypted(unsigned long addr, int numpages);
 
diff --git a/arch/x86/include/asm/mem_encrypt.h 
b/arch/x86/include/asm/mem_encrypt.h
index da14ede311aa..2d4f5c17d79c 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -96,11 +96,6 @@ static inline void mem_encrypt_free_decrypted_mem(void) { }
 
 extern char __start_bss_decrypted[], __end_bss_decrypted[], 
__start_bss_decrypted_unused[];
 
-static inline bool mem_encrypt_active(void)
-{
-   return sme_me_mask;
-}
-
 static inline u64 sme_get_me_mask(void)
 {
return sme_me_mask;
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index de01903c3735..fc5371a7e9d1 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -19,7 +19,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
 
 #include 
@@ -284,8 +284,13 @@ unsigned long __head __startup_64(unsigned long physaddr,
 * The bss section will be memset to zero later in the initialization so
 * there is no need to zero it after changing the memory encryption
 * attribute.
+*
+* This is early code, use an open coded check for SME instead of
+* using cc_platform_has(). This eliminates worries about removing
+* instrumentation or checking boot_cpu_data in the cc_platform_has()
+* function.
 */
-   if (mem_encrypt_active()) {
+   if (sme_get_me_mask()) {
vaddr = (unsigned long)__start_bss_decrypted;
vaddr_end = (unsigned long)__end_bss_decrypted;
for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index b59a5cbc6bc5..026031b3b782 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -694,7 +694,7 @@ static bool __init 

[PATCH 5/8] x86/sme: Replace occurrences of sme_active() with cc_platform_has()

2021-09-28 Thread Borislav Petkov
From: Tom Lendacky 

Replace uses of sme_active() with the more generic cc_platform_has()
using CC_ATTR_HOST_MEM_ENCRYPT. If future support is added for other
memory encryption technologies, the use of CC_ATTR_HOST_MEM_ENCRYPT
can be updated, as required.

This also replaces two usages of sev_active() that are really geared
towards detecting if SME is active.

Signed-off-by: Tom Lendacky 
Signed-off-by: Borislav Petkov 
---
 arch/x86/include/asm/kexec.h |  2 +-
 arch/x86/include/asm/mem_encrypt.h   |  2 --
 arch/x86/kernel/machine_kexec_64.c   | 15 ---
 arch/x86/kernel/pci-swiotlb.c|  9 -
 arch/x86/kernel/relocate_kernel_64.S |  2 +-
 arch/x86/mm/ioremap.c|  6 +++---
 arch/x86/mm/mem_encrypt.c| 13 -
 arch/x86/mm/mem_encrypt_identity.c   |  9 -
 arch/x86/realmode/init.c |  5 +++--
 drivers/iommu/amd/init.c |  7 ---
 10 files changed, 36 insertions(+), 34 deletions(-)

diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 0a6e34b07017..11b7c06e2828 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -129,7 +129,7 @@ relocate_kernel(unsigned long indirection_page,
unsigned long page_list,
unsigned long start_address,
unsigned int preserve_context,
-   unsigned int sme_active);
+   unsigned int host_mem_enc_active);
 #endif
 
 #define ARCH_HAS_KIMAGE_ARCH
diff --git a/arch/x86/include/asm/mem_encrypt.h 
b/arch/x86/include/asm/mem_encrypt.h
index 3fb9f5ebefa4..63c5b99ccae5 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -51,7 +51,6 @@ void __init mem_encrypt_free_decrypted_mem(void);
 void __init mem_encrypt_init(void);
 
 void __init sev_es_init_vc_handling(void);
-bool sme_active(void);
 bool sev_active(void);
 bool sev_es_active(void);
 
@@ -76,7 +75,6 @@ static inline void __init sme_encrypt_kernel(struct 
boot_params *bp) { }
 static inline void __init sme_enable(struct boot_params *bp) { }
 
 static inline void sev_es_init_vc_handling(void) { }
-static inline bool sme_active(void) { return false; }
 static inline bool sev_active(void) { return false; }
 static inline bool sev_es_active(void) { return false; }
 
diff --git a/arch/x86/kernel/machine_kexec_64.c 
b/arch/x86/kernel/machine_kexec_64.c
index 131f30fdcfbd..7040c0fa921c 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -358,7 +359,7 @@ void machine_kexec(struct kimage *image)
   (unsigned long)page_list,
   image->start,
   image->preserve_context,
-  sme_active());
+  
cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT));
 
 #ifdef CONFIG_KEXEC_JUMP
if (image->preserve_context)
@@ -569,12 +570,12 @@ void arch_kexec_unprotect_crashkres(void)
  */
 int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
 {
-   if (sev_active())
+   if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
return 0;
 
/*
-* If SME is active we need to be sure that kexec pages are
-* not encrypted because when we boot to the new kernel the
+* If host memory encryption is active we need to be sure that kexec
+* pages are not encrypted because when we boot to the new kernel the
 * pages won't be accessed encrypted (initially).
 */
return set_memory_decrypted((unsigned long)vaddr, pages);
@@ -582,12 +583,12 @@ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int 
pages, gfp_t gfp)
 
 void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
 {
-   if (sev_active())
+   if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
return;
 
/*
-* If SME is active we need to reset the pages back to being
-* an encrypted mapping before freeing them.
+* If host memory encryption is active we need to reset the pages back
+* to being an encrypted mapping before freeing them.
 */
set_memory_encrypted((unsigned long)vaddr, pages);
 }
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index c2cfa5e7c152..814ab46a0dad 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -6,7 +6,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 
 #include 
 #include 
@@ -45,11 +45,10 @@ int __init pci_swiotlb_detect_4gb(void)
swiotlb = 1;
 
/*
-* If SME is active then swiotlb will be set to 1 so that bounce
-* buffers are allocated and used for devices that do not support
-* the addressing range required for the encryption mask.
+* 

[PATCH 7/8] x86/sev: Replace occurrences of sev_es_active() with cc_platform_has()

2021-09-28 Thread Borislav Petkov
From: Tom Lendacky 

Replace uses of sev_es_active() with the more generic cc_platform_has()
using CC_ATTR_GUEST_STATE_ENCRYPT. If future support is added for other
memory encryption technologies, the use of CC_ATTR_GUEST_STATE_ENCRYPT
can be updated, as required.

Signed-off-by: Tom Lendacky 
Signed-off-by: Borislav Petkov 
---
 arch/x86/include/asm/mem_encrypt.h |  2 --
 arch/x86/kernel/sev.c  |  6 +++---
 arch/x86/mm/mem_encrypt.c  | 24 +++-
 arch/x86/realmode/init.c   |  3 +--
 4 files changed, 7 insertions(+), 28 deletions(-)

diff --git a/arch/x86/include/asm/mem_encrypt.h 
b/arch/x86/include/asm/mem_encrypt.h
index a5a58ccd1ee3..da14ede311aa 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -51,7 +51,6 @@ void __init mem_encrypt_free_decrypted_mem(void);
 void __init mem_encrypt_init(void);
 
 void __init sev_es_init_vc_handling(void);
-bool sev_es_active(void);
 
 #define __bss_decrypted __section(".bss..decrypted")
 
@@ -74,7 +73,6 @@ static inline void __init sme_encrypt_kernel(struct 
boot_params *bp) { }
 static inline void __init sme_enable(struct boot_params *bp) { }
 
 static inline void sev_es_init_vc_handling(void) { }
-static inline bool sev_es_active(void) { return false; }
 
 static inline int __init
 early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 
0; }
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index a6895e440bc3..53a6837d354b 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -11,7 +11,7 @@
 
 #include  /* For show_regs() */
 #include 
-#include 
+#include 
 #include 
 #include 
 #include 
@@ -615,7 +615,7 @@ int __init sev_es_efi_map_ghcbs(pgd_t *pgd)
int cpu;
u64 pfn;
 
-   if (!sev_es_active())
+   if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
return 0;
 
pflags = _PAGE_NX | _PAGE_RW;
@@ -774,7 +774,7 @@ void __init sev_es_init_vc_handling(void)
 
BUILD_BUG_ON(offsetof(struct sev_es_runtime_data, ghcb_page) % 
PAGE_SIZE);
 
-   if (!sev_es_active())
+   if (!cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
return;
 
if (!sev_es_check_cpu_features())
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 932007a6913b..2d04c39bea1d 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -361,25 +361,6 @@ int __init early_set_memory_encrypted(unsigned long vaddr, 
unsigned long size)
return early_set_memory_enc_dec(vaddr, size, true);
 }
 
-/*
- * SME and SEV are very similar but they are not the same, so there are
- * times that the kernel will need to distinguish between SME and SEV. The
- * cc_platform_has() function is used for this.  When a distinction isn't
- * needed, the CC_ATTR_MEM_ENCRYPT attribute can be used.
- *
- * The trampoline code is a good example for this requirement.  Before
- * paging is activated, SME will access all memory as decrypted, but SEV
- * will access all memory as encrypted.  So, when APs are being brought
- * up under SME the trampoline area cannot be encrypted, whereas under SEV
- * the trampoline area must be encrypted.
- */
-
-/* Needs to be called from non-instrumentable code */
-bool noinstr sev_es_active(void)
-{
-   return sev_status & MSR_AMD64_SEV_ES_ENABLED;
-}
-
 /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */
 bool force_dma_unencrypted(struct device *dev)
 {
@@ -449,7 +430,7 @@ static void print_mem_encrypt_feature_info(void)
pr_cont(" SEV");
 
/* Encrypted Register State */
-   if (sev_es_active())
+   if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
pr_cont(" SEV-ES");
 
pr_cont("\n");
@@ -468,7 +449,8 @@ void __init mem_encrypt_init(void)
 * With SEV, we need to unroll the rep string I/O instructions,
 * but SEV-ES supports them through the #VC handler.
 */
-   if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) && !sev_es_active())
+   if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) &&
+   !cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
+   static_branch_enable(&sev_enable_key);
 
print_mem_encrypt_feature_info();
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index c878c5ee5a4c..4a3da7592b99 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -2,7 +2,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 
@@ -48,7 +47,7 @@ static void sme_sev_setup_real_mode(struct trampoline_header 
*th)
if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT))
th->flags |= TH_FLAGS_SME_ACTIVE;
 
-   if (sev_es_active()) {
+   if (cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT)) {
/*
 * Skip the call to verify_cpu() in secondary_startup_64 as it
 * will cause #VC exceptions when the AP can't handle them yet.

[PATCH 6/8] x86/sev: Replace occurrences of sev_active() with cc_platform_has()

2021-09-28 Thread Borislav Petkov
From: Tom Lendacky 

Replace uses of sev_active() with the more generic cc_platform_has()
using CC_ATTR_GUEST_MEM_ENCRYPT. If future support is added for other
memory encryption technologies, the use of CC_ATTR_GUEST_MEM_ENCRYPT
can be updated, as required.

Signed-off-by: Tom Lendacky 
Signed-off-by: Borislav Petkov 
---
 arch/x86/include/asm/mem_encrypt.h |  2 --
 arch/x86/kernel/crash_dump_64.c|  4 +++-
 arch/x86/kernel/kvm.c  |  3 ++-
 arch/x86/kernel/kvmclock.c |  4 ++--
 arch/x86/kernel/machine_kexec_64.c |  4 ++--
 arch/x86/kvm/svm/svm.c |  3 ++-
 arch/x86/mm/ioremap.c  |  6 +++---
 arch/x86/mm/mem_encrypt.c  | 21 -
 arch/x86/platform/efi/efi_64.c |  9 +
 9 files changed, 27 insertions(+), 29 deletions(-)

diff --git a/arch/x86/include/asm/mem_encrypt.h 
b/arch/x86/include/asm/mem_encrypt.h
index 63c5b99ccae5..a5a58ccd1ee3 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -51,7 +51,6 @@ void __init mem_encrypt_free_decrypted_mem(void);
 void __init mem_encrypt_init(void);
 
 void __init sev_es_init_vc_handling(void);
-bool sev_active(void);
 bool sev_es_active(void);
 
 #define __bss_decrypted __section(".bss..decrypted")
@@ -75,7 +74,6 @@ static inline void __init sme_encrypt_kernel(struct 
boot_params *bp) { }
 static inline void __init sme_enable(struct boot_params *bp) { }
 
 static inline void sev_es_init_vc_handling(void) { }
-static inline bool sev_active(void) { return false; }
 static inline bool sev_es_active(void) { return false; }
 
 static inline int __init
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 045e82e8945b..a7f617a3981d 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static ssize_t __copy_oldmem_page(unsigned long pfn, char *buf, size_t csize,
  unsigned long offset, int userbuf,
@@ -73,5 +74,6 @@ ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char 
*buf, size_t csize,
 
 ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
 {
-   return read_from_oldmem(buf, count, ppos, 0, sev_active());
+   return read_from_oldmem(buf, count, ppos, 0,
+   cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT));
 }
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index b656456c3a94..8863d1941f1b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -418,7 +419,7 @@ static void __init sev_map_percpu_data(void)
 {
int cpu;
 
-   if (!sev_active())
+   if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
return;
 
for_each_possible_cpu(cpu) {
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index ad273e5861c1..fc3930c5db1b 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -16,9 +16,9 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
-#include 
 #include 
 #include 
 
@@ -232,7 +232,7 @@ static void __init kvmclock_init_mem(void)
 * hvclock is shared between the guest and the hypervisor, must
 * be mapped decrypted.
 */
-   if (sev_active()) {
+   if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
r = set_memory_decrypted((unsigned long) hvclock_mem,
 1UL << order);
if (r) {
diff --git a/arch/x86/kernel/machine_kexec_64.c 
b/arch/x86/kernel/machine_kexec_64.c
index 7040c0fa921c..f5da4a18070a 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -167,7 +167,7 @@ static int init_transition_pgtable(struct kimage *image, 
pgd_t *pgd)
}
pte = pte_offset_kernel(pmd, vaddr);
 
-   if (sev_active())
+   if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT))
prot = PAGE_KERNEL_EXEC;
 
set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
@@ -207,7 +207,7 @@ static int init_pgtable(struct kimage *image, unsigned long 
start_pgtable)
level4p = (pgd_t *)__va(start_pgtable);
clear_page(level4p);
 
-   if (sev_active()) {
+   if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
info.page_flag   |= _PAGE_ENC;
info.kernpg_flag |= _PAGE_ENC;
}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 05e8d4d27969..a2f78a8cfdaa 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -455,7 +456,7 @@ static int has_svm(void)
return 0;
}
 
-   if (sev_active()) {
+   if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
pr_info("KVM is unsupported when running as an SEV guest\n");
   

[PATCH v4 0/8] Implement generic cc_platform_has() helper function

2021-09-28 Thread Borislav Petkov
From: Borislav Petkov 

Hi all,

here's v4 of the cc_platform_has() patchset with feedback incorporated.

I'm going to route this through tip if there are no objections.

Thx.

Tom Lendacky (8):
  x86/ioremap: Selectively build arch override encryption functions
  arch/cc: Introduce a function to check for confidential computing
features
  x86/sev: Add an x86 version of cc_platform_has()
  powerpc/pseries/svm: Add a powerpc version of cc_platform_has()
  x86/sme: Replace occurrences of sme_active() with cc_platform_has()
  x86/sev: Replace occurrences of sev_active() with cc_platform_has()
  x86/sev: Replace occurrences of sev_es_active() with cc_platform_has()
  treewide: Replace the use of mem_encrypt_active() with
cc_platform_has()

 arch/Kconfig |  3 +
 arch/powerpc/include/asm/mem_encrypt.h   |  5 --
 arch/powerpc/platforms/pseries/Kconfig   |  1 +
 arch/powerpc/platforms/pseries/Makefile  |  2 +
 arch/powerpc/platforms/pseries/cc_platform.c | 26 ++
 arch/powerpc/platforms/pseries/svm.c |  5 +-
 arch/s390/include/asm/mem_encrypt.h  |  2 -
 arch/x86/Kconfig |  1 +
 arch/x86/include/asm/io.h|  8 ++
 arch/x86/include/asm/kexec.h |  2 +-
 arch/x86/include/asm/mem_encrypt.h   | 12 +--
 arch/x86/kernel/Makefile |  6 ++
 arch/x86/kernel/cc_platform.c| 69 +++
 arch/x86/kernel/crash_dump_64.c  |  4 +-
 arch/x86/kernel/head64.c |  9 +-
 arch/x86/kernel/kvm.c|  3 +-
 arch/x86/kernel/kvmclock.c   |  4 +-
 arch/x86/kernel/machine_kexec_64.c   | 19 +++--
 arch/x86/kernel/pci-swiotlb.c|  9 +-
 arch/x86/kernel/relocate_kernel_64.S |  2 +-
 arch/x86/kernel/sev.c|  6 +-
 arch/x86/kvm/svm/svm.c   |  3 +-
 arch/x86/mm/ioremap.c| 18 ++--
 arch/x86/mm/mem_encrypt.c| 55 
 arch/x86/mm/mem_encrypt_identity.c   |  9 +-
 arch/x86/mm/pat/set_memory.c |  3 +-
 arch/x86/platform/efi/efi_64.c   |  9 +-
 arch/x86/realmode/init.c |  8 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  4 +-
 drivers/gpu/drm/drm_cache.c  |  4 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.c  |  4 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_msg.c  |  6 +-
 drivers/iommu/amd/init.c |  7 +-
 drivers/iommu/amd/iommu.c|  3 +-
 drivers/iommu/amd/iommu_v2.c |  3 +-
 drivers/iommu/iommu.c|  3 +-
 fs/proc/vmcore.c |  6 +-
 include/linux/cc_platform.h  | 88 
 include/linux/mem_encrypt.h  |  4 -
 kernel/dma/swiotlb.c |  4 +-
 40 files changed, 310 insertions(+), 129 deletions(-)
 create mode 100644 arch/powerpc/platforms/pseries/cc_platform.c
 create mode 100644 arch/x86/kernel/cc_platform.c
 create mode 100644 include/linux/cc_platform.h

-- 
2.29.2



[PATCH 3/8] x86/sev: Add an x86 version of cc_platform_has()

2021-09-28 Thread Borislav Petkov
From: Tom Lendacky 

Introduce an x86 version of the cc_platform_has() function. This will be
used to replace vendor specific calls like sme_active(), sev_active(),
etc.

Signed-off-by: Tom Lendacky 
Signed-off-by: Borislav Petkov 
---
 arch/x86/Kconfig   |  1 +
 arch/x86/include/asm/mem_encrypt.h |  1 +
 arch/x86/kernel/Makefile   |  6 +++
 arch/x86/kernel/cc_platform.c  | 69 ++
 arch/x86/mm/mem_encrypt.c  |  1 +
 5 files changed, 78 insertions(+)
 create mode 100644 arch/x86/kernel/cc_platform.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ab83c22d274e..9f190ec4f953 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1518,6 +1518,7 @@ config AMD_MEM_ENCRYPT
select ARCH_HAS_FORCE_DMA_UNENCRYPTED
select INSTRUCTION_DECODER
select ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
+   select ARCH_HAS_CC_PLATFORM
help
  Say yes to enable support for the encryption of system memory.
  This requires an AMD processor that supports Secure Memory
diff --git a/arch/x86/include/asm/mem_encrypt.h 
b/arch/x86/include/asm/mem_encrypt.h
index 9c80c68d75b5..3fb9f5ebefa4 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -13,6 +13,7 @@
 #ifndef __ASSEMBLY__
 
 #include 
+#include 
 
 #include 
 
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 8f4e8fa6ed75..2ff3e600f426 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -21,6 +21,7 @@ CFLAGS_REMOVE_ftrace.o = -pg
 CFLAGS_REMOVE_early_printk.o = -pg
 CFLAGS_REMOVE_head64.o = -pg
 CFLAGS_REMOVE_sev.o = -pg
+CFLAGS_REMOVE_cc_platform.o = -pg
 endif
 
 KASAN_SANITIZE_head$(BITS).o   := n
@@ -29,6 +30,7 @@ KASAN_SANITIZE_dumpstack_$(BITS).o:= n
 KASAN_SANITIZE_stacktrace.o:= n
 KASAN_SANITIZE_paravirt.o  := n
 KASAN_SANITIZE_sev.o   := n
+KASAN_SANITIZE_cc_platform.o   := n
 
 # With some compiler versions the generated code results in boot hangs, caused
 # by several compilation units. To be safe, disable all instrumentation.
@@ -47,6 +49,7 @@ endif
 KCOV_INSTRUMENT:= n
 
 CFLAGS_head$(BITS).o   += -fno-stack-protector
+CFLAGS_cc_platform.o   += -fno-stack-protector
 
 CFLAGS_irq.o := -I $(srctree)/$(src)/../include/asm/trace
 
@@ -147,6 +150,9 @@ obj-$(CONFIG_UNWINDER_FRAME_POINTER)+= 
unwind_frame.o
 obj-$(CONFIG_UNWINDER_GUESS)   += unwind_guess.o
 
 obj-$(CONFIG_AMD_MEM_ENCRYPT)  += sev.o
+
+obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += cc_platform.o
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/cc_platform.c b/arch/x86/kernel/cc_platform.c
new file mode 100644
index ..03bb2f343ddb
--- /dev/null
+++ b/arch/x86/kernel/cc_platform.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Confidential Computing Platform Capability checks
+ *
+ * Copyright (C) 2021 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky 
+ */
+
+#include 
+#include 
+#include 
+
+#include 
+
+static bool __maybe_unused intel_cc_platform_has(enum cc_attr attr)
+{
+#ifdef CONFIG_INTEL_TDX_GUEST
+   return false;
+#else
+   return false;
+#endif
+}
+
+/*
+ * SME and SEV are very similar but they are not the same, so there are
+ * times that the kernel will need to distinguish between SME and SEV. The
+ * cc_platform_has() function is used for this.  When a distinction isn't
+ * needed, the CC_ATTR_MEM_ENCRYPT attribute can be used.
+ *
+ * The trampoline code is a good example for this requirement.  Before
+ * paging is activated, SME will access all memory as decrypted, but SEV
+ * will access all memory as encrypted.  So, when APs are being brought
+ * up under SME the trampoline area cannot be encrypted, whereas under SEV
+ * the trampoline area must be encrypted.
+ */
+static bool amd_cc_platform_has(enum cc_attr attr)
+{
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+   switch (attr) {
+   case CC_ATTR_MEM_ENCRYPT:
+   return sme_me_mask;
+
+   case CC_ATTR_HOST_MEM_ENCRYPT:
+   return sme_me_mask && !(sev_status & MSR_AMD64_SEV_ENABLED);
+
+   case CC_ATTR_GUEST_MEM_ENCRYPT:
+   return sev_status & MSR_AMD64_SEV_ENABLED;
+
+   case CC_ATTR_GUEST_STATE_ENCRYPT:
+   return sev_status & MSR_AMD64_SEV_ES_ENABLED;
+
+   default:
+   return false;
+   }
+#else
+   return false;
+#endif
+}
+
+
+bool cc_platform_has(enum cc_attr attr)
+{
+   if (sme_me_mask)
+   return amd_cc_platform_has(attr);
+
+   return false;
+}
+EXPORT_SYMBOL_GPL(cc_platform_has);
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index ff08dc463634..e29b1418d00c 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -20,6 +20,7 

[PATCH 1/8] x86/ioremap: Selectively build arch override encryption functions

2021-09-28 Thread Borislav Petkov
From: Tom Lendacky 

In preparation for other uses of the cc_platform_has() function
besides AMD's memory encryption support, selectively build the
AMD memory encryption architecture override functions only when
CONFIG_AMD_MEM_ENCRYPT=y. These functions are:

- early_memremap_pgprot_adjust()
- arch_memremap_can_ram_remap()

Additionally, routines that are only invoked by these architecture
override functions can also be conditionally built. These functions are:

- memremap_should_map_decrypted()
- memremap_is_efi_data()
- memremap_is_setup_data()
- early_memremap_is_setup_data()

And finally, phys_mem_access_encrypted() is conditionally built as well,
but requires a static inline version of it when CONFIG_AMD_MEM_ENCRYPT is
not set.

Signed-off-by: Tom Lendacky 
Signed-off-by: Borislav Petkov 
---
 arch/x86/include/asm/io.h | 8 
 arch/x86/mm/ioremap.c | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 841a5d104afa..5c6a4af0b911 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -391,6 +391,7 @@ extern void arch_io_free_memtype_wc(resource_size_t start, 
resource_size_t size)
 #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc
 #endif
 
+#ifdef CONFIG_AMD_MEM_ENCRYPT
 extern bool arch_memremap_can_ram_remap(resource_size_t offset,
unsigned long size,
unsigned long flags);
@@ -398,6 +399,13 @@ extern bool arch_memremap_can_ram_remap(resource_size_t 
offset,
 
 extern bool phys_mem_access_encrypted(unsigned long phys_addr,
  unsigned long size);
+#else
+static inline bool phys_mem_access_encrypted(unsigned long phys_addr,
+unsigned long size)
+{
+   return true;
+}
+#endif
 
 /**
  * iosubmit_cmds512 - copy data to single MMIO location, in 512-bit units
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 60ade7dd71bd..ccff76cedd8f 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -508,6 +508,7 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
memunmap((void *)((unsigned long)addr & PAGE_MASK));
 }
 
+#ifdef CONFIG_AMD_MEM_ENCRYPT
 /*
  * Examine the physical address to determine if it is an area of memory
  * that should be mapped decrypted.  If the memory is not part of the
@@ -746,7 +747,6 @@ bool phys_mem_access_encrypted(unsigned long phys_addr, 
unsigned long size)
return arch_memremap_can_ram_remap(phys_addr, size, 0);
 }
 
-#ifdef CONFIG_AMD_MEM_ENCRYPT
 /* Remap memory with encryption */
 void __init *early_memremap_encrypted(resource_size_t phys_addr,
  unsigned long size)
-- 
2.29.2



[PATCH 2/8] arch/cc: Introduce a function to check for confidential computing features

2021-09-28 Thread Borislav Petkov
From: Tom Lendacky 

In preparation for other confidential computing technologies, introduce
a generic helper function, cc_platform_has(), that can be used to
check for specific active confidential computing attributes, like
memory encryption. This is intended to eliminate having to add multiple
technology-specific checks to the code (e.g. if (sev_active() ||
tdx_active() || ... ).

 [ bp: s/_CC_PLATFORM_H/_LINUX_CC_PLATFORM_H/g ]

Co-developed-by: Andi Kleen 
Signed-off-by: Andi Kleen 
Co-developed-by: Kuppuswamy Sathyanarayanan 

Signed-off-by: Kuppuswamy Sathyanarayanan 

Signed-off-by: Tom Lendacky 
Signed-off-by: Borislav Petkov 
---
 arch/Kconfig|  3 ++
 include/linux/cc_platform.h | 88 +
 2 files changed, 91 insertions(+)
 create mode 100644 include/linux/cc_platform.h

diff --git a/arch/Kconfig b/arch/Kconfig
index 8df1c7102643..d1e69d6e8498 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1234,6 +1234,9 @@ config RELR
 config ARCH_HAS_MEM_ENCRYPT
bool
 
+config ARCH_HAS_CC_PLATFORM
+   bool
+
 config HAVE_SPARSE_SYSCALL_NR
bool
help
diff --git a/include/linux/cc_platform.h b/include/linux/cc_platform.h
new file mode 100644
index ..a075b70b9a70
--- /dev/null
+++ b/include/linux/cc_platform.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Confidential Computing Platform Capability checks
+ *
+ * Copyright (C) 2021 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky 
+ */
+
+#ifndef _LINUX_CC_PLATFORM_H
+#define _LINUX_CC_PLATFORM_H
+
+#include 
+#include 
+
+/**
+ * enum cc_attr - Confidential computing attributes
+ *
+ * These attributes represent confidential computing features that are
+ * currently active.
+ */
+enum cc_attr {
+   /**
+* @CC_ATTR_MEM_ENCRYPT: Memory encryption is active
+*
+* The platform/OS is running with active memory encryption. This
+* includes running either as a bare-metal system or a hypervisor
+* and actively using memory encryption or as a guest/virtual machine
+* and actively using memory encryption.
+*
+* Examples include SME, SEV and SEV-ES.
+*/
+   CC_ATTR_MEM_ENCRYPT,
+
+   /**
+* @CC_ATTR_HOST_MEM_ENCRYPT: Host memory encryption is active
+*
+* The platform/OS is running as a bare-metal system or a hypervisor
+* and actively using memory encryption.
+*
+* Examples include SME.
+*/
+   CC_ATTR_HOST_MEM_ENCRYPT,
+
+   /**
+* @CC_ATTR_GUEST_MEM_ENCRYPT: Guest memory encryption is active
+*
+* The platform/OS is running as a guest/virtual machine and actively
+* using memory encryption.
+*
+* Examples include SEV and SEV-ES.
+*/
+   CC_ATTR_GUEST_MEM_ENCRYPT,
+
+   /**
+* @CC_ATTR_GUEST_STATE_ENCRYPT: Guest state encryption is active
+*
+* The platform/OS is running as a guest/virtual machine and actively
+* using memory encryption and register state encryption.
+*
+* Examples include SEV-ES.
+*/
+   CC_ATTR_GUEST_STATE_ENCRYPT,
+};
+
+#ifdef CONFIG_ARCH_HAS_CC_PLATFORM
+
+/**
+ * cc_platform_has() - Checks if the specified cc_attr attribute is active
+ * @attr: Confidential computing attribute to check
+ *
+ * The cc_platform_has() function will return an indicator as to whether the
+ * specified Confidential Computing attribute is currently active.
+ *
+ * Context: Any context
+ * Return:
+ * * TRUE  - Specified Confidential Computing attribute is active
+ * * FALSE - Specified Confidential Computing attribute is not active
+ */
+bool cc_platform_has(enum cc_attr attr);
+
+#else  /* !CONFIG_ARCH_HAS_CC_PLATFORM */
+
+static inline bool cc_platform_has(enum cc_attr attr) { return false; }
+
+#endif /* CONFIG_ARCH_HAS_CC_PLATFORM */
+
+#endif /* _LINUX_CC_PLATFORM_H */
-- 
2.29.2



[PATCH 4/8] powerpc/pseries/svm: Add a powerpc version of cc_platform_has()

2021-09-28 Thread Borislav Petkov
From: Tom Lendacky 

Introduce a powerpc version of the cc_platform_has() function. This will
be used to replace the powerpc mem_encrypt_active() implementation, so
the implementation will initially only support the CC_ATTR_MEM_ENCRYPT
attribute.

Signed-off-by: Tom Lendacky 
Signed-off-by: Borislav Petkov 
Acked-by: Michael Ellerman 
---
 arch/powerpc/platforms/pseries/Kconfig   |  1 +
 arch/powerpc/platforms/pseries/Makefile  |  2 ++
 arch/powerpc/platforms/pseries/cc_platform.c | 26 
 3 files changed, 29 insertions(+)
 create mode 100644 arch/powerpc/platforms/pseries/cc_platform.c

diff --git a/arch/powerpc/platforms/pseries/Kconfig 
b/arch/powerpc/platforms/pseries/Kconfig
index 5e037df2a3a1..2e57391e0778 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -159,6 +159,7 @@ config PPC_SVM
select SWIOTLB
select ARCH_HAS_MEM_ENCRYPT
select ARCH_HAS_FORCE_DMA_UNENCRYPTED
+   select ARCH_HAS_CC_PLATFORM
help
 There are certain POWER platforms which support secure guests using
 the Protected Execution Facility, with the help of an Ultravisor
diff --git a/arch/powerpc/platforms/pseries/Makefile 
b/arch/powerpc/platforms/pseries/Makefile
index 4cda0ef87be0..41d8aee98da4 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -31,3 +31,5 @@ obj-$(CONFIG_FA_DUMP) += rtas-fadump.o
 
 obj-$(CONFIG_SUSPEND)  += suspend.o
 obj-$(CONFIG_PPC_VAS)  += vas.o
+
+obj-$(CONFIG_ARCH_HAS_CC_PLATFORM) += cc_platform.o
diff --git a/arch/powerpc/platforms/pseries/cc_platform.c 
b/arch/powerpc/platforms/pseries/cc_platform.c
new file mode 100644
index ..e8021af83a19
--- /dev/null
+++ b/arch/powerpc/platforms/pseries/cc_platform.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Confidential Computing Platform Capability checks
+ *
+ * Copyright (C) 2021 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky 
+ */
+
+#include 
+#include 
+
+#include 
+#include 
+
+bool cc_platform_has(enum cc_attr attr)
+{
+   switch (attr) {
+   case CC_ATTR_MEM_ENCRYPT:
+   return is_secure_guest();
+
+   default:
+   return false;
+   }
+}
+EXPORT_SYMBOL_GPL(cc_platform_has);
-- 
2.29.2



Re: [PATCH] powerpc: fix unbalanced node refcount in check_kvm_guest()

2021-09-28 Thread Tyrel Datwyler
On 9/28/21 5:45 AM, Nathan Lynch wrote:
> When check_kvm_guest() succeeds in looking up a /hypervisor OF node, it
> returns without performing a matching put for the lookup, leaving the
> node's reference count elevated.
> 
> Add the necessary call to of_node_put(), rearranging the code slightly to
> avoid repetition or goto.
> 
> Signed-off-by: Nathan Lynch 
> Fixes: 107c55005fbd ("powerpc/pseries: Add KVM guest doorbell restrictions")

Reviewed-by: Tyrel Datwyler 


Re: [PATCH v2 9/9] iommu/vt-d: Use pci core's DVSEC functionality

2021-09-28 Thread Dan Williams
On Thu, Sep 23, 2021 at 10:27 AM Ben Widawsky  wrote:
>
> Reduce maintenance burden of DVSEC query implementation by using the
> centralized PCI core implementation.
>
> Cc: io...@lists.linux-foundation.org
> Cc: David Woodhouse 
> Cc: Lu Baolu 
> Signed-off-by: Ben Widawsky 
> ---
>  drivers/iommu/intel/iommu.c | 15 +--
>  1 file changed, 1 insertion(+), 14 deletions(-)
>
> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> index d75f59ae28e6..30c97181f0ae 100644
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -5398,20 +5398,7 @@ static int intel_iommu_disable_sva(struct device *dev)
>   */
>  static int siov_find_pci_dvsec(struct pci_dev *pdev)
>  {
> -   int pos;
> -   u16 vendor, id;
> -
> -   pos = pci_find_next_ext_capability(pdev, 0, 0x23);
> -   while (pos) {
> -   pci_read_config_word(pdev, pos + 4, &vendor);
> -   pci_read_config_word(pdev, pos + 8, &id);
> -   if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
> -   return pos;
> -
> -   pos = pci_find_next_ext_capability(pdev, pos, 0x23);
> -   }
> -
> -   return 0;
> +   return pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_INTEL, 5);
>  }

Same comments as the CXL patch, siov_find_pci_dvsec() doesn't seem to
have a reason to exist anymore. What is 5?


Re: [PATCH v3 9/9] powerpc/mm: Use is_kernel_text() and is_kernel_inittext() helper

2021-09-28 Thread Christophe Leroy




Le 26/09/2021 à 09:20, Kefeng Wang a écrit :

Use is_kernel_text() and is_kernel_inittext() helper to simplify code,
also drop etext, _stext, _sinittext, _einittext declaration which
already declared in section.h.

Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Kefeng Wang 
---
  arch/powerpc/mm/pgtable_32.c | 7 ++-
  1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index dcf5ecca19d9..13c798308c2e 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -33,8 +33,6 @@
  
  #include 
  
-extern char etext[], _stext[], _sinittext[], _einittext[];

-
  static u8 early_fixmap_pagetable[FIXMAP_PTE_SIZE] __page_aligned_data;
  
  notrace void __init early_ioremap_init(void)

@@ -104,14 +102,13 @@ static void __init __mapin_ram_chunk(unsigned long 
offset, unsigned long top)
  {
unsigned long v, s;
phys_addr_t p;
-   int ktext;
+   bool ktext;
  
  	s = offset;

v = PAGE_OFFSET + s;
p = memstart_addr + s;
for (; s < top; s += PAGE_SIZE) {
-   ktext = ((char *)v >= _stext && (char *)v < etext) ||
-   ((char *)v >= _sinittext && (char *)v < _einittext);
+   ktext = (is_kernel_text(v) || is_kernel_inittext(v));


I think we could use core_kernel_text() instead.



map_kernel_page(v, p, ktext ? PAGE_KERNEL_TEXT : PAGE_KERNEL);
v += PAGE_SIZE;
p += PAGE_SIZE;






Build failure on mpc885_ads_defconfig

arch/powerpc/mm/pgtable_32.c: In function '__mapin_ram_chunk':
arch/powerpc/mm/pgtable_32.c:111:26: error: implicit declaration of 
function 'is_kernel_text'; did you mean 'is_kernel_inittext'? 
[-Werror=implicit-function-declaration]
  111 | ktext = (is_kernel_text(v) || 
is_kernel_inittext(v));

  |  ^~
  |  is_kernel_inittext
cc1: all warnings being treated as errors
make[2]: *** [scripts/Makefile.build:277: arch/powerpc/mm/pgtable_32.o] 
Error 1

make[1]: *** [scripts/Makefile.build:540: arch/powerpc/mm] Error 2
make: *** [Makefile:1868: arch/powerpc] Error 2




Re: [PATCH v2 7/9] cxl/pci: Use pci core's DVSEC functionality

2021-09-28 Thread Dan Williams
On Thu, Sep 23, 2021 at 10:27 AM Ben Widawsky  wrote:
>
> Reduce maintenance burden of DVSEC query implementation by using the
> centralized PCI core implementation.
>
> Signed-off-by: Ben Widawsky 
> ---
>  drivers/cxl/pci.c | 20 +---
>  1 file changed, 1 insertion(+), 19 deletions(-)
>
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 5eaf2736f779..79d4d9b16d83 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -340,25 +340,7 @@ static void cxl_pci_unmap_regblock(struct cxl_mem *cxlm, 
> struct cxl_register_map
>
>  static int cxl_pci_dvsec(struct pci_dev *pdev, int dvsec)

cxl_pci_dvsec() has no reason to exist anymore. Let's just have the
caller use pci_find_dvsec_capability() directly.


Re: [PATCH v2 5/9] cxl/pci: Make more use of cxl_register_map

2021-09-28 Thread Dan Williams
On Thu, Sep 23, 2021 at 10:27 AM Ben Widawsky  wrote:
>
> The structure exists to pass around information about register mapping.
> Using it more extensively cleans up many existing functions.

I would have liked to have seen "add @base to cxl_register_map" and
"use @map for @bar and @offset arguments" somewhere in this changelog
to set expectations for what changes are included. That would have
also highlighted that adding a @base to cxl_register_map deserves its
own patch vs the conversion of @bar and @offset to instead use
@map->bar and @map->offset. Can you resend with that split and those
mentions?


Re: [PATCH v2 4/9] cxl/pci: Refactor cxl_pci_setup_regs

2021-09-28 Thread Dan Williams
On Thu, Sep 23, 2021 at 10:27 AM Ben Widawsky  wrote:
>
> In preparation for moving parts of register mapping to cxl_core, the
> cxl_pci driver is refactored to utilize a new helper to find register
> blocks by type.
>
> cxl_pci scanned through all register blocks and mapped the ones that
> the driver will use. This logic is inverted so that the driver
> specifically requests the register blocks from a new helper. Under the
> hood, the same implementation of scanning through all register locator
> DVSEC entries exists.
>
> There are 2 behavioral changes (#2 is arguable):
> 1. A dev_err is introduced if cxl_map_regs fails.
> 2. The previous logic would try to map component registers and device
>registers multiple times if they were present and keep the mapping
>of the last one found (furthest offset in the register locator).
>While this is disallowed in the spec, CXL 2.0 8.1.9: "Each register
>block identifier shall only occur once in the Register Locator DVSEC
>structure" it was how the driver would respond to the spec violation.
>The new logic will take the first found register block by type and
>move on.
>
> Signed-off-by: Ben Widawsky 
>
> ---
> Changes since v1:

No changes? Luckily git am strips this section...

Overall I think this refactor can be broken down further for
readability and clean up the long-standing problem that the driver maps
component registers for no reason. The main contributing factor to
readability is that cxl_setup_pci_regs() still exists after the
refactor, which also contributes to the component register problem. If
the register mapping is up leveled to the caller of
cxl_setup_pci_regs() (and drops mapping component registers) then a
follow-on patch to rename cxl_setup_pci_regs to find_register_block
becomes easier to read. Moving the cxl_register_map array out of
cxl_setup_pci_regs() also makes a later patch to pass component
register enumeration details to the endpoint-port that much cleaner.


Re: [PATCH kernel] powerpc/pseries/dma: Add support for 2M IOMMU page size

2021-09-28 Thread Leonardo Brás
Hello Alexey,

On Tue, 2021-09-28 at 20:15 +1000, Alexey Kardashevskiy wrote:
> The upcoming PAPR spec adds a 2M page size, bit 23 right after the 16G
> page
> size in the "ibm,query-pe-dma-window" call.
> 
> This adds support for the new page size. Since the new page size is out
> of sorted order, this changes the loop to not assume that shift[] is
> sorted.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
> 
> This might not work if PHYP keeps rejecting new window requests for
> less
> than 32768 TCEs. This is needed:
> https://github.com/aik/linux/commit/8cc8fa5ba5b3b4a18efbc9d81d9e5b85ca7c8a95
> 
> 
> ---
>  arch/powerpc/platforms/pseries/iommu.c | 10 +-
>  1 file changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/iommu.c
> b/arch/powerpc/platforms/pseries/iommu.c
> index c741689a5165..237bf405b0cb 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -1159,14 +1159,15 @@ static void reset_dma_window(struct pci_dev
> *dev, struct device_node *par_dn)
>  /* Return largest page shift based on "IO Page Sizes" output of
> ibm,query-pe-dma-window. */
>  static int iommu_get_page_shift(u32 query_page_size)
>  {
> -   /* Supported IO page-sizes according to LoPAR */
> +   /* Supported IO page-sizes according to LoPAR, note that 2M is
> out of order */
> const int shift[] = {
> __builtin_ctzll(SZ_4K),   __builtin_ctzll(SZ_64K),
> __builtin_ctzll(SZ_16M),
> __builtin_ctzll(SZ_32M),  __builtin_ctzll(SZ_64M),
> __builtin_ctzll(SZ_128M),
> -   __builtin_ctzll(SZ_256M), __builtin_ctzll(SZ_16G)
> +   __builtin_ctzll(SZ_256M), __builtin_ctzll(SZ_16G),
> __builtin_ctzll(SZ_2M)
> };
>  
> int i = ARRAY_SIZE(shift) - 1;
> +   int ret = 0;
>  
> /*
>  * On LoPAR, ibm,query-pe-dma-window outputs "IO Page Sizes"
> using a bit field:
> @@ -1176,11 +1177,10 @@ static int iommu_get_page_shift(u32
> query_page_size)
>  */
> for (; i >= 0 ; i--) {
> if (query_page_size & (1 << i))
> -   return shift[i];
> +   ret = max(ret, shift[i]);
> }
>  
> -   /* No valid page size found. */
> -   return 0;
> +   return ret;
>  }
>  
>  static struct property *ddw_property_create(const char *propname, u32
> liobn, u64 dma_addr,

Looks great to me.

FWIW:
Reviewed-by: Leonardo Bras 

Best regards,
Leonardo




Re: [PATCH v4 4/8] PCI: replace pci_dev::driver usage that gets the driver name

2021-09-28 Thread Bjorn Helgaas
[+to Oliver, Russell for eeh_driver_name() question below]

On Mon, Sep 27, 2021 at 10:43:22PM +0200, Uwe Kleine-König wrote:
> From: Uwe Kleine-König 
> 
> struct pci_dev::driver holds (apart from a constant offset) the same
> data as struct pci_dev::dev->driver. With the goal to remove struct
> pci_dev::driver to get rid of data duplication replace getting the
> driver name by dev_driver_string() which implicitly makes use of struct
> pci_dev::dev->driver.

When you repost to fix the build issue, can you capitalize the subject
line to match the other?

Also, would you mind using "pci_dev.driver" instead of
"pci_dev::driver"?  AFAIK, the "::" operator is not actually part of
C, so I think it's more confusing than useful.

> Signed-off-by: Uwe Kleine-König 
> ---
>  arch/powerpc/include/asm/ppc-pci.h   | 9 -
>  drivers/bcma/host_pci.c  | 7 ---
>  drivers/crypto/hisilicon/qm.c| 2 +-
>  drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c   | 2 +-
>  drivers/net/ethernet/marvell/prestera/prestera_pci.c | 2 +-
>  drivers/net/ethernet/mellanox/mlxsw/pci.c| 2 +-
>  drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c | 2 +-
>  drivers/ssb/pcihost_wrapper.c| 8 +---
>  8 files changed, 22 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/ppc-pci.h 
> b/arch/powerpc/include/asm/ppc-pci.h
> index 2b9edbf6e929..e8f1795a2acf 100644
> --- a/arch/powerpc/include/asm/ppc-pci.h
> +++ b/arch/powerpc/include/asm/ppc-pci.h
> @@ -57,7 +57,14 @@ void eeh_sysfs_remove_device(struct pci_dev *pdev);
>  
>  static inline const char *eeh_driver_name(struct pci_dev *pdev)
>  {
> - return (pdev && pdev->driver) ? pdev->driver->name : "<null>";
> + if (pdev) {
> + const char *drvstr = dev_driver_string(&pdev->dev);
> +
> + if (strcmp(drvstr, ""))
> + return drvstr;
> + }
> +
> + return "<null>";

Can we just do this?

  if (pdev)
    return dev_driver_string(&pdev->dev);

  return "<null>";

I think it's more complicated than it's worth to include a strcmp().
It's possible this will change those error messages about "Might be
infinite loop in %s driver", but that doesn't seem like a huge deal.

I moved Oliver to "to:" and added Russell in case they object.

>  }
>  
>  #endif /* CONFIG_EEH */
> diff --git a/drivers/bcma/host_pci.c b/drivers/bcma/host_pci.c
> index 69c10a7b7c61..0973022d4b13 100644
> --- a/drivers/bcma/host_pci.c
> +++ b/drivers/bcma/host_pci.c
> @@ -175,9 +175,10 @@ static int bcma_host_pci_probe(struct pci_dev *dev,
>   if (err)
>   goto err_kfree_bus;
>  
> - name = dev_name(&dev->dev);
> - if (dev->driver && dev->driver->name)
> - name = dev->driver->name;
> + name = dev_driver_string(&dev->dev);
> + if (!strcmp(name, ""))
> + name = dev_name(&dev->dev);
>   err = pci_request_regions(dev, name);

Again seems more complicated than it's worth to me.  This is in the
driver's .probe() method, so really_probe() has already set
"dev->driver = drv", which means dev->driver is always set to
&bcma_pci_bridge_driver here, and bcma_pci_bridge_driver.name is
always "bcma-pci-bridge".

Almost all callers of pci_request_regions() just hardcode the driver
name or use a DRV_NAME #define

So I think we should just do:

  err = pci_request_regions(dev, "bcma-pci-bridge");

>   if (err)
>   goto err_pci_disable;
> diff --git a/drivers/crypto/hisilicon/qm.c b/drivers/crypto/hisilicon/qm.c
> index 369562d34d66..8f361e54e524 100644
> --- a/drivers/crypto/hisilicon/qm.c
> +++ b/drivers/crypto/hisilicon/qm.c
> @@ -3085,7 +3085,7 @@ static int qm_alloc_uacce(struct hisi_qm *qm)
>   };
>   int ret;
>  
> - ret = strscpy(interface.name, pdev->driver->name,
> + ret = strscpy(interface.name, dev_driver_string(&pdev->dev),
> sizeof(interface.name));
>   if (ret < 0)
>   return -ENAMETOOLONG;
> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c 
> b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
> index 7ea511d59e91..f279edfce3f1 100644
> --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
> +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
> @@ -606,7 +606,7 @@ static void hns3_get_drvinfo(struct net_device *netdev,
>   return;
>   }
>  
> - strncpy(drvinfo->driver, h->pdev->driver->name,
> + strncpy(drvinfo->driver, dev_driver_string(&h->pdev->dev),
>   sizeof(drvinfo->driver));
>   drvinfo->driver[sizeof(drvinfo->driver) - 1] = '\0';
>  
> diff --git a/drivers/net/ethernet/marvell/prestera/prestera_pci.c 
> b/drivers/net/ethernet/marvell/prestera/prestera_pci.c
> index a250d394da38..a8f007f6dad2 100644
> --- a/drivers/net/ethernet/marvell/prestera/prestera_pci.c
> +++ b/drivers/net/ethernet/marvell/prestera/prestera_pci.c
> @@ -720,7 +720,7 @@ static int prestera_fw_load(struct prestera_fw *fw)
>  static 

Re: [PATCH v2 3/9] cxl/pci: Remove pci request/release regions

2021-09-28 Thread Dan Williams
On Thu, Sep 23, 2021 at 10:26 AM Ben Widawsky  wrote:
>
> Quoting Dan, "... the request + release regions should probably just be
> dropped. It's not like any of the register enumeration would collide
> with someone else who already has the registers mapped. The collision
> only comes when the registers are mapped for their final usage, and that
> will have more precision in the request."

Looks good to me:

Reviewed-by: Dan Williams 

>
> Recommended-by: Dan Williams 

This isn't one of the canonical tags:
Documentation/process/submitting-patches.rst

I'll change this to Suggested-by:

> Signed-off-by: Ben Widawsky 
> ---
>  drivers/cxl/pci.c | 5 -
>  1 file changed, 5 deletions(-)
>
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index ccc7c2573ddc..7256c236fdb3 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -453,9 +453,6 @@ static int cxl_pci_setup_regs(struct cxl_mem *cxlm)
> return -ENXIO;
> }
>
> -   if (pci_request_mem_regions(pdev, pci_name(pdev)))
> -   return -ENODEV;
> -
> /* Get the size of the Register Locator DVSEC */
> pci_read_config_dword(pdev, regloc + PCI_DVSEC_HEADER1, &regloc_size);
> regloc_size = FIELD_GET(PCI_DVSEC_HEADER1_LENGTH_MASK, regloc_size);
> @@ -499,8 +496,6 @@ static int cxl_pci_setup_regs(struct cxl_mem *cxlm)
> n_maps++;
> }
>
> -   pci_release_mem_regions(pdev);
> -
> for (i = 0; i < n_maps; i++) {
> ret = cxl_map_regs(cxlm, &maps[i]);
> if (ret)
> --
> 2.33.0
>


Re: [PATCH v2 2/9] cxl/pci: Remove dev_dbg for unknown register blocks

2021-09-28 Thread Dan Williams
On Thu, Sep 23, 2021 at 10:27 AM Ben Widawsky  wrote:
>
> While interesting to driver developers, the dev_dbg message doesn't do
> much except clutter up logs. This information should be attainable
> through sysfs, and someday lspci like utilities. This change
> additionally helps reduce the LOC in a subsequent patch to refactor some
> of cxl_pci register mapping.

Looks good to me:

Reviewed-by: Dan Williams 


Re: [PATCH v8 1/2] powerpc/pseries: Interface to represent PAPR firmware attributes

2021-09-28 Thread Pratik Sampat




On 28/09/21 7:28 pm, Greg KH wrote:

On Tue, Sep 28, 2021 at 06:13:18PM +0530, Pratik Sampat wrote:

Hello Greg,

Thank you for your review.

On 28/09/21 5:38 pm, Greg KH wrote:

On Tue, Sep 28, 2021 at 05:21:01PM +0530, Pratik R. Sampat wrote:

Adds a generic interface to represent the energy and frequency related
PAPR attributes on the system using the new H_CALL
"H_GET_ENERGY_SCALE_INFO".

H_GET_EM_PARMS H_CALL was previously responsible for exporting this
information in the lparcfg, however the H_GET_EM_PARMS H_CALL
will be deprecated P10 onwards.

The H_GET_ENERGY_SCALE_INFO H_CALL is of the following call format:
hcall(
uint64 H_GET_ENERGY_SCALE_INFO,  // Get energy scale info
uint64 flags,   // Per the flag request
uint64 firstAttributeId,// The attribute id
uint64 bufferAddress,   // Guest physical address of the output buffer
uint64 bufferSize   // The size in bytes of the output buffer
);

This H_CALL can query either all the attributes at once with
firstAttributeId = 0, flags = 0 as well as query only one attribute
at a time with firstAttributeId = id, flags = 1.

The output buffer consists of the following
1. number of attributes  - 8 bytes
2. array offset to the data location - 8 bytes
3. version info  - 1 byte
4. A data array of size num attributes, which contains the following:
a. attribute ID  - 8 bytes
b. attribute value in number - 8 bytes
c. attribute name in string  - 64 bytes
d. attribute value in string - 64 bytes

The new H_CALL exports information in direct string value format, hence
a new interface has been introduced in
/sys/firmware/papr/energy_scale_info to export this information to
userspace in an extensible pass-through format.

The H_CALL returns the name, numeric value and string value (if exists)

The format of exposing the sysfs information is as follows:
/sys/firmware/papr/energy_scale_info/
 |-- /
   |-- desc
   |-- value
   |-- value_desc (if exists)
 |-- /
   |-- desc
   |-- value
   |-- value_desc (if exists)
...

The energy information that is exported is useful for userspace tools
such as powerpc-utils. Currently these tools infer the
"power_mode_data" value in the lparcfg, which in turn is obtained from
the to be deprecated H_GET_EM_PARMS H_CALL.
On future platforms, such userspace utilities will have to look at the
data returned from the new H_CALL being populated in this new sysfs
interface and report this information directly without the need of
interpretation.

Signed-off-by: Pratik R. Sampat 
Reviewed-by: Gautham R. Shenoy 
Reviewed-by: Fabiano Rosas 
Reviewed-by: Kajol Jain 
---
   .../sysfs-firmware-papr-energy-scale-info |  26 ++
   arch/powerpc/include/asm/hvcall.h |  24 +-
   arch/powerpc/kvm/trace_hv.h   |   1 +
   arch/powerpc/platforms/pseries/Makefile   |   3 +-
   .../pseries/papr_platform_attributes.c| 312 ++
   5 files changed, 364 insertions(+), 2 deletions(-)
   create mode 100644 
Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
   create mode 100644 arch/powerpc/platforms/pseries/papr_platform_attributes.c

diff --git a/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info 
b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
new file mode 100644
index ..139a576c7c9d
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
@@ -0,0 +1,26 @@
+What:  /sys/firmware/papr/energy_scale_info
+Date:  June 2021
+Contact:   Linux for PowerPC mailing list 
+Description:   Directory hosting a set of platform attributes like
+   energy/frequency on Linux running as a PAPR guest.
+
+   Each file in a directory contains a platform
+   attribute hierarchy pertaining to performance/
+   energy-savings mode and processor frequency.
+
+What:  /sys/firmware/papr/energy_scale_info/
+   /sys/firmware/papr/energy_scale_info//desc
+   /sys/firmware/papr/energy_scale_info//value
+   /sys/firmware/papr/energy_scale_info//value_desc
+Date:  June 2021
+Contact:   Linux for PowerPC mailing list 
+Description:   Energy, frequency attributes directory for POWERVM servers
+
+   This directory provides energy, frequency, folding information. 
It
+   contains below sysfs attributes:
+
+   - desc: String description of the attribute 
+
+   - value: Numeric value of attribute 
+
+   - value_desc: String value of attribute 

Can you just make 4 different entries in this file, making it easier to
parse and extend over time?

Do you mean I only create one file per attribute and populate it with 4
different entries as follows?

# cat /sys/firmware/papr/energy_scale_info/
id:
desc:
value:
value_desc:

No, I mean in this documentation file, 

Re: [PATCH v8 1/2] powerpc/pseries: Interface to represent PAPR firmware attributes

2021-09-28 Thread Greg KH
On Tue, Sep 28, 2021 at 06:13:18PM +0530, Pratik Sampat wrote:
> Hello Greg,
> 
> Thank you for your review.
> 
> On 28/09/21 5:38 pm, Greg KH wrote:
> > On Tue, Sep 28, 2021 at 05:21:01PM +0530, Pratik R. Sampat wrote:
> > > Adds a generic interface to represent the energy and frequency related
> > > PAPR attributes on the system using the new H_CALL
> > > "H_GET_ENERGY_SCALE_INFO".
> > > 
> > > H_GET_EM_PARMS H_CALL was previously responsible for exporting this
> > > information in the lparcfg, however the H_GET_EM_PARMS H_CALL
> > > will be deprecated P10 onwards.
> > > 
> > > The H_GET_ENERGY_SCALE_INFO H_CALL is of the following call format:
> > > hcall(
> > >uint64 H_GET_ENERGY_SCALE_INFO,  // Get energy scale info
> > >uint64 flags,   // Per the flag request
> > >uint64 firstAttributeId,// The attribute id
> > >uint64 bufferAddress,   // Guest physical address of the output buffer
> > >uint64 bufferSize   // The size in bytes of the output buffer
> > > );
> > > 
> > > This H_CALL can query either all the attributes at once with
> > > firstAttributeId = 0, flags = 0 as well as query only one attribute
> > > at a time with firstAttributeId = id, flags = 1.
> > > 
> > > The output buffer consists of the following
> > > 1. number of attributes  - 8 bytes
> > > 2. array offset to the data location - 8 bytes
> > > 3. version info  - 1 byte
> > > 4. A data array of size num attributes, which contains the following:
> > >a. attribute ID  - 8 bytes
> > >b. attribute value in number - 8 bytes
> > >c. attribute name in string  - 64 bytes
> > >d. attribute value in string - 64 bytes
> > > 
> > > The new H_CALL exports information in direct string value format, hence
> > > a new interface has been introduced in
> > > /sys/firmware/papr/energy_scale_info to export this information to
> > > userspace in an extensible pass-through format.
> > > 
> > > The H_CALL returns the name, numeric value and string value (if exists)
> > > 
> > > The format of exposing the sysfs information is as follows:
> > > /sys/firmware/papr/energy_scale_info/
> > > |-- /
> > >   |-- desc
> > >   |-- value
> > >   |-- value_desc (if exists)
> > > |-- /
> > >   |-- desc
> > >   |-- value
> > >   |-- value_desc (if exists)
> > > ...
> > > 
> > > The energy information that is exported is useful for userspace tools
> > > such as powerpc-utils. Currently these tools infer the
> > > "power_mode_data" value in the lparcfg, which in turn is obtained from
> > > the to be deprecated H_GET_EM_PARMS H_CALL.
> > > On future platforms, such userspace utilities will have to look at the
> > > data returned from the new H_CALL being populated in this new sysfs
> > > interface and report this information directly without the need of
> > > interpretation.
> > > 
> > > Signed-off-by: Pratik R. Sampat 
> > > Reviewed-by: Gautham R. Shenoy 
> > > Reviewed-by: Fabiano Rosas 
> > > Reviewed-by: Kajol Jain 
> > > ---
> > >   .../sysfs-firmware-papr-energy-scale-info |  26 ++
> > >   arch/powerpc/include/asm/hvcall.h |  24 +-
> > >   arch/powerpc/kvm/trace_hv.h   |   1 +
> > >   arch/powerpc/platforms/pseries/Makefile   |   3 +-
> > >   .../pseries/papr_platform_attributes.c| 312 ++
> > >   5 files changed, 364 insertions(+), 2 deletions(-)
> > >   create mode 100644 
> > > Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
> > >   create mode 100644 
> > > arch/powerpc/platforms/pseries/papr_platform_attributes.c
> > > 
> > > diff --git 
> > > a/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info 
> > > b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
> > > new file mode 100644
> > > index ..139a576c7c9d
> > > --- /dev/null
> > > +++ b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
> > > @@ -0,0 +1,26 @@
> > > +What:/sys/firmware/papr/energy_scale_info
> > > +Date:June 2021
> > > +Contact: Linux for PowerPC mailing list 
> > > +Description: Directory hosting a set of platform attributes like
> > > + energy/frequency on Linux running as a PAPR guest.
> > > +
> > > + Each file in a directory contains a platform
> > > + attribute hierarchy pertaining to performance/
> > > + energy-savings mode and processor frequency.
> > > +
> > > +What:/sys/firmware/papr/energy_scale_info/
> > > + /sys/firmware/papr/energy_scale_info//desc
> > > + /sys/firmware/papr/energy_scale_info//value
> > > + /sys/firmware/papr/energy_scale_info//value_desc
> > > +Date:June 2021
> > > +Contact: Linux for PowerPC mailing list 
> > > +Description: Energy, frequency attributes directory for POWERVM 
> > > servers
> > > +
> > > + This directory provides energy, frequency, folding information. 
> > > It
> > > + 

Re: [RFC PATCH 3/8] s390: add CPU field to struct thread_info

2021-09-28 Thread Ard Biesheuvel
On Tue, 14 Sept 2021 at 14:11, Ard Biesheuvel  wrote:
>
> The CPU field will be moved back into thread_info even when
> THREAD_INFO_IN_TASK is enabled, so add it back to s390's definition of
> struct thread_info.
>
> Signed-off-by: Ard Biesheuvel 
> ---
>  arch/s390/include/asm/thread_info.h | 1 +
>  1 file changed, 1 insertion(+)
>

Heiko, Christian, Vasily,

Do you have any objections to this change? If you don't, could you
please ack it so it can be taken through another tree (or if that is
problematic for you, could you please propose another way of merging
these changes?)

Thanks,
Ard.

> diff --git a/arch/s390/include/asm/thread_info.h 
> b/arch/s390/include/asm/thread_info.h
> index e6674796aa6f..b2ffcb4fe000 100644
> --- a/arch/s390/include/asm/thread_info.h
> +++ b/arch/s390/include/asm/thread_info.h
> @@ -37,6 +37,7 @@
>  struct thread_info {
> unsigned long   flags;  /* low level flags */
> unsigned long   syscall_work;   /* SYSCALL_WORK_ flags */
> +   unsigned intcpu;/* current CPU */
>  };
>
>  /*
> --
> 2.30.2
>


Re: [RFC PATCH 4/8] powerpc: add CPU field to struct thread_info

2021-09-28 Thread Ard Biesheuvel
On Tue, 28 Sept 2021 at 02:16, Michael Ellerman  wrote:
>
> Michael Ellerman  writes:
> > Ard Biesheuvel  writes:
> >> On Tue, 14 Sept 2021 at 14:11, Ard Biesheuvel  wrote:
> >>>
> >>> The CPU field will be moved back into thread_info even when
> >>> THREAD_INFO_IN_TASK is enabled, so add it back to powerpc's definition
> >>> of struct thread_info.
> >>>
> >>> Signed-off-by: Ard Biesheuvel 
> >>
> >> Michael,
> >>
> >> Do you have any objections or issues with this patch or the subsequent
> >> ones cleaning up the task CPU kludge for ppc32? Christophe indicated
> >> that he was happy with it.
> >
> > No objections, it looks good to me, thanks for cleaning up that horror :)
> >
> > It didn't apply cleanly to master so I haven't tested it at all, if you can 
> > point me at a
> > git tree with the dependencies I'd be happy to run some tests over it.
>
> Actually I realised I can just drop the last patch.
>
> So that looks fine, passes my standard quick build & boot on qemu tests,
> and builds with/without stack protector enabled.
>

Thanks.

Do you have any opinion on how this series should be merged? Kees Cook
is willing to take them via his cross-arch tree, or you could carry
them if you prefer. Taking it via multiple trees at the same time is
going to be tricky, or take two cycles, which I'd prefer to avoid.

-- 
Ard.


[PATCH v5 4/4] docs: ABI: sysfs-bus-nvdimm: Document sysfs event format entries for nvdimm pmu

2021-09-28 Thread Kajol Jain
Details are added for the event, cpumask and format attributes
in the ABI documentation.

Signed-off-by: Kajol Jain 
---
 Documentation/ABI/testing/sysfs-bus-nvdimm | 35 ++
 1 file changed, 35 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-bus-nvdimm 
b/Documentation/ABI/testing/sysfs-bus-nvdimm
index bff84a16812a..64004d5e4840 100644
--- a/Documentation/ABI/testing/sysfs-bus-nvdimm
+++ b/Documentation/ABI/testing/sysfs-bus-nvdimm
@@ -6,3 +6,38 @@ Description:
 
 The libnvdimm sub-system implements a common sysfs interface for
 platform nvdimm resources. See Documentation/driver-api/nvdimm/.
+
+What:   /sys/bus/event_source/devices/nmemX/format
+Date:   September 2021
+KernelVersion:  5.16
+Contact:Kajol Jain 
+Description:   (RO) Attribute group to describe the magic bits
+   that go into perf_event_attr.config for a particular pmu.
+   (See ABI/testing/sysfs-bus-event_source-devices-format).
+
+   Each attribute under this group defines a bit range of the
+   perf_event_attr.config. Supported attribute is listed
+   below::
+ event  = "config:0-4"  - event ID
+
+   For example::
+   ctl_res_cnt = "event=0x1"
+
+What:   /sys/bus/event_source/devices/nmemX/events
+Date:   September 2021
+KernelVersion:  5.16
+Contact:Kajol Jain 
+Description:   (RO) Attribute group to describe performance monitoring events
+for the nvdimm memory device. Each attribute in this group
+describes a single performance monitoring event supported by
+this nvdimm pmu.  The name of the file is the name of the 
event.
+(See ABI/testing/sysfs-bus-event_source-devices-events). A
+listing of the events supported by a given nvdimm provider type
+can be found in Documentation/driver-api/nvdimm/$provider.
+
+What:  /sys/bus/event_source/devices/nmemX/cpumask
+Date:  September 2021
+KernelVersion:  5.16
+Contact:Kajol Jain 
+Description:   (RO) This sysfs file exposes the cpumask which is designated to
+   retrieve nvdimm pmu event counter data.
-- 
2.26.2



[PATCH v5 3/4] powerpc/papr_scm: Add perf interface support

2021-09-28 Thread Kajol Jain
Performance monitoring support for papr-scm nvdimm devices
via perf interface is added which includes addition of pmu
functions like add/del/read/event_init for nvdimm_pmu structure.

A new parameter 'priv' is added to the pdev_archdata structure to save
nvdimm_pmu device pointer, to handle the unregistering of pmu device.

papr_scm_pmu_register function populates the nvdimm_pmu structure
with name, capabilities, cpumask along with event handling
functions. Finally the populated nvdimm_pmu structure is passed to
register the pmu device. Event handling functions internally use
hcall to get events and counter data.

Result in power9 machine with 2 nvdimm device:

Ex: List all event by perf list

command:# perf list nmem

  nmem0/cache_rh_cnt/[Kernel PMU event]
  nmem0/cache_wh_cnt/[Kernel PMU event]
  nmem0/cri_res_util/[Kernel PMU event]
  nmem0/ctl_res_cnt/ [Kernel PMU event]
  nmem0/ctl_res_tm/  [Kernel PMU event]
  nmem0/fast_w_cnt/  [Kernel PMU event]
  nmem0/host_l_cnt/  [Kernel PMU event]
  nmem0/host_l_dur/  [Kernel PMU event]
  nmem0/host_s_cnt/  [Kernel PMU event]
  nmem0/host_s_dur/  [Kernel PMU event]
  nmem0/med_r_cnt/   [Kernel PMU event]
  nmem0/med_r_dur/   [Kernel PMU event]
  nmem0/med_w_cnt/   [Kernel PMU event]
  nmem0/med_w_dur/   [Kernel PMU event]
  nmem0/mem_life/[Kernel PMU event]
  nmem0/poweron_secs/[Kernel PMU event]
  ...
  nmem1/mem_life/[Kernel PMU event]
  nmem1/poweron_secs/[Kernel PMU event]

Signed-off-by: Kajol Jain 
---
 arch/powerpc/include/asm/device.h |   5 +
 arch/powerpc/platforms/pseries/papr_scm.c | 225 ++
 2 files changed, 230 insertions(+)

diff --git a/arch/powerpc/include/asm/device.h 
b/arch/powerpc/include/asm/device.h
index 219559d65864..47ed639f3b8f 100644
--- a/arch/powerpc/include/asm/device.h
+++ b/arch/powerpc/include/asm/device.h
@@ -48,6 +48,11 @@ struct dev_archdata {
 
 struct pdev_archdata {
u64 dma_mask;
+   /*
+* Pointer to nvdimm_pmu structure, to handle the unregistering
+* of pmu device
+*/
+   void *priv;
 };
 
 #endif /* _ASM_POWERPC_DEVICE_H */
diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
b/arch/powerpc/platforms/pseries/papr_scm.c
index f48e87ac89c9..bdf2620db461 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define BIND_ANY_ADDR (~0ul)
 
@@ -68,6 +69,8 @@
 #define PAPR_SCM_PERF_STATS_EYECATCHER __stringify(SCMSTATS)
 #define PAPR_SCM_PERF_STATS_VERSION 0x1
 
+#define to_nvdimm_pmu(_pmu)container_of(_pmu, struct nvdimm_pmu, pmu)
+
 /* Struct holding a single performance metric */
 struct papr_scm_perf_stat {
u8 stat_id[8];
@@ -120,6 +123,9 @@ struct papr_scm_priv {
 
/* length of the stat buffer as expected by phyp */
size_t stat_buffer_len;
+
+/* array to have event_code and stat_id mappings */
+   char **nvdimm_events_map;
 };
 
 static int papr_scm_pmem_flush(struct nd_region *nd_region,
@@ -340,6 +346,218 @@ static ssize_t drc_pmem_query_stats(struct papr_scm_priv 
*p,
return 0;
 }
 
+static int papr_scm_pmu_get_value(struct perf_event *event, struct device 
*dev, u64 *count)
+{
+   struct papr_scm_perf_stat *stat;
+   struct papr_scm_perf_stats *stats;
+   struct papr_scm_priv *p = (struct papr_scm_priv *)dev->driver_data;
+   int rc, size;
+
+   /* Allocate request buffer enough to hold single performance stat */
+   size = sizeof(struct papr_scm_perf_stats) +
+   sizeof(struct papr_scm_perf_stat);
+
+   if (!p || !p->nvdimm_events_map)
+   return -EINVAL;
+
+   stats = kzalloc(size, GFP_KERNEL);
+   if (!stats)
+   return -ENOMEM;
+
+   stat = &stats->scm_statistic[0];
+   memcpy(&stat->stat_id,
+  p->nvdimm_events_map[event->attr.config],
+   sizeof(stat->stat_id));
+   stat->stat_val = 0;
+
+   rc = drc_pmem_query_stats(p, stats, 1);
+   if (rc < 0) {
+   kfree(stats);
+   return rc;
+   }
+
+   *count = be64_to_cpu(stat->stat_val);
+   kfree(stats);
+   return 0;
+}
+
+static int papr_scm_pmu_event_init(struct perf_event *event)
+{
+   struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
+   struct papr_scm_priv *p;
+
+   if (!nd_pmu)
+   return 

[PATCH v5 2/4] drivers/nvdimm: Add perf interface to expose nvdimm performance stats

2021-09-28 Thread Kajol Jain
A common interface is added to get performance stats reporting
support for nvdimm devices. Added interface defines supported
event list, config fields for the event attributes and their
corresponding bit values which are exported via sysfs.

Interface also added support for pmu register/unregister functions,
cpu hotplug feature along with macros for handling events addition
via sysfs. It adds attribute groups for format, cpumask and events
to the pmu structure.

User could use the standard perf tool to access perf events exposed
via nvdimm pmu.

Signed-off-by: Kajol Jain 
[Make hotplug function static as reported by kernel test robot]
Reported-by: kernel test robot  
---
 drivers/nvdimm/Makefile  |   1 +
 drivers/nvdimm/nd_perf.c | 328 +++
 include/linux/nd.h   |  21 +++
 3 files changed, 350 insertions(+)
 create mode 100644 drivers/nvdimm/nd_perf.c

diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
index 29203f3d3069..25dba6095612 100644
--- a/drivers/nvdimm/Makefile
+++ b/drivers/nvdimm/Makefile
@@ -18,6 +18,7 @@ nd_e820-y := e820.o
 libnvdimm-y := core.o
 libnvdimm-y += bus.o
 libnvdimm-y += dimm_devs.o
+libnvdimm-y += nd_perf.o
 libnvdimm-y += dimm.o
 libnvdimm-y += region_devs.o
 libnvdimm-y += region.o
diff --git a/drivers/nvdimm/nd_perf.c b/drivers/nvdimm/nd_perf.c
new file mode 100644
index ..314415894acf
--- /dev/null
+++ b/drivers/nvdimm/nd_perf.c
@@ -0,0 +1,328 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * nd_perf.c: NVDIMM Device Performance Monitoring Unit support
+ *
+ * Perf interface to expose nvdimm performance stats.
+ *
+ * Copyright (C) 2021 IBM Corporation
+ */
+
+#define pr_fmt(fmt) "nvdimm_pmu: " fmt
+
+#include 
+
+#define EVENT(_name, _code) enum{_name = _code}
+
+/*
+ * NVDIMM Events codes.
+ */
+
+/* Controller Reset Count */
+EVENT(CTL_RES_CNT, 0x1);
+/* Controller Reset Elapsed Time */
+EVENT(CTL_RES_TM,  0x2);
+/* Power-on Seconds */
+EVENT(POWERON_SECS,0x3);
+/* Life Remaining */
+EVENT(MEM_LIFE,0x4);
+/* Critical Resource Utilization */
+EVENT(CRI_RES_UTIL,0x5);
+/* Host Load Count */
+EVENT(HOST_L_CNT,  0x6);
+/* Host Store Count */
+EVENT(HOST_S_CNT,  0x7);
+/* Host Store Duration */
+EVENT(HOST_S_DUR,  0x8);
+/* Host Load Duration */
+EVENT(HOST_L_DUR,  0x9);
+/* Media Read Count */
+EVENT(MED_R_CNT,   0xa);
+/* Media Write Count */
+EVENT(MED_W_CNT,   0xb);
+/* Media Read Duration */
+EVENT(MED_R_DUR,   0xc);
+/* Media Write Duration */
+EVENT(MED_W_DUR,   0xd);
+/* Cache Read Hit Count */
+EVENT(CACHE_RH_CNT,0xe);
+/* Cache Write Hit Count */
+EVENT(CACHE_WH_CNT,0xf);
+/* Fast Write Count */
+EVENT(FAST_W_CNT,  0x10);
+
+NVDIMM_EVENT_ATTR(ctl_res_cnt, CTL_RES_CNT);
+NVDIMM_EVENT_ATTR(ctl_res_tm,  CTL_RES_TM);
+NVDIMM_EVENT_ATTR(poweron_secs,POWERON_SECS);
+NVDIMM_EVENT_ATTR(mem_life,MEM_LIFE);
+NVDIMM_EVENT_ATTR(cri_res_util,CRI_RES_UTIL);
+NVDIMM_EVENT_ATTR(host_l_cnt,  HOST_L_CNT);
+NVDIMM_EVENT_ATTR(host_s_cnt,  HOST_S_CNT);
+NVDIMM_EVENT_ATTR(host_s_dur,  HOST_S_DUR);
+NVDIMM_EVENT_ATTR(host_l_dur,  HOST_L_DUR);
+NVDIMM_EVENT_ATTR(med_r_cnt,   MED_R_CNT);
+NVDIMM_EVENT_ATTR(med_w_cnt,   MED_W_CNT);
+NVDIMM_EVENT_ATTR(med_r_dur,   MED_R_DUR);
+NVDIMM_EVENT_ATTR(med_w_dur,   MED_W_DUR);
+NVDIMM_EVENT_ATTR(cache_rh_cnt,CACHE_RH_CNT);
+NVDIMM_EVENT_ATTR(cache_wh_cnt,CACHE_WH_CNT);
+NVDIMM_EVENT_ATTR(fast_w_cnt,  FAST_W_CNT);
+
+static struct attribute *nvdimm_events_attr[] = {
+   NVDIMM_EVENT_PTR(CTL_RES_CNT),
+   NVDIMM_EVENT_PTR(CTL_RES_TM),
+   NVDIMM_EVENT_PTR(POWERON_SECS),
+   NVDIMM_EVENT_PTR(MEM_LIFE),
+   NVDIMM_EVENT_PTR(CRI_RES_UTIL),
+   NVDIMM_EVENT_PTR(HOST_L_CNT),
+   NVDIMM_EVENT_PTR(HOST_S_CNT),
+   NVDIMM_EVENT_PTR(HOST_S_DUR),
+   NVDIMM_EVENT_PTR(HOST_L_DUR),
+   NVDIMM_EVENT_PTR(MED_R_CNT),
+   NVDIMM_EVENT_PTR(MED_W_CNT),
+   NVDIMM_EVENT_PTR(MED_R_DUR),
+   NVDIMM_EVENT_PTR(MED_W_DUR),
+   NVDIMM_EVENT_PTR(CACHE_RH_CNT),
+   NVDIMM_EVENT_PTR(CACHE_WH_CNT),
+   NVDIMM_EVENT_PTR(FAST_W_CNT),
+   NULL
+};
+
+static struct attribute_group nvdimm_pmu_events_group = {
+   .name = "events",
+   .attrs = nvdimm_events_attr,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-4");
+
+static struct attribute *nvdimm_pmu_format_attr[] = {
+   &format_attr_event.attr,
+   NULL,
+};
+
+static struct attribute_group nvdimm_pmu_format_group = {
+   .name = "format",
+   .attrs = nvdimm_pmu_format_attr,
+};
+
+ssize_t nvdimm_events_sysfs_show(struct device *dev,
+struct device_attribute *attr, char *page)
+{
+   struct 

[PATCH v5 1/4] drivers/nvdimm: Add nvdimm pmu structure

2021-09-28 Thread Kajol Jain
A structure is added called nvdimm_pmu, for performance
stats reporting support of nvdimm devices. It can be used to add
device pmu data such as pmu data structure for performance
stats, nvdimm device pointer along with cpumask attributes.

Signed-off-by: Kajol Jain 
---
 include/linux/nd.h | 20 
 1 file changed, 20 insertions(+)

diff --git a/include/linux/nd.h b/include/linux/nd.h
index ee9ad76afbba..f5ed4db2d859 100644
--- a/include/linux/nd.h
+++ b/include/linux/nd.h
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 
 
 enum nvdimm_event {
NVDIMM_REVALIDATE_POISON,
@@ -23,6 +24,25 @@ enum nvdimm_claim_class {
NVDIMM_CCLASS_UNKNOWN,
 };
 
+/**
+ * struct nvdimm_pmu - data structure for nvdimm perf driver
+ * @pmu: pmu data structure for nvdimm performance stats.
+ * @dev: nvdimm device pointer.
+ * @cpu: designated cpu for counter access.
+ * @node: node for cpu hotplug notifier link.
+ * @cpuhp_state: state for cpu hotplug notification.
+ * @arch_cpumask: cpumask to get designated cpu for counter access.
+ */
+struct nvdimm_pmu {
+   struct pmu pmu;
+   struct device *dev;
+   int cpu;
+   struct hlist_node node;
+   enum cpuhp_state cpuhp_state;
+   /* cpumask provided by arch/platform specific code */
+   struct cpumask arch_cpumask;
+};
+
 struct nd_device_driver {
struct device_driver drv;
unsigned long type;
-- 
2.26.2



[PATCH] powerpc: fix unbalanced node refcount in check_kvm_guest()

2021-09-28 Thread Nathan Lynch
When check_kvm_guest() succeeds in looking up a /hypervisor OF node, it
returns without performing a matching put for the lookup, leaving the
node's reference count elevated.

Add the necessary call to of_node_put(), rearranging the code slightly to
avoid repetition or goto.

Signed-off-by: Nathan Lynch 
Fixes: 107c55005fbd ("powerpc/pseries: Add KVM guest doorbell restrictions")
---
 arch/powerpc/kernel/firmware.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/firmware.c b/arch/powerpc/kernel/firmware.c
index c7022c41cc31..20328f72f9f2 100644
--- a/arch/powerpc/kernel/firmware.c
+++ b/arch/powerpc/kernel/firmware.c
@@ -31,11 +31,10 @@ int __init check_kvm_guest(void)
if (!hyper_node)
return 0;
 
-   if (!of_device_is_compatible(hyper_node, "linux,kvm"))
-   return 0;
-
-   static_branch_enable(&kvm_guest);
+   if (of_device_is_compatible(hyper_node, "linux,kvm"))
+   static_branch_enable(&kvm_guest);
 
+   of_node_put(hyper_node);
return 0;
 }
 core_initcall(check_kvm_guest); // before kvm_guest_init()
-- 
2.31.1



Re: [PATCH v8 1/2] powerpc/pseries: Interface to represent PAPR firmware attributes

2021-09-28 Thread Pratik Sampat

Hello Greg,

Thank you for your review.

On 28/09/21 5:38 pm, Greg KH wrote:

On Tue, Sep 28, 2021 at 05:21:01PM +0530, Pratik R. Sampat wrote:

Adds a generic interface to represent the energy and frequency related
PAPR attributes on the system using the new H_CALL
"H_GET_ENERGY_SCALE_INFO".

H_GET_EM_PARMS H_CALL was previously responsible for exporting this
information in the lparcfg, however the H_GET_EM_PARMS H_CALL
will be deprecated P10 onwards.

The H_GET_ENERGY_SCALE_INFO H_CALL is of the following call format:
hcall(
   uint64 H_GET_ENERGY_SCALE_INFO,  // Get energy scale info
   uint64 flags,   // Per the flag request
   uint64 firstAttributeId,// The attribute id
   uint64 bufferAddress,   // Guest physical address of the output buffer
   uint64 bufferSize   // The size in bytes of the output buffer
);

This H_CALL can query either all the attributes at once with
firstAttributeId = 0, flags = 0 as well as query only one attribute
at a time with firstAttributeId = id, flags = 1.

The output buffer consists of the following
1. number of attributes  - 8 bytes
2. array offset to the data location - 8 bytes
3. version info  - 1 byte
4. A data array of size num attributes, which contains the following:
   a. attribute ID  - 8 bytes
   b. attribute value in number - 8 bytes
   c. attribute name in string  - 64 bytes
   d. attribute value in string - 64 bytes

The new H_CALL exports information in direct string value format, hence
a new interface has been introduced in
/sys/firmware/papr/energy_scale_info to export this information to
userspace in an extensible pass-through format.

The H_CALL returns the name, numeric value and string value (if exists)

The format of exposing the sysfs information is as follows:
/sys/firmware/papr/energy_scale_info/
|-- /
  |-- desc
  |-- value
  |-- value_desc (if exists)
|-- /
  |-- desc
  |-- value
  |-- value_desc (if exists)
...

The energy information that is exported is useful for userspace tools
such as powerpc-utils. Currently these tools infer the
"power_mode_data" value in the lparcfg, which in turn is obtained from
the to be deprecated H_GET_EM_PARMS H_CALL.
On future platforms, such userspace utilities will have to look at the
data returned from the new H_CALL being populated in this new sysfs
interface and report this information directly without the need of
interpretation.

Signed-off-by: Pratik R. Sampat 
Reviewed-by: Gautham R. Shenoy 
Reviewed-by: Fabiano Rosas 
Reviewed-by: Kajol Jain 
---
  .../sysfs-firmware-papr-energy-scale-info |  26 ++
  arch/powerpc/include/asm/hvcall.h |  24 +-
  arch/powerpc/kvm/trace_hv.h   |   1 +
  arch/powerpc/platforms/pseries/Makefile   |   3 +-
  .../pseries/papr_platform_attributes.c| 312 ++
  5 files changed, 364 insertions(+), 2 deletions(-)
  create mode 100644 
Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
  create mode 100644 arch/powerpc/platforms/pseries/papr_platform_attributes.c

diff --git a/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info 
b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
new file mode 100644
index ..139a576c7c9d
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
@@ -0,0 +1,26 @@
+What:  /sys/firmware/papr/energy_scale_info
+Date:  June 2021
+Contact:   Linux for PowerPC mailing list 
+Description:   Directory hosting a set of platform attributes like
+   energy/frequency on Linux running as a PAPR guest.
+
+   Each file in a directory contains a platform
+   attribute hierarchy pertaining to performance/
+   energy-savings mode and processor frequency.
+
+What:  /sys/firmware/papr/energy_scale_info/
+   /sys/firmware/papr/energy_scale_info//desc
+   /sys/firmware/papr/energy_scale_info//value
+   /sys/firmware/papr/energy_scale_info//value_desc
+Date:  June 2021
+Contact:   Linux for PowerPC mailing list 
+Description:   Energy, frequency attributes directory for POWERVM servers
+
+   This directory provides energy, frequency, folding information. 
It
+   contains below sysfs attributes:
+
+   - desc: String description of the attribute 
+
+   - value: Numeric value of attribute 
+
+   - value_desc: String value of attribute 

Can you just make 4 different entries in this file, making it easier to
parse and extend over time?


Do you mean I only create one file per attribute and populate it with 4
different entries as follows?

# cat /sys/firmware/papr/energy_scale_info/
id:
desc:
value:
value_desc:




diff --git a/arch/powerpc/include/asm/hvcall.h 
b/arch/powerpc/include/asm/hvcall.h
index 9bcf345cb208..38980fef7a3d 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ 

[PATCH v5 0/4] Add perf interface to expose nvdimm

2021-09-28 Thread Kajol Jain
Patchset adds performance stats reporting support for nvdimm.
Added interface includes support for pmu register/unregister
functions. A structure is added called nvdimm_pmu to be used for
adding arch/platform specific data such as cpumask, nvdimm device
pointer and pmu event functions like event_init/add/read/del.
User could use the standard perf tool to access perf events
exposed via pmu.

Interface also defines supported event list, config fields for the
event attributes and their corresponding bit values which are exported
via sysfs. Patch 3 exposes IBM pseries platform nmem* device
performance stats using this interface.

Result from power9 pseries lpar with 2 nvdimm device:

Ex: List all event by perf list

command:# perf list nmem

  nmem0/cache_rh_cnt/[Kernel PMU event]
  nmem0/cache_wh_cnt/[Kernel PMU event]
  nmem0/cri_res_util/[Kernel PMU event]
  nmem0/ctl_res_cnt/ [Kernel PMU event]
  nmem0/ctl_res_tm/  [Kernel PMU event]
  nmem0/fast_w_cnt/  [Kernel PMU event]
  nmem0/host_l_cnt/  [Kernel PMU event]
  nmem0/host_l_dur/  [Kernel PMU event]
  nmem0/host_s_cnt/  [Kernel PMU event]
  nmem0/host_s_dur/  [Kernel PMU event]
  nmem0/med_r_cnt/   [Kernel PMU event]
  nmem0/med_r_dur/   [Kernel PMU event]
  nmem0/med_w_cnt/   [Kernel PMU event]
  nmem0/med_w_dur/   [Kernel PMU event]
  nmem0/mem_life/[Kernel PMU event]
  nmem0/poweron_secs/[Kernel PMU event]
  ...
  nmem1/mem_life/[Kernel PMU event]
  nmem1/poweron_secs/[Kernel PMU event]

Patch1:
Introduces the nvdimm_pmu structure
Patch2:
Adds common interface to add arch/platform specific data
includes nvdimm device pointer, pmu data along with
pmu event functions. It also defines supported event list
and adds attribute groups for format, events and cpumask.
It also adds code for cpu hotplug support.
Patch3:
Add code in arch/powerpc/platform/pseries/papr_scm.c to expose
nmem* pmu. It fills in the nvdimm_pmu structure with pmu name,
capabilities, cpumask and event functions and then registers
the pmu by adding callbacks to register_nvdimm_pmu.
Patch4:
Sysfs documentation patch

Changelog
---
v4 -> v5:
- Remove multiple variables defined in nvdimm_pmu structure include
  name and pmu functions(event_init/add/del/read) as they are just
  used to copy them again in pmu variable. Now we are directly doing
  this step in arch specific code as suggested by Dan Williams.

- Remove attribute group field from nvdimm pmu structure and
  defined these attribute groups in common interface which
  includes format, event list along with cpumask as suggested by
  Dan Williams.
  Since we added static definition for attribute groups needed in
  common interface, remove corresponding code from papr.

- Add nvdimm pmu event list with event codes in the common interface.

- Remove Acked-by/Reviewed-by/Tested-by tags as code is refactored
  to handle review comments from Dan.

- Make nvdimm_pmu_free_hotplug_memory function static as reported
  by kernel test robot, also add corresponding Reported-by tag.

- Link to the patchset v4: https://lkml.org/lkml/2021/9/3/45

v3 -> v4
- Rebase code on top of current papr_scm code without any logical
  changes.

- Added Acked-by tag from Peter Zijlstra and Reviewed by tag
  from Madhavan Srinivasan.

- Link to the patchset v3: https://lkml.org/lkml/2021/6/17/605

v2 -> v3
- Added Tested-by tag.

- Fix nvdimm mailing list in the ABI Documentation.

- Link to the patchset v2: https://lkml.org/lkml/2021/6/14/25

v1 -> v2
- Fix hotplug code by adding pmu migration call
  in case current designated cpu got offline. As
  pointed out by Peter Zijlstra.

- Removed the return -1 part from cpu hotplug offline
  function.

- Link to the patchset v1: https://lkml.org/lkml/2021/6/8/500

Kajol Jain (4):
  drivers/nvdimm: Add nvdimm pmu structure
  drivers/nvdimm: Add perf interface to expose nvdimm performance stats
  powerpc/papr_scm: Add perf interface support
  docs: ABI: sysfs-bus-nvdimm: Document sysfs event format entries for
nvdimm pmu

 Documentation/ABI/testing/sysfs-bus-nvdimm |  35 +++
 arch/powerpc/include/asm/device.h  |   5 +
 arch/powerpc/platforms/pseries/papr_scm.c  | 225 ++
 drivers/nvdimm/Makefile|   1 +
 drivers/nvdimm/nd_perf.c   | 328 +
 include/linux/nd.h |  41 +++
 6 

[PATCH] powerpc/powernv/prd: Unregister OPAL_MSG_PRD2 notifier during module unload

2021-09-28 Thread Vasant Hegde
Commit 587164cd introduced a new opal message type (OPAL_MSG_PRD2) and added
an opal notifier, but I missed unregistering the notifier in the module unload
path. This results in the call trace below if you try to unload and load the
opal_prd module.

Sample calltrace (modprobe -r opal_prd; modprobe opal_prd)
  [  213.335261] BUG: Unable to handle kernel data access on read at 
0xc008192200e0
  [  213.335287] Faulting instruction address: 0xc018d1cc
  [  213.335301] Oops: Kernel access of bad area, sig: 11 [#1]
  [  213.335313] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA PowerNV
  [  213.335736] CPU: 66 PID: 7446 Comm: modprobe Kdump: loaded Tainted: G  
  E 5.14.0prd #759
  [  213.335772] NIP:  c018d1cc LR: c018d2a8 CTR: 
c00cde10
  [  213.335805] REGS: c003c4c0f0a0 TRAP: 0300   Tainted: GE
  (5.14.0prd)
  [  213.335848] MSR:  92009033   CR: 
24224824  XER: 2004
  [  213.335893] CFAR: c018d2a4 DAR: c008192200e0 DSISR: 4000 
IRQMASK: 1
  [  213.335893] GPR00: c018d2a8 c003c4c0f340 c1995300 
c1a5ad08
  [  213.335893] GPR04: c0080e3700d0 c003c4c0f434 c10a8c08 
6e616d65
  [  213.335893] GPR08:  c008192200d0 0001 
c0080e351020
  [  213.335893] GPR12: c00cde10 c00ecb80 c003c4c0fd00 

  [  213.335893] GPR16: 0990 c0080d95 c0080d950990 
c103fd10
  [  213.335893] GPR20: c003c4c0fbc0 0001 c003c4c0fbc0 
c0080e370498
  [  213.335893] GPR24:  c0dab5c8 c1a5ad18 
c1a5ac80
  [  213.335893] GPR28: 0008 0001 c1a5ad00 
c1a5ad00
  [  213.336170] NIP [c018d1cc] notifier_chain_register+0x2c/0xc0
  [  213.336205] LR [c018d2a8] atomic_notifier_chain_register+0x48/0x80
  [  213.336238] Call Trace:
  [  213.336255] [c003c4c0f340] [c2090610] 0xc2090610 
(unreliable)
  [  213.336281] [c003c4c0f3a0] [c018d2b8] 
atomic_notifier_chain_register+0x58/0x80
  [  213.336309] [c003c4c0f3f0] [c00cde8c] 
opal_message_notifier_register+0x7c/0x1e0
  [  213.336345] [c003c4c0f4b0] [c0080e3508ac] 
opal_prd_probe+0x84/0x150 [opal_prd]
  [  213.336382] [c003c4c0f530] [c097acc8] platform_probe+0x78/0x130
  [  213.336416] [c003c4c0f5b0] [c0976520] really_probe+0x110/0x5d0
  [  213.336467] [c003c4c0f630] [c0976b5c] 
__driver_probe_device+0x17c/0x230
  [  213.336512] [c003c4c0f6b0] [c0976c70] 
driver_probe_device+0x60/0x130
  [  213.336556] [c003c4c0f700] [c097746c] 
__driver_attach+0xfc/0x220
  [  213.336592] [c003c4c0f780] [c0972e68] 
bus_for_each_dev+0xa8/0x130
  [  213.336627] [c003c4c0f7e0] [c0975b04] driver_attach+0x34/0x50
  [  213.336661] [c003c4c0f800] [c0974e20] 
bus_add_driver+0x1b0/0x300
  [  213.336696] [c003c4c0f890] [c0978468] 
driver_register+0x98/0x1a0
  [  213.336732] [c003c4c0f900] [c097a878] 
__platform_driver_register+0x38/0x50
  [  213.336768] [c003c4c0f920] [c0080e350dc0] 
opal_prd_driver_init+0x34/0x50 [opal_prd]
  [  213.336804] [c003c4c0f940] [c0012410] 
do_one_initcall+0x60/0x2d0
  [  213.336821] [c003c4c0fa10] [c0262c8c] do_init_module+0x7c/0x320
  [  213.336845] [c003c4c0fa90] [c0266544] load_module+0x3394/0x3650
  [  213.336880] [c003c4c0fc90] [c0266b34] 
__do_sys_finit_module+0xd4/0x160
  [  213.336914] [c003c4c0fdb0] [c0031e00] 
system_call_exception+0x140/0x290
  [  213.336958] [c003c4c0fe10] [c000c764] 
system_call_common+0xf4/0x258

Fixes: 587164cd ("powerpc/powernv: Add new opal message type")
Signed-off-by: Vasant Hegde 
---
 arch/powerpc/platforms/powernv/opal-prd.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/opal-prd.c 
b/arch/powerpc/platforms/powernv/opal-prd.c
index a191f4c60ce7..83305615db9e 100644
--- a/arch/powerpc/platforms/powernv/opal-prd.c
+++ b/arch/powerpc/platforms/powernv/opal-prd.c
@@ -393,6 +393,7 @@ static int opal_prd_probe(struct platform_device *pdev)
rc = opal_message_notifier_register(OPAL_MSG_PRD2, &opal_prd_event_nb);
if (rc) {
pr_err("Couldn't register PRD2 event notifier\n");
+   opal_message_notifier_unregister(OPAL_MSG_PRD, 
&opal_prd_event_nb);
return rc;
}
 
@@ -401,6 +402,8 @@ static int opal_prd_probe(struct platform_device *pdev)
pr_err("failed to register miscdev\n");
opal_message_notifier_unregister(OPAL_MSG_PRD,
&opal_prd_event_nb);
+   opal_message_notifier_unregister(OPAL_MSG_PRD2,
+   &opal_prd_event_nb);
return rc;
}
 
@@ -411,6 +414,7 @@ static int 

Re: [PATCH v8 1/2] powerpc/pseries: Interface to represent PAPR firmware attributes

2021-09-28 Thread Greg KH
On Tue, Sep 28, 2021 at 05:21:01PM +0530, Pratik R. Sampat wrote:
> Adds a generic interface to represent the energy and frequency related
> PAPR attributes on the system using the new H_CALL
> "H_GET_ENERGY_SCALE_INFO".
> 
> H_GET_EM_PARMS H_CALL was previously responsible for exporting this
> information in the lparcfg, however the H_GET_EM_PARMS H_CALL
> will be deprecated P10 onwards.
> 
> The H_GET_ENERGY_SCALE_INFO H_CALL is of the following call format:
> hcall(
>   uint64 H_GET_ENERGY_SCALE_INFO,  // Get energy scale info
>   uint64 flags,   // Per the flag request
>   uint64 firstAttributeId,// The attribute id
>   uint64 bufferAddress,   // Guest physical address of the output buffer
>   uint64 bufferSize   // The size in bytes of the output buffer
> );
> 
> This H_CALL can query either all the attributes at once with
> firstAttributeId = 0, flags = 0 as well as query only one attribute
> at a time with firstAttributeId = id, flags = 1.
> 
> The output buffer consists of the following
> 1. number of attributes  - 8 bytes
> 2. array offset to the data location - 8 bytes
> 3. version info  - 1 byte
> 4. A data array of size num attributes, which contains the following:
>   a. attribute ID  - 8 bytes
>   b. attribute value in number - 8 bytes
>   c. attribute name in string  - 64 bytes
>   d. attribute value in string - 64 bytes
> 
> The new H_CALL exports information in direct string value format, hence
> a new interface has been introduced in
> /sys/firmware/papr/energy_scale_info to export this information to
> userspace in an extensible pass-through format.
> 
> The H_CALL returns the name, numeric value and string value (if exists)
> 
> The format of exposing the sysfs information is as follows:
> /sys/firmware/papr/energy_scale_info/
>|-- /
>  |-- desc
>  |-- value
>  |-- value_desc (if exists)
>|-- /
>  |-- desc
>  |-- value
>  |-- value_desc (if exists)
> ...
> 
> The energy information that is exported is useful for userspace tools
> such as powerpc-utils. Currently these tools infer the
> "power_mode_data" value in the lparcfg, which in turn is obtained from
> the to be deprecated H_GET_EM_PARMS H_CALL.
> On future platforms, such userspace utilities will have to look at the
> data returned from the new H_CALL being populated in this new sysfs
> interface and report this information directly without the need of
> interpretation.
> 
> Signed-off-by: Pratik R. Sampat 
> Reviewed-by: Gautham R. Shenoy 
> Reviewed-by: Fabiano Rosas 
> Reviewed-by: Kajol Jain 
> ---
>  .../sysfs-firmware-papr-energy-scale-info |  26 ++
>  arch/powerpc/include/asm/hvcall.h |  24 +-
>  arch/powerpc/kvm/trace_hv.h   |   1 +
>  arch/powerpc/platforms/pseries/Makefile   |   3 +-
>  .../pseries/papr_platform_attributes.c| 312 ++
>  5 files changed, 364 insertions(+), 2 deletions(-)
>  create mode 100644 
> Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
>  create mode 100644 arch/powerpc/platforms/pseries/papr_platform_attributes.c
> 
> diff --git a/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info 
> b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
> new file mode 100644
> index ..139a576c7c9d
> --- /dev/null
> +++ b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
> @@ -0,0 +1,26 @@
> +What:/sys/firmware/papr/energy_scale_info
> +Date:June 2021
> +Contact: Linux for PowerPC mailing list 
> +Description: Directory hosting a set of platform attributes like
> + energy/frequency on Linux running as a PAPR guest.
> +
> + Each file in a directory contains a platform
> + attribute hierarchy pertaining to performance/
> + energy-savings mode and processor frequency.
> +
> +What:/sys/firmware/papr/energy_scale_info/
> + /sys/firmware/papr/energy_scale_info//desc
> + /sys/firmware/papr/energy_scale_info//value
> + /sys/firmware/papr/energy_scale_info//value_desc
> +Date:June 2021
> +Contact: Linux for PowerPC mailing list 
> +Description: Energy, frequency attributes directory for POWERVM servers
> +
> + This directory provides energy, frequency, folding information. 
> It
> + contains below sysfs attributes:
> +
> + - desc: String description of the attribute 
> +
> + - value: Numeric value of attribute 
> +
> + - value_desc: String value of attribute 

Can you just make 4 different entries in this file, making it easier to
parse and extend over time?


> diff --git a/arch/powerpc/include/asm/hvcall.h 
> b/arch/powerpc/include/asm/hvcall.h
> index 9bcf345cb208..38980fef7a3d 100644
> --- a/arch/powerpc/include/asm/hvcall.h
> +++ b/arch/powerpc/include/asm/hvcall.h
> @@ -323,7 +323,8 @@
>  

[PATCH v8 0/2] Interface to represent PAPR firmware attributes

2021-09-28 Thread Pratik R. Sampat
RFC: https://lkml.org/lkml/2021/6/4/791
PATCH v1: https://lkml.org/lkml/2021/6/16/805
PATCH v2: https://lkml.org/lkml/2021/7/6/138
PATCH v3: https://lkml.org/lkml/2021/7/12/2799
PATCH v4: https://lkml.org/lkml/2021/7/16/532
PATCH v5: https://lkml.org/lkml/2021/7/19/247
PATCH v6: https://lkml.org/lkml/2021/7/20/36
PATCH v7: https://lkml.org/lkml/2021/7/23/26 

Changelog v7-->v8
1. Rebased and tested against 5.15
2. Added a selftest to check if the energy and frequency attributes
   exist and their files are populated

Also, have implemented a POC using this interface for the powerpc-utils'
ppc64_cpu --frequency command-line tool to utilize this information
in userspace.
The POC for the new interface has been sent to the powerpc-utils mailing
list for early review: 
https://groups.google.com/g/powerpc-utils-devel/c/r4i7JnlyQ8s

Sample output from the powerpc-utils tool is as follows:

# ppc64_cpu --frequency
Power and Performance Mode: 
Idle Power Saver Status   : 
Processor Folding Status  :  --> Printed if Idle power save status is 
supported

Platform reported frequencies --> Frequencies reported from the platform's 
H_CALL i.e PAPR interface
min: GHz
max: GHz
static : GHz

Tool Computed frequencies
min: GHz (cpu XX)
max: GHz (cpu XX)
avg: GHz

Pratik R. Sampat (2):
  powerpc/pseries: Interface to represent PAPR firmware attributes
  selftest/powerpc: Add PAPR sysfs attributes sniff test

 .../sysfs-firmware-papr-energy-scale-info |  26 ++
 arch/powerpc/include/asm/hvcall.h |  24 +-
 arch/powerpc/kvm/trace_hv.h   |   1 +
 arch/powerpc/platforms/pseries/Makefile   |   3 +-
 .../pseries/papr_platform_attributes.c| 312 ++
 tools/testing/selftests/powerpc/Makefile  |   1 +
 .../powerpc/papr_attributes/.gitignore|   2 +
 .../powerpc/papr_attributes/Makefile  |   7 +
 .../powerpc/papr_attributes/attr_test.c   | 107 ++
 9 files changed, 481 insertions(+), 2 deletions(-)
 create mode 100644 
Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
 create mode 100644 arch/powerpc/platforms/pseries/papr_platform_attributes.c
 create mode 100644 tools/testing/selftests/powerpc/papr_attributes/.gitignore
 create mode 100644 tools/testing/selftests/powerpc/papr_attributes/Makefile
 create mode 100644 tools/testing/selftests/powerpc/papr_attributes/attr_test.c

-- 
2.31.1



[PATCH v8 2/2] selftest/powerpc: Add PAPR sysfs attributes sniff test

2021-09-28 Thread Pratik R. Sampat
Include a testcase to check that the sysfs directories for the energy and
frequency related attributes exist and that their attribute files are populated

Signed-off-by: Pratik R. Sampat 
---
 tools/testing/selftests/powerpc/Makefile  |   1 +
 .../powerpc/papr_attributes/.gitignore|   2 +
 .../powerpc/papr_attributes/Makefile  |   7 ++
 .../powerpc/papr_attributes/attr_test.c   | 107 ++
 4 files changed, 117 insertions(+)
 create mode 100644 tools/testing/selftests/powerpc/papr_attributes/.gitignore
 create mode 100644 tools/testing/selftests/powerpc/papr_attributes/Makefile
 create mode 100644 tools/testing/selftests/powerpc/papr_attributes/attr_test.c

diff --git a/tools/testing/selftests/powerpc/Makefile 
b/tools/testing/selftests/powerpc/Makefile
index 0830e63818c1..c68c872efb23 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -30,6 +30,7 @@ SUB_DIRS = alignment  \
   eeh  \
   vphn \
   math \
+  papr_attributes  \
   ptrace   \
   security
 
diff --git a/tools/testing/selftests/powerpc/papr_attributes/.gitignore 
b/tools/testing/selftests/powerpc/papr_attributes/.gitignore
new file mode 100644
index ..9c8cb54c8b28
--- /dev/null
+++ b/tools/testing/selftests/powerpc/papr_attributes/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+attr_test
\ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/papr_attributes/Makefile 
b/tools/testing/selftests/powerpc/papr_attributes/Makefile
new file mode 100644
index ..135886f200ad
--- /dev/null
+++ b/tools/testing/selftests/powerpc/papr_attributes/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+TEST_GEN_PROGS := attr_test
+
+top_srcdir = ../../../../..
+include ../../lib.mk
+
+$(TEST_GEN_PROGS): ../harness.c ../utils.c
diff --git a/tools/testing/selftests/powerpc/papr_attributes/attr_test.c 
b/tools/testing/selftests/powerpc/papr_attributes/attr_test.c
new file mode 100644
index ..905e2cbb3863
--- /dev/null
+++ b/tools/testing/selftests/powerpc/papr_attributes/attr_test.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PAPR Energy attributes sniff test
+ * This checks if the papr folders and contents are populated relating to
+ * the energy and frequency attributes
+ *
+ * Copyright 2021, Pratik Rajesh Sampat, IBM Corp.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "utils.h"
+
+enum energy_freq_attrs {
+   POWER_PERFORMANCE_MODE = 1,
+   IDLE_POWER_SAVER_STATUS = 2,
+   MIN_FREQ = 3,
+   STAT_FREQ = 4,
+   MAX_FREQ = 6,
+   PROC_FOLDING_STATUS = 8
+};
+
+enum type {
+   INVALID,
+   STR_VAL,
+   NUM_VAL
+};
+
+int value_type(int id)
+{
+   int val_type;
+
+   switch(id) {
+   case POWER_PERFORMANCE_MODE:
+   case IDLE_POWER_SAVER_STATUS:
+   val_type = STR_VAL;
+   break;
+   case MIN_FREQ:
+   case STAT_FREQ:
+   case MAX_FREQ:
+   case PROC_FOLDING_STATUS:
+   val_type = NUM_VAL;
+   break;
+   default:
+   val_type = INVALID;
+   }
+
+   return val_type;
+}
+
+int verify_energy_info()
+{
+   const char *path = "/sys/firmware/papr/energy_scale_info";
+   struct dirent *entry;
+   struct stat s;
+   DIR *dirp;
+
+   if (stat(path, &s) || !S_ISDIR(s.st_mode))
+   return -1;
+   dirp = opendir(path);
+
+   while ((entry = readdir(dirp)) != NULL) {
+   char file_name[64];
+   int id, attr_type;
+   FILE *f;
+
+   if (strcmp(entry->d_name,".") == 0 ||
+   strcmp(entry->d_name,"..") == 0)
+   continue;
+
+   id = atoi(entry->d_name);
+   attr_type = value_type(id);
+   if (attr_type == INVALID)
+   return -1;
+
+   /* Check if the files exist and have data in them */
+   sprintf(file_name, "%s/%d/desc", path, id);
+   f = fopen(file_name, "r");
+   if (!f || fgetc(f) == EOF)
+   return -1;
+
+   sprintf(file_name, "%s/%d/value", path, id);
+   f = fopen(file_name, "r");
+   if (!f || fgetc(f) == EOF)
+   return -1;
+
+   if (attr_type == STR_VAL) {
+   sprintf(file_name, "%s/%d/value_desc", path, id);
+   f = fopen(file_name, "r");
+   if (!f || fgetc(f) == EOF)
+   return -1;
+   }
+   }
+
+   return 0;
+}
+
+int main(void)
+{
+   return test_harness(verify_energy_info, "papr_attributes");
+}
-- 
2.31.1



[PATCH v8 1/2] powerpc/pseries: Interface to represent PAPR firmware attributes

2021-09-28 Thread Pratik R. Sampat
Adds a generic interface to represent the energy and frequency related
PAPR attributes on the system using the new H_CALL
"H_GET_ENERGY_SCALE_INFO".

H_GET_EM_PARMS H_CALL was previously responsible for exporting this
information in the lparcfg, however the H_GET_EM_PARMS H_CALL
will be deprecated P10 onwards.

The H_GET_ENERGY_SCALE_INFO H_CALL is of the following call format:
hcall(
  uint64 H_GET_ENERGY_SCALE_INFO,  // Get energy scale info
  uint64 flags,   // Per the flag request
  uint64 firstAttributeId,// The attribute id
  uint64 bufferAddress,   // Guest physical address of the output buffer
  uint64 bufferSize   // The size in bytes of the output buffer
);

This H_CALL can query either all the attributes at once with
firstAttributeId = 0, flags = 0 as well as query only one attribute
at a time with firstAttributeId = id, flags = 1.

The output buffer consists of the following
1. number of attributes  - 8 bytes
2. array offset to the data location - 8 bytes
3. version info  - 1 byte
4. A data array of size num attributes, which contains the following:
  a. attribute ID  - 8 bytes
  b. attribute value in number - 8 bytes
  c. attribute name in string  - 64 bytes
  d. attribute value in string - 64 bytes

The new H_CALL exports information in direct string value format, hence
a new interface has been introduced in
/sys/firmware/papr/energy_scale_info to export this information to
userspace in an extensible pass-through format.

The H_CALL returns the name, numeric value and string value (if exists)

The format of exposing the sysfs information is as follows:
/sys/firmware/papr/energy_scale_info/
   |-- <id>/
 |-- desc
 |-- value
 |-- value_desc (if exists)
   |-- <id>/
 |-- desc
 |-- value
 |-- value_desc (if exists)
...

The energy information that is exported is useful for userspace tools
such as powerpc-utils. Currently these tools infer the
"power_mode_data" value in the lparcfg, which in turn is obtained from
the to be deprecated H_GET_EM_PARMS H_CALL.
On future platforms, such userspace utilities will have to look at the
data returned from the new H_CALL being populated in this new sysfs
interface and report this information directly without the need of
interpretation.

Signed-off-by: Pratik R. Sampat 
Reviewed-by: Gautham R. Shenoy 
Reviewed-by: Fabiano Rosas 
Reviewed-by: Kajol Jain 
---
 .../sysfs-firmware-papr-energy-scale-info |  26 ++
 arch/powerpc/include/asm/hvcall.h |  24 +-
 arch/powerpc/kvm/trace_hv.h   |   1 +
 arch/powerpc/platforms/pseries/Makefile   |   3 +-
 .../pseries/papr_platform_attributes.c| 312 ++
 5 files changed, 364 insertions(+), 2 deletions(-)
 create mode 100644 
Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
 create mode 100644 arch/powerpc/platforms/pseries/papr_platform_attributes.c

diff --git a/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info 
b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
new file mode 100644
index ..139a576c7c9d
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-firmware-papr-energy-scale-info
@@ -0,0 +1,26 @@
+What:  /sys/firmware/papr/energy_scale_info
+Date:  June 2021
+Contact:   Linux for PowerPC mailing list 
+Description:   Directory hosting a set of platform attributes like
+   energy/frequency on Linux running as a PAPR guest.
+
+   Each file in a directory contains a platform
+   attribute hierarchy pertaining to performance/
+   energy-savings mode and processor frequency.
+
+What:  /sys/firmware/papr/energy_scale_info/<id>
+   /sys/firmware/papr/energy_scale_info/<id>/desc
+   /sys/firmware/papr/energy_scale_info/<id>/value
+   /sys/firmware/papr/energy_scale_info/<id>/value_desc
+Date:  June 2021
+Contact:   Linux for PowerPC mailing list 
+Description:   Energy, frequency attributes directory for POWERVM servers
+
+   This directory provides energy, frequency, folding information. 
It
+   contains below sysfs attributes:
+
+   - desc: String description of the attribute 
+
+   - value: Numeric value of attribute 
+
+   - value_desc: String value of attribute 
diff --git a/arch/powerpc/include/asm/hvcall.h 
b/arch/powerpc/include/asm/hvcall.h
index 9bcf345cb208..38980fef7a3d 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -323,7 +323,8 @@
 #define H_SCM_PERFORMANCE_STATS 0x418
 #define H_RPT_INVALIDATE   0x448
 #define H_SCM_FLUSH0x44C
-#define MAX_HCALL_OPCODE   H_SCM_FLUSH
+#define H_GET_ENERGY_SCALE_INFO0x450
+#define MAX_HCALL_OPCODE   H_GET_ENERGY_SCALE_INFO
 
 /* Scope args for H_SCM_UNBIND_ALL */
 #define H_UNBIND_SCOPE_ALL (0x1)
@@ -641,6 +642,27 @@ struct hv_gpci_request_buffer {

Re: [PATCH v4 4/8] PCI: replace pci_dev::driver usage that gets the driver name

2021-09-28 Thread Uwe Kleine-König
On Tue, Sep 28, 2021 at 12:01:28PM +0200, Simon Horman wrote:
> On Mon, Sep 27, 2021 at 10:43:22PM +0200, Uwe Kleine-König wrote:
> > From: Uwe Kleine-König 
> > 
> > struct pci_dev::driver holds (apart from a constant offset) the same
> > data as struct pci_dev::dev->driver. With the goal to remove struct
> > pci_dev::driver to get rid of data duplication replace getting the
> > driver name by dev_driver_string() which implicitly makes use of struct
> > pci_dev::dev->driver.
> > 
> > Signed-off-by: Uwe Kleine-König 
> 
> ...
> 
> > diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c 
> > b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
> > index 0685ece1f155..23dfb599c828 100644
> > --- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
> > +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
> > @@ -202,7 +202,7 @@ nfp_get_drvinfo(struct nfp_app *app, struct pci_dev 
> > *pdev,
> >  {
> > char nsp_version[ETHTOOL_FWVERS_LEN] = {};
> >  
> > -   strlcpy(drvinfo->driver, pdev->driver->name, sizeof(drvinfo->driver));
> > +   strlcpy(drvinfo->driver, dev_driver_string(&pdev->dev), 
> > sizeof(drvinfo->driver));
> 
> I'd slightly prefer to maintain lines under 80 columns wide.
> But not nearly strongly enough to engage in a long debate about it.

:-)

Looking at the output of

git grep strlcpy.\*sizeof

I wonder if it would be sensible to introduce something like

#define strlcpy_array(arr, src) (strlcpy(arr, src, sizeof(arr)) + 
__must_be_array(arr))

but not sure this is possible without a long debate either (and this
line is over 80 chars wide, too :-).

> In any case, for the NFP portion of this patch.
> 
> Acked-by: Simon Horman 

Thanks
Uwe

-- 
Pengutronix e.K.   | Uwe Kleine-König|
Industrial Linux Solutions | https://www.pengutronix.de/ |


signature.asc
Description: PGP signature


Re: [next-20210827][ppc][multipathd] INFO: task hung in dm_table_add_target

2021-09-28 Thread Abdul Haleem


On 9/1/21 7:06 PM, Christoph Hellwig wrote:

On Wed, Sep 01, 2021 at 04:47:26PM +0530, Abdul Haleem wrote:

Greetings

multiple task hung while adding the vfc disk back to the multipath on my
powerpc box running linux-next kernel

Can you retry to reproduce this with lockdep enabled to see if there
is anything interesting holding this lock?


LOCKDEP was earlier enabled by default

# cat .config | grep LOCKDEP
CONFIG_LOCKDEP_SUPPORT=y

BTW, Recreated again on 5.15.0-rc2 mainline kernel and attaching the logs

--
Regards

Abdul Haleem
IBM Linux Technology Center

device-mapper: multipath: 253:1: Reinstating path 8:16.
device-mapper: multipath: 253:0: Failing path 8:0.
device-mapper: multipath: 253:0: Failing path 8:32.
device-mapper: multipath: 253:0: Failing path 8:192.
device-mapper: multipath: 253:0: Failing path 8:208.
device-mapper: multipath: 253:0: Reinstating path 8:0.
device-mapper: multipath: 253:0: Reinstating path 8:32.
device-mapper: multipath: 253:0: Reinstating path 8:192.
device-mapper: multipath: 253:0: Reinstating path 8:208.
INFO: task multipathd:881519 blocked for more than 122 seconds.
  Not tainted 5.15.0-rc2+ #1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:multipathd  state:D stack:0 pid:881519 ppid: 1 flags:0x00040082
Call Trace:
[c00096eff2b0] [c000ae18dd10] 0xc000ae18dd10 (unreliable)
[c00096eff4a0] [c001ea68] __switch_to+0x288/0x4a0
[c00096eff500] [c0e07bfc] __schedule+0x30c/0x9f0
[c00096eff5c0] [c0e08348] schedule+0x68/0x120
[c00096eff5f0] [c0e08930] schedule_preempt_disabled+0x20/0x30
[c00096eff610] [c0e0aedc] __mutex_lock.isra.11+0x36c/0x700
[c00096eff6a0] [c0788e0c] bd_link_disk_holder+0x3c/0x280
[c00096eff6f0] [c00800fb5848] dm_get_table_device+0x1f0/0x2d0 [dm_mod]
[c00096eff790] [c00800fb9ce8] dm_get_device+0x130/0x2f0 [dm_mod]
[c00096eff840] [c008011553b4] multipath_ctr+0x9cc/0x1000 [dm_multipath]
[c00096eff9c0] [c00800fba704] dm_table_add_target+0x1ac/0x420 [dm_mod]
[c00096effa80] [c00800fc0a04] table_load+0x16c/0x4c0 [dm_mod]
[c00096effb30] [c00800fc3734] ctl_ioctl+0x28c/0x7e0 [dm_mod]
[c00096effd40] [c00800fc3ca8] dm_ctl_ioctl+0x20/0x40 [dm_mod]
[c00096effd60] [c0545db8] sys_ioctl+0xf8/0x150
[c00096effdb0] [c0031074] system_call_exception+0x174/0x370
[c00096effe10] [c000c74c] system_call_common+0xec/0x250
--- interrupt: c00 at 0x7fffb86ac010
NIP:  7fffb86ac010 LR: 7fffb8a86924 CTR: 
REGS: c00096effe80 TRAP: 0c00   Not tainted  (5.15.0-rc2+)
MSR:  8000d033   CR: 24042204  XER: 
IRQMASK: 0 
GPR00: 0036 7fffb7cec3a0 7fffb8797300 0005 
GPR04: c138fd09 7fffb0069c90 7fffb8a8a118 7fffb7cea298 
GPR08: 0005    
GPR12:  7fffb7cf6300 7fffb0069c90 7fffb8a89e80 
GPR16: 7fffb8a89e80 7fffb8a89e80 7fffb8ac3670  
GPR20: 7fffb8ac2040 7fffb8a93460 7fffb0069cc0 01001f65ab80 
GPR24: 7fffb8a89e80 7fffb8a89e80 7fffb8a89e80  
GPR28: 7fffb8a89e80 7fffb8a89e80  7fffb8a89e80 
NIP [7fffb86ac010] 0x7fffb86ac010
LR [7fffb8a86924] 0x7fffb8a86924
--- interrupt: c00
INFO: task systemd-udevd:881738 blocked for more than 122 seconds.
  Not tainted 5.15.0-rc2+ #1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:systemd-udevd   state:D stack:0 pid:881738 ppid:   708 flags:0x00042482
Call Trace:
[c006b317b280] [c07640a4] bio_associate_blkg+0x44/0xb0 (unreliable)
[c006b317b470] [c001ea68] __switch_to+0x288/0x4a0
[c006b317b4d0] [c0e07bfc] __schedule+0x30c/0x9f0
[c006b317b590] [c0e08348] schedule+0x68/0x120
[c006b317b5c0] [c0e08adc] io_schedule+0x2c/0x50
[c006b317b5f0] [c03ea624] __lock_page+0x1e4/0x430
[c006b317b6d0] [c0407fc8] truncate_inode_pages_range+0x338/0x8b0
[c006b317b850] [c0725714] kill_bdev.isra.14+0x44/0x60
[c006b317b880] [c07261f4] blkdev_flush_mapping+0x54/0x260
[c006b317b960] [c0726488] blkdev_put_whole+0x88/0x90
[c006b317b9a0] [c072714c] blkdev_put+0x1cc/0x280
[c006b317ba00] [c0727e9c] blkdev_close+0x3c/0x60
[c006b317ba30] [c0525694] __fput+0xc4/0x350
[c006b317ba80] [c0191128] task_work_run+0xf8/0x170
[c006b317bad0] [c0161c34] do_exit+0x4a4/0xd30
[c006b317bba0] [c0162594] do_group_exit+0x64/0xe0
[c006b317bbe0] [c0177fb8] get_signal+0x258/0xce0
[c006b317bcd0] [c00219d4] do_notify_resume+0x114/0x480
[c006b317bd80] [c0030e40] 
interrupt_exit_user_prepare_main+0x1a0/0x260
[c006b317bde0] [c00312e0] 

[PATCH kernel] powerpc/pseries/dma: Add support for 2M IOMMU page size

2021-09-28 Thread Alexey Kardashevskiy
The upcoming PAPR spec adds a 2M page size, bit 23 right after the 16G page
size in the "ibm,query-pe-dma-window" call.

This adds support for the new page size. Since the new page size is out
of sorted order, this changes the loop to not assume that shift[] is
sorted.

Signed-off-by: Alexey Kardashevskiy 
---

This might not work if PHYP keeps rejecting new window requests for less
than 32768 TCEs. This is needed:
https://github.com/aik/linux/commit/8cc8fa5ba5b3b4a18efbc9d81d9e5b85ca7c8a95


---
 arch/powerpc/platforms/pseries/iommu.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index c741689a5165..237bf405b0cb 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -1159,14 +1159,15 @@ static void reset_dma_window(struct pci_dev *dev, 
struct device_node *par_dn)
 /* Return largest page shift based on "IO Page Sizes" output of 
ibm,query-pe-dma-window. */
 static int iommu_get_page_shift(u32 query_page_size)
 {
-   /* Supported IO page-sizes according to LoPAR */
+   /* Supported IO page-sizes according to LoPAR, note that 2M is out of 
order */
const int shift[] = {
__builtin_ctzll(SZ_4K),   __builtin_ctzll(SZ_64K), 
__builtin_ctzll(SZ_16M),
__builtin_ctzll(SZ_32M),  __builtin_ctzll(SZ_64M), 
__builtin_ctzll(SZ_128M),
-   __builtin_ctzll(SZ_256M), __builtin_ctzll(SZ_16G)
+   __builtin_ctzll(SZ_256M), __builtin_ctzll(SZ_16G), 
__builtin_ctzll(SZ_2M)
};
 
int i = ARRAY_SIZE(shift) - 1;
+   int ret = 0;
 
/*
 * On LoPAR, ibm,query-pe-dma-window outputs "IO Page Sizes" using a 
bit field:
@@ -1176,11 +1177,10 @@ static int iommu_get_page_shift(u32 query_page_size)
 */
for (; i >= 0 ; i--) {
if (query_page_size & (1 << i))
-   return shift[i];
+   ret = max(ret, shift[i]);
}
 
-   /* No valid page size found. */
-   return 0;
+   return ret;
 }
 
 static struct property *ddw_property_create(const char *propname, u32 liobn, 
u64 dma_addr,
-- 
2.30.2



Re: [PATCH v2 4/4] s390: Use generic version of arch_is_kernel_initmem_freed()

2021-09-28 Thread Heiko Carstens
On Tue, Sep 28, 2021 at 09:15:37AM +0200, Christophe Leroy wrote:
> Generic version of arch_is_kernel_initmem_freed() now does the same
> as s390 version.
> 
> Remove the s390 version.
> 
> Cc: Gerald Schaefer 
> Signed-off-by: Christophe Leroy 
> ---
> v2: No change
> ---
>  arch/s390/include/asm/sections.h | 12 
>  arch/s390/mm/init.c  |  3 ---
>  2 files changed, 15 deletions(-)

Looks good. Thanks for cleaning this up!

Acked-by: Heiko Carstens 


Re: [PATCH 1/4] crypto: nintendo-aes - add a new AES driver

2021-09-28 Thread Geert Uytterhoeven
On Wed, Sep 22, 2021 at 4:12 AM Joel Stanley  wrote:
> On Tue, 21 Sept 2021 at 21:47, Emmanuel Gil Peyrot
>  wrote:
> >
> > This engine implements AES in CBC mode, using 128-bit keys only.  It is
> > present on both the Wii and the Wii U, and is apparently identical in
> > both consoles.
> >
> > The hardware is capable of firing an interrupt when the operation is
> > done, but this driver currently uses a busy loop, I’m not too sure
> > whether it would be preferable to switch, nor how to achieve that.
> >
> > It also supports a mode where no operation is done, and thus could be
> > used as a DMA copy engine, but I don’t know how to expose that to the
> > kernel or whether it would even be useful.
> >
> > In my testing, on a Wii U, this driver reaches 80.7 MiB/s, while the
> > aes-generic driver only reaches 30.9 MiB/s, so it is a quite welcome
> > speedup.
> >
> > This driver was written based on reversed documentation, see:
> > https://wiibrew.org/wiki/Hardware/AES
> >
> > Signed-off-by: Emmanuel Gil Peyrot 
> > Tested-by: Emmanuel Gil Peyrot   # on Wii U
> > ---
> >  drivers/crypto/Kconfig|  11 ++
> >  drivers/crypto/Makefile   |   1 +
> >  drivers/crypto/nintendo-aes.c | 273 ++
> >  3 files changed, 285 insertions(+)
> >  create mode 100644 drivers/crypto/nintendo-aes.c
> >
> > diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig
> > index 9a4c275a1335..adc94ad7462d 100644
> > --- a/drivers/crypto/Kconfig
> > +++ b/drivers/crypto/Kconfig
> > @@ -871,4 +871,15 @@ config CRYPTO_DEV_SA2UL
> >
> >  source "drivers/crypto/keembay/Kconfig"
> >
> > +config CRYPTO_DEV_NINTENDO
> > +   tristate "Support for the Nintendo Wii U AES engine"
> > +   depends on WII || WIIU || COMPILE_TEST
>
> This current seteup will allow the driver to be compile tested for
> non-powerpc, which will fail on the dcbf instructions.
>
> Perhaps use this instead:
>
>depends on WII || WIIU || (COMPILE_TEST && PPC)

Or:

depends on PPC
depends on WII || WIIU || COMPILE_TEST

to distinguish between hard and soft dependencies.

Gr{oetje,eeting}s,

Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds


Re: [PATCH v4 4/8] PCI: replace pci_dev::driver usage that gets the driver name

2021-09-28 Thread Kalle Valo
Uwe Kleine-König  writes:

> From: Uwe Kleine-König 
>
> struct pci_dev::driver holds (apart from a constant offset) the same
> data as struct pci_dev::dev->driver. With the goal to remove struct
> pci_dev::driver to get rid of data duplication replace getting the
> driver name by dev_driver_string() which implicitly makes use of struct
> pci_dev::dev->driver.
>
> Signed-off-by: Uwe Kleine-König 
> ---
>  arch/powerpc/include/asm/ppc-pci.h   | 9 -
>  drivers/bcma/host_pci.c  | 7 ---

For bcma:

Acked-by: Kalle Valo 

-- 
https://patchwork.kernel.org/project/linux-wireless/list/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches


[PATCH v2 2/4] mm: Make generic arch_is_kernel_initmem_freed() do what it says

2021-09-28 Thread Christophe Leroy
Commit 7a5da02de8d6 ("locking/lockdep: check for freed initmem in
static_obj()") added arch_is_kernel_initmem_freed() which is supposed
to report whether an object is part of already freed init memory.

For the time being, the generic version of arch_is_kernel_initmem_freed()
always reports 'false', although free_initmem() is generically called
on all architectures.

Therefore, change the generic version of arch_is_kernel_initmem_freed()
to check whether free_initmem() has been called. If so, then check
if a given address falls into init memory.

In order to use function init_section_contains(), the function is
moved to the end of asm-generic/sections.h

Cc: Gerald Schaefer 
Signed-off-by: Christophe Leroy 
---
v2: Change to using the new SYSTEM_FREEING_INITMEM state
---
 include/asm-generic/sections.h | 31 +--
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index d16302d3eb59..13f301239007 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -80,20 +80,6 @@ static inline int arch_is_kernel_data(unsigned long addr)
 }
 #endif
 
-/*
- * Check if an address is part of freed initmem. This is needed on 
architectures
- * with virt == phys kernel mapping, for code that wants to check if an address
- * is part of a static object within [_stext, _end]. After initmem is freed,
- * memory can be allocated from it, and such allocations would then have
- * addresses within the range [_stext, _end].
- */
-#ifndef arch_is_kernel_initmem_freed
-static inline int arch_is_kernel_initmem_freed(unsigned long addr)
-{
-   return 0;
-}
-#endif
-
 /**
  * memory_contains - checks if an object is contained within a memory region
  * @begin: virtual address of the beginning of the memory region
@@ -172,4 +158,21 @@ static inline bool is_kernel_rodata(unsigned long addr)
   addr < (unsigned long)__end_rodata;
 }
 
+/*
+ * Check if an address is part of freed initmem. This is needed on 
architectures
+ * with virt == phys kernel mapping, for code that wants to check if an address
+ * is part of a static object within [_stext, _end]. After initmem is freed,
+ * memory can be allocated from it, and such allocations would then have
+ * addresses within the range [_stext, _end].
+ */
+#ifndef arch_is_kernel_initmem_freed
+static inline int arch_is_kernel_initmem_freed(unsigned long addr)
+{
+   if (system_state < SYSTEM_FREEING_INITMEM)
+   return 0;
+
+   return init_section_contains((void *)addr, 1);
+}
+#endif
+
 #endif /* _ASM_GENERIC_SECTIONS_H_ */
-- 
2.31.1



[PATCH v2 1/4] mm: Create a new system state and fix core_kernel_text()

2021-09-28 Thread Christophe Leroy
core_kernel_text() considers that until system_state is at least
SYSTEM_RUNNING, init memory is valid.

But init memory is freed a few lines before setting SYSTEM_RUNNING,
so we have a small period of time when core_kernel_text() is wrong.

Create an intermediate system state called SYSTEM_FREEING_INITMEM that
is set before starting freeing init memory, and use it in
core_kernel_text() to report init memory invalid earlier.

Cc: Gerald Schaefer 
Signed-off-by: Christophe Leroy 
---
v2: New
---
 include/linux/kernel.h | 1 +
 init/main.c| 2 ++
 kernel/extable.c   | 2 +-
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2776423a587e..471bc0593679 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -248,6 +248,7 @@ extern bool early_boot_irqs_disabled;
 extern enum system_states {
SYSTEM_BOOTING,
SYSTEM_SCHEDULING,
+   SYSTEM_FREEING_INITMEM,
SYSTEM_RUNNING,
SYSTEM_HALT,
SYSTEM_POWER_OFF,
diff --git a/init/main.c b/init/main.c
index 3f7216934441..c457d393fdd4 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1505,6 +1505,8 @@ static int __ref kernel_init(void *unused)
kernel_init_freeable();
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();
+
+   system_state = SYSTEM_FREEING_INITMEM;
kprobe_free_init_mem();
ftrace_free_init_mem();
kgdb_free_init_mem();
diff --git a/kernel/extable.c b/kernel/extable.c
index b0ea5eb0c3b4..290661f68e6b 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -76,7 +76,7 @@ int notrace core_kernel_text(unsigned long addr)
addr < (unsigned long)_etext)
return 1;
 
-   if (system_state < SYSTEM_RUNNING &&
+   if (system_state < SYSTEM_FREEING_INITMEM &&
init_kernel_text(addr))
return 1;
return 0;
-- 
2.31.1



[PATCH v2 3/4] powerpc: Use generic version of arch_is_kernel_initmem_freed()

2021-09-28 Thread Christophe Leroy
Generic version of arch_is_kernel_initmem_freed() now does the same
as powerpc version.

Remove the powerpc version.

Signed-off-by: Christophe Leroy 
---
v2: No change
---
 arch/powerpc/include/asm/sections.h | 13 -
 1 file changed, 13 deletions(-)

diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h
index 6e4af4492a14..79cb7a25a5fb 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -6,21 +6,8 @@
 #include 
 #include 
 
-#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed
-
 #include <asm-generic/sections.h>
 
-extern bool init_mem_is_free;
-
-static inline int arch_is_kernel_initmem_freed(unsigned long addr)
-{
-   if (!init_mem_is_free)
-   return 0;
-
-   return addr >= (unsigned long)__init_begin &&
-   addr < (unsigned long)__init_end;
-}
-
 extern char __head_end[];
 
 #ifdef __powerpc64__
-- 
2.31.1



[PATCH v2 4/4] s390: Use generic version of arch_is_kernel_initmem_freed()

2021-09-28 Thread Christophe Leroy
Generic version of arch_is_kernel_initmem_freed() now does the same
as s390 version.

Remove the s390 version.

Cc: Gerald Schaefer 
Signed-off-by: Christophe Leroy 
---
v2: No change
---
 arch/s390/include/asm/sections.h | 12 
 arch/s390/mm/init.c  |  3 ---
 2 files changed, 15 deletions(-)

diff --git a/arch/s390/include/asm/sections.h b/arch/s390/include/asm/sections.h
index 85881dd48022..3fecaa4e8b74 100644
--- a/arch/s390/include/asm/sections.h
+++ b/arch/s390/include/asm/sections.h
@@ -2,20 +2,8 @@
 #ifndef _S390_SECTIONS_H
 #define _S390_SECTIONS_H
 
-#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed
-
 #include <asm-generic/sections.h>
 
-extern bool initmem_freed;
-
-static inline int arch_is_kernel_initmem_freed(unsigned long addr)
-{
-   if (!initmem_freed)
-   return 0;
-   return addr >= (unsigned long)__init_begin &&
-  addr < (unsigned long)__init_end;
-}
-
 /*
  * .boot.data section contains variables "shared" between the decompressor and
  * the decompressed kernel. The decompressor will store values in them, and
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index a04faf49001a..8c6f258a6183 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -58,8 +58,6 @@ unsigned long empty_zero_page, zero_page_mask;
 EXPORT_SYMBOL(empty_zero_page);
 EXPORT_SYMBOL(zero_page_mask);
 
-bool initmem_freed;
-
 static void __init setup_zero_pages(void)
 {
unsigned int order;
@@ -214,7 +212,6 @@ void __init mem_init(void)
 
 void free_initmem(void)
 {
-   initmem_freed = true;
__set_memory((unsigned long)_sinittext,
 (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT,
 SET_MEMORY_RW | SET_MEMORY_NX);
-- 
2.31.1