date:20240521

Re: [PATCH V3 1/3] vhost-vdpa: flush workers on suspend

2024-05-21 Thread Jason Wang

On Tue, May 21, 2024 at 9:39 PM Steven Sistare
 wrote:
>
> On 5/20/2024 10:28 PM, Jason Wang wrote:
> > On Mon, May 20, 2024 at 11:21 PM Steve Sistare
> >  wrote:
> >>
> >> Flush to guarantee no workers are running when suspend returns.
> >>
> >> Fixes: f345a0143b4d ("vhost-vdpa: uAPI to suspend the device")
> >> Signed-off-by: Steve Sistare 
> >> Acked-by: Eugenio Pérez 
> >> ---
> >>   drivers/vhost/vdpa.c | 3 +++
> >>   1 file changed, 3 insertions(+)
> >>
> >> diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
> >> index ba52d128aeb7..189596caaec9 100644
> >> --- a/drivers/vhost/vdpa.c
> >> +++ b/drivers/vhost/vdpa.c
> >> @@ -594,6 +594,7 @@ static long vhost_vdpa_suspend(struct vhost_vdpa *v)
> >>  struct vdpa_device *vdpa = v->vdpa;
> >>  const struct vdpa_config_ops *ops = vdpa->config;
> >>  int ret;
> >> +   struct vhost_dev *vdev = &v->vdev;
> >>
> >>  if (!(ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK))
> >>  return 0;
> >> @@ -601,6 +602,8 @@ static long vhost_vdpa_suspend(struct vhost_vdpa *v)
> >>  if (!ops->suspend)
> >>  return -EOPNOTSUPP;
> >>
> >> +   vhost_dev_flush(vdev);
> >
> > vhost-vDPA doesn't use workers, see:
> >
> >  vhost_dev_init(dev, vqs, nvqs, 0, 0, 0, false,
> > vhost_vdpa_process_iotlb_msg);
> >
> > So I wonder if this is a must.
>
> True, but I am adding this to be future proof.  I could instead log a warning
> or an error message if vhost_vdpa_suspend is called and 
> v->vdev.use_worker=true,
> but IMO we should just fix it, given that the fix is trivial.

I meant we need to know if it fixes any actual issue or not.

Thanks

>
> - Steve
>
>
>

Re: [PATCH V3 2/3] vduse: suspend

2024-05-21 Thread Jason Wang

On Tue, May 21, 2024 at 9:39 PM Steven Sistare
 wrote:
>
> On 5/20/2024 10:30 PM, Jason Wang wrote:
> > On Mon, May 20, 2024 at 11:21 PM Steve Sistare
> >  wrote:
> >>
> >> Support the suspend operation.  There is little to do, except flush to
> >> guarantee no workers are running when suspend returns.
> >>
> >> Signed-off-by: Steve Sistare 
> >> ---
> >>   drivers/vdpa/vdpa_user/vduse_dev.c | 24 
> >>   1 file changed, 24 insertions(+)
> >>
> >> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c 
> >> b/drivers/vdpa/vdpa_user/vduse_dev.c
> >> index 73c89701fc9d..7dc46f771f12 100644
> >> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> >> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> >> @@ -472,6 +472,18 @@ static void vduse_dev_reset(struct vduse_dev *dev)
> >>  up_write(&dev->rwsem);
> >>   }
> >>
> >> +static void vduse_flush_work(struct vduse_dev *dev)
> >> +{
> >> +   flush_work(&dev->inject);
> >> +
> >> +   for (int i = 0; i < dev->vq_num; i++) {
> >> +   struct vduse_virtqueue *vq = dev->vqs[i];
> >> +
> >> +   flush_work(&vq->inject);
> >> +   flush_work(&vq->kick);
> >> +   }
> >> +}
> >> +
> >>   static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
> >>  u64 desc_area, u64 driver_area,
> >>  u64 device_area)
> >> @@ -724,6 +736,17 @@ static int vduse_vdpa_reset(struct vdpa_device *vdpa)
> >>  return ret;
> >>   }
> >>
> >> +static int vduse_vdpa_suspend(struct vdpa_device *vdpa)
> >> +{
> >> +   struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> >> +
> >> +   down_write(&dev->rwsem);
> >> +   vduse_flush_work(dev);
> >> +   up_write(&dev->rwsem);
> >
> > Can this forbid the new work to be scheduled?
>
> Are you suggesting I return an error below if the dev is suspended?
> I can do that.

I mean the irq injection work can still be scheduled after vduse_vdpa_suspend().

>
> However, I now suspect this implementation of vduse_vdpa_suspend is not
> complete in other ways, so I withdraw this patch pending future work.
> Thanks for looking at it.

Ok.

Thanks

>
> - Steve
>
> > static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
> >  struct work_struct *irq_work,
> >  int irq_effective_cpu)
> > {
> >  int ret = -EINVAL;
> >
> >  down_read(&dev->rwsem);
> >  if (!(dev->status & VIRTIO_CONFIG_S_DRIVER_OK))
> >  goto unlock;
> >
> >  ret = 0;
> >  if (irq_effective_cpu == IRQ_UNBOUND)
> >  queue_work(vduse_irq_wq, irq_work);
> >  else
> >  queue_work_on(irq_effective_cpu,
> >vduse_irq_bound_wq, irq_work);
> > unlock:
> >  up_read(&dev->rwsem);
> >
> >  return ret;
> > }
> >
> > Thanks
> >
> >> +
> >> +   return 0;
> >> +}
> >> +
> >>   static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
> >>   {
> >>  struct vduse_dev *dev = vdpa_to_vduse(vdpa);
> >> @@ -806,6 +829,7 @@ static const struct vdpa_config_ops 
> >> vduse_vdpa_config_ops = {
> >>  .set_vq_affinity= vduse_vdpa_set_vq_affinity,
> >>  .get_vq_affinity= vduse_vdpa_get_vq_affinity,
> >>  .reset  = vduse_vdpa_reset,
> >> +   .suspend= vduse_vdpa_suspend,
> >>  .set_map= vduse_vdpa_set_map,
> >>  .free   = vduse_vdpa_free,
> >>   };
> >> --
> >> 2.39.3
> >>
> >
>

Re: [PATCH V3 3/3] vdpa_sim: flush workers on suspend

2024-05-21 Thread Jason Wang

On Tue, May 21, 2024 at 9:39 PM Steven Sistare
 wrote:
>
> On 5/20/2024 10:32 PM, Jason Wang wrote:
> > On Mon, May 20, 2024 at 11:21 PM Steve Sistare
> >  wrote:
> >>
> >> Flush to guarantee no workers are running when suspend returns.
> >> Add a lock to enforce ordering between clearing running, flushing,
> >> and posting new work in vdpasim_kick_vq.  It must be a spin lock
> >> because vdpasim_kick_vq may be reached via eventfd_write.
> >>
> >> Signed-off-by: Steve Sistare 
> >> ---
> >>   drivers/vdpa/vdpa_sim/vdpa_sim.c | 16 ++--
> >>   drivers/vdpa/vdpa_sim/vdpa_sim.h |  1 +
> >>   2 files changed, 15 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c 
> >> b/drivers/vdpa/vdpa_sim/vdpa_sim.c
> >> index 8ffea8430f95..67ed49d95bf0 100644
> >> --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
> >> +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
> >> @@ -322,7 +322,7 @@ static u16 vdpasim_get_vq_size(struct vdpa_device 
> >> *vdpa, u16 idx)
> >>  return VDPASIM_QUEUE_MAX;
> >>   }
> >>
> >> -static void vdpasim_kick_vq(struct vdpa_device *vdpa, u16 idx)
> >> +static void vdpasim_do_kick_vq(struct vdpa_device *vdpa, u16 idx)
> >>   {
> >>  struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
> >>  struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
> >> @@ -337,6 +337,15 @@ static void vdpasim_kick_vq(struct vdpa_device *vdpa, 
> >> u16 idx)
> >>  vdpasim_schedule_work(vdpasim);
> >>   }
> >>
> >> +static void vdpasim_kick_vq(struct vdpa_device *vdpa, u16 idx)
> >> +{
> >> +   struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
> >> +
> >> +   spin_lock(&vdpasim->kick_lock);
> >> +   vdpasim_do_kick_vq(vdpa, idx);
> >> +   spin_unlock(&vdpasim->kick_lock);
> >> +}
> >> +
> >>   static void vdpasim_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
> >>struct vdpa_callback *cb)
> >>   {
> >> @@ -520,8 +529,11 @@ static int vdpasim_suspend(struct vdpa_device *vdpa)
> >>  struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
> >>
> >>  mutex_lock(&vdpasim->mutex);
> >> +   spin_lock(&vdpasim->kick_lock);
> >>  vdpasim->running = false;
> >> +   spin_unlock(&vdpasim->kick_lock);
> >>  mutex_unlock(&vdpasim->mutex);
> >> +   kthread_flush_work(&vdpasim->work);
> >>
> >>  return 0;
> >>   }
> >> @@ -537,7 +549,7 @@ static int vdpasim_resume(struct vdpa_device *vdpa)
> >>  if (vdpasim->pending_kick) {
> >>  /* Process pending descriptors */
> >>  for (i = 0; i < vdpasim->dev_attr.nvqs; ++i)
> >> -   vdpasim_kick_vq(vdpa, i);
> >> +   vdpasim_do_kick_vq(vdpa, i);
> >>
> >>  vdpasim->pending_kick = false;
> >>  }
> >> diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h 
> >> b/drivers/vdpa/vdpa_sim/vdpa_sim.h
> >> index bb137e479763..5eb6ca9c5ec5 100644
> >> --- a/drivers/vdpa/vdpa_sim/vdpa_sim.h
> >> +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h
> >> @@ -75,6 +75,7 @@ struct vdpasim {
> >>  bool pending_kick;
> >>  /* spinlock to synchronize iommu table */
> >>  spinlock_t iommu_lock;
> >> +   spinlock_t kick_lock;
> >
> > It looks to me this is not initialized?
>
> Yup, I lost that line while fiddling with different locking schemes.
> Thanks, will fix in V4.
>
> @@ -236,6 +236,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr
> *dev_attr,
>
>  mutex_init(&vdpasim->mutex);
>  spin_lock_init(&vdpasim->iommu_lock);
> +   spin_lock_init(&vdpasim->kick_lock);
>
> With that fix, does this patch earn your RB?

Yes.

Thanks

>
> - Steve
>
> >>   };
> >>
> >>   struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *attr,
> >> --
> >> 2.39.3
> >>
> >
>

Re: [PATCH v3 2/2] LoongArch: Add steal time support in guest side

2024-05-21 Thread kernel test robot

Hi Bibo,

kernel test robot noticed the following build warnings:

[auto build test WARNING on 3c999d1ae3c75991902a1a7dad0cb62c2a3008b4]

url:
https://github.com/intel-lab-lkp/linux/commits/Bibo-Mao/LoongArch-KVM-Add-steal-time-support-in-kvm-side/20240521-104902
base:   3c999d1ae3c75991902a1a7dad0cb62c2a3008b4
patch link:
https://lore.kernel.org/r/20240521024556.419436-3-maobibo%40loongson.cn
patch subject: [PATCH v3 2/2] LoongArch: Add steal time support in guest side
config: loongarch-kismet-CONFIG_PARAVIRT-CONFIG_PARAVIRT_TIME_ACCOUNTING-0-0 
(https://download.01.org/0day-ci/archive/20240522/202405221028.qrcedmnq-...@intel.com/config)
reproduce: 
(https://download.01.org/0day-ci/archive/20240522/202405221028.qrcedmnq-...@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot 
| Closes: 
https://lore.kernel.org/oe-kbuild-all/202405221028.qrcedmnq-...@intel.com/

kismet warnings: (new ones prefixed by >>)
>> kismet: WARNING: unmet direct dependencies detected for PARAVIRT when 
>> selected by PARAVIRT_TIME_ACCOUNTING
   

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

[PATCH v2 4/4] selftests/bpf: add test validating uprobe/uretprobe stack traces

2024-05-21 Thread Andrii Nakryiko

Add a set of tests to validate that stack traces captured from or in the
presence of active uprobes and uretprobes are valid and complete.

For this we use BPF programs that are installed either on entry or exit
of user function, plus deep-nested USDT. One of target functions
(target_1) is recursive to generate two different entries in the stack
trace for the same uprobe/uretprobe, testing potential edge conditions.

Without fixes in this patch set, we get something like this for one of
the scenarios:

 caller: 0x758fff - 0x7595ab
 target_1: 0x758fd5 - 0x758fff
 target_2: 0x758fca - 0x758fd5
 target_3: 0x758fbf - 0x758fca
 target_4: 0x758fb3 - 0x758fbf
 ENTRY #0: 0x758fb3 (in target_4)
 ENTRY #1: 0x758fd3 (in target_2)
 ENTRY #2: 0x758ffd (in target_1)
 ENTRY #3: 0x7fffe000
 ENTRY #4: 0x7fffe000
 ENTRY #5: 0x6f8f39
 ENTRY #6: 0x6fa6f0
 ENTRY #7: 0x7f403f229590

Entry #3 and #4 (0x7fffe000) are uretprobe trampoline addresses
which obscure actual target_1 and another target_1 invocations. Also
note that between entry #0 and entry #1 we are missing an entry for
target_3, which is fixed in patch #2.

With all the fixes, we get desired full stack traces:

 caller: 0x758fff - 0x7595ab
 target_1: 0x758fd5 - 0x758fff
 target_2: 0x758fca - 0x758fd5
 target_3: 0x758fbf - 0x758fca
 target_4: 0x758fb3 - 0x758fbf
 ENTRY #0: 0x758fb7 (in target_4)
 ENTRY #1: 0x758fc8 (in target_3)
 ENTRY #2: 0x758fd3 (in target_2)
 ENTRY #3: 0x758ffd (in target_1)
 ENTRY #4: 0x758ff3 (in target_1)
 ENTRY #5: 0x75922c (in caller)
 ENTRY #6: 0x6f8f39
 ENTRY #7: 0x6fa6f0
 ENTRY #8: 0x7f986adc4cd0

Now there is a logical and complete sequence of function calls.

Signed-off-by: Andrii Nakryiko 
---
 .../bpf/prog_tests/uretprobe_stack.c  | 186 ++
 .../selftests/bpf/progs/uretprobe_stack.c |  96 +
 2 files changed, 282 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/uretprobe_stack.c
 create mode 100644 tools/testing/selftests/bpf/progs/uretprobe_stack.c

diff --git a/tools/testing/selftests/bpf/prog_tests/uretprobe_stack.c 
b/tools/testing/selftests/bpf/prog_tests/uretprobe_stack.c
new file mode 100644
index ..6deb8d560ddd
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/uretprobe_stack.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
+
+#include 
+#include "uretprobe_stack.skel.h"
+#include "../sdt.h"
+
+/* We set up target_1() -> target_2() -> target_3() -> target_4() -> USDT()
+ * call chain, each being traced by our BPF program. On entry or return from
+ * each target_*() we are capturing user stack trace and recording it in
+ * global variable, so that user space part of the test can validate it.
+ *
+ * Note, we put each target function into a custom section to get those
+ * __start_XXX/__stop_XXX symbols, generated by linker for us, which allow us
+ * to know address range of those functions
+ */
+__attribute__((section("uprobe__target_4")))
+__weak int target_4(void)
+{
+   STAP_PROBE1(uretprobe_stack, target, 42);
+   return 42;
+}
+
+extern const void *__start_uprobe__target_4;
+extern const void *__stop_uprobe__target_4;
+
+__attribute__((section("uprobe__target_3")))
+__weak int target_3(void)
+{
+   return target_4();
+}
+
+extern const void *__start_uprobe__target_3;
+extern const void *__stop_uprobe__target_3;
+
+__attribute__((section("uprobe__target_2")))
+__weak int target_2(void)
+{
+   return target_3();
+}
+
+extern const void *__start_uprobe__target_2;
+extern const void *__stop_uprobe__target_2;
+
+__attribute__((section("uprobe__target_1")))
+__weak int target_1(int depth)
+{
+   if (depth < 1)
+   return 1 + target_1(depth + 1);
+   else
+   return target_2();
+}
+
+extern const void *__start_uprobe__target_1;
+extern const void *__stop_uprobe__target_1;
+
+extern const void *__start_uretprobe_stack_sec;
+extern const void *__stop_uretprobe_stack_sec;
+
+struct range {
+   long start;
+   long stop;
+};
+
+static struct range targets[] = {
+   {}, /* we want target_1 to map to target[1], so need 1-based indexing */
+   { (long)&__start_uprobe__target_1, (long)&__stop_uprobe__target_1 },
+   { (long)&__start_uprobe__target_2, (long)&__stop_uprobe__target_2 },
+   { (long)&__start_uprobe__target_3, (long)&__stop_uprobe__target_3 },
+   { (long)&__start_uprobe__target_4, (long)&__stop_uprobe__target_4 },
+};
+
+static struct range caller = {
+   (long)&__start_uretprobe_stack_sec,
+   (long)&__stop_uretprobe_stack_sec,
+};
+
+static void validate_stack(__u64 *ips, int stack_len, int cnt, ...)
+{
+   int i, j;
+   va_list args;
+
+   if (!ASSERT_GT(stack_len, 0, "stack_len"))
+   return;
+
+   stack_len /= 8;
+
+   /* check if we have enough entries to satisfy test expectations */
+   if (!ASSERT_GE(stack_len, cnt, "stack_len2"))
+

[PATCH v2 3/4] perf,x86: avoid missing caller address in stack traces captured in uprobe

2024-05-21 Thread Andrii Nakryiko

When tracing user functions with uprobe functionality, it's common to
install the probe (e.g., a BPF program) at the first instruction of the
function. This is often going to be `push %rbp` instruction in function
preamble, which means that within that function frame pointer hasn't
been established yet. This leads to consistently missing an actual
caller of the traced function, because perf_callchain_user() only
records current IP (capturing traced function) and then following frame
pointer chain (which would be caller's frame, containing the address of
caller's caller).

So when we have target_1 -> target_2 -> target_3 call chain and we are
tracing an entry to target_3, captured stack trace will report
target_1 -> target_3 call chain, which is wrong and confusing.

This patch proposes a x86-64-specific heuristic to detect `push %rbp`
instruction being traced. If that's the case, with the assumption that
application is compiled with frame pointers, this instruction would be
a strong indicator that this is the entry to the function. In that case,
return address is still pointed to by %rsp, so we fetch it and add to
stack trace before proceeding to unwind the rest using frame
pointer-based logic.

Signed-off-by: Andrii Nakryiko 
---
 arch/x86/events/core.c  | 20 
 include/linux/uprobes.h |  2 ++
 kernel/events/uprobes.c |  2 ++
 3 files changed, 24 insertions(+)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5b0dd07b1ef1..82d5570b58ff 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2884,6 +2884,26 @@ perf_callchain_user(struct perf_callchain_entry_ctx 
*entry, struct pt_regs *regs
return;
 
pagefault_disable();
+
+#ifdef CONFIG_UPROBES
+   /*
+* If we are called from uprobe handler, and we are indeed at the very
+* entry to user function (which is normally a `push %rbp` instruction,
+* under assumption of application being compiled with frame pointers),
+* we should read return address from *regs->sp before proceeding
+* to follow frame pointers, otherwise we'll skip immediate caller
+* as %rbp is not yet setup.
+*/
+   if (current->utask) {
+   struct arch_uprobe *auprobe = current->utask->auprobe;
+   u64 ret_addr;
+
+   if (auprobe && auprobe->insn[0] == 0x55 /* push %rbp */ &&
+   !__get_user(ret_addr, (const u64 __user *)regs->sp))
+   perf_callchain_store(entry, ret_addr);
+   }
+#endif
+
while (entry->nr < entry->max_stack) {
if (!valid_user_frame(fp, sizeof(frame)))
break;
diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 0c57eec85339..7b785cd30d86 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -76,6 +76,8 @@ struct uprobe_task {
struct uprobe   *active_uprobe;
unsigned long   xol_vaddr;
 
+   struct arch_uprobe  *auprobe;
+
struct return_instance  *return_instances;
unsigned intdepth;
 };
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 1c99380dc89d..504693845187 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2072,6 +2072,7 @@ static void handler_chain(struct uprobe *uprobe, struct 
pt_regs *regs)
bool need_prep = false; /* prepare return uprobe, when needed */
 
down_read(&uprobe->register_rwsem);
+   current->utask->auprobe = &uprobe->arch;
for (uc = uprobe->consumers; uc; uc = uc->next) {
int rc = 0;
 
@@ -2086,6 +2087,7 @@ static void handler_chain(struct uprobe *uprobe, struct 
pt_regs *regs)
 
remove &= rc;
}
+   current->utask->auprobe = NULL;
 
if (need_prep && !remove)
prepare_uretprobe(uprobe, regs); /* put bp at return */
-- 
2.43.0

[PATCH v2 2/4] perf,uprobes: fix user stack traces in the presence of pending uretprobes

2024-05-21 Thread Andrii Nakryiko

When kernel has pending uretprobes installed, it hijacks original user
function return address on the stack with a uretprobe trampoline
address. There could be multiple such pending uretprobes (either on
different user functions or on the same recursive one) at any given
time within the same task.

This approach interferes with the user stack trace capture logic, which
would report surprising addresses (like 0x7fffe000) that correspond
to a special "[uprobes]" section that kernel installs in the target
process address space for uretprobe trampoline code, while logically it
should be an address somewhere within the calling function of another
traced user function.

This is easy to correct for, though. Uprobes subsystem keeps track of
pending uretprobes and records original return addresses. This patch is
using this to do a post-processing step and restore each trampoline
address entries with correct original return address. This is done only
if there are pending uretprobes for current task.

This is a similar approach to what fprobe/kretprobe infrastructure is
doing when capturing kernel stack traces in the presence of pending
return probes.

Reported-by: Riham Selim 
Signed-off-by: Andrii Nakryiko 
---
 kernel/events/callchain.c | 43 ++-
 kernel/events/uprobes.c   |  9 
 2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 1273be84392c..b17e3323f7f6 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "internal.h"
 
@@ -176,13 +177,51 @@ put_callchain_entry(int rctx)
put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
 }
 
+static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry 
*entry,
+  int start_entry_idx)
+{
+#ifdef CONFIG_UPROBES
+   struct uprobe_task *utask = current->utask;
+   struct return_instance *ri;
+   __u64 *cur_ip, *last_ip, tramp_addr;
+
+   if (likely(!utask || !utask->return_instances))
+   return;
+
+   cur_ip = &entry->ip[start_entry_idx];
+   last_ip = &entry->ip[entry->nr - 1];
+   ri = utask->return_instances;
+   tramp_addr = uprobe_get_trampoline_vaddr();
+
+   /*
+* If there are pending uretprobes for the current thread, they are
+* recorded in a list inside utask->return_instances; each such
+* pending uretprobe replaces traced user function's return address on
+* the stack, so when stack trace is captured, instead of seeing
+* actual function's return address, we'll have one or many uretprobe
+* trampoline addresses in the stack trace, which are not helpful and
+* misleading to users.
+* So here we go over the pending list of uretprobes, and each
+* encountered trampoline address is replaced with actual return
+* address.
+*/
+   while (ri && cur_ip <= last_ip) {
+   if (*cur_ip == tramp_addr) {
+   *cur_ip = ri->orig_ret_vaddr;
+   ri = ri->next;
+   }
+   cur_ip++;
+   }
+#endif
+}
+
 struct perf_callchain_entry *
 get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
   u32 max_stack, bool crosstask, bool add_mark)
 {
struct perf_callchain_entry *entry;
struct perf_callchain_entry_ctx ctx;
-   int rctx;
+   int rctx, start_entry_idx;
 
entry = get_callchain_entry();
if (!entry)
@@ -215,7 +254,9 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool 
kernel, bool user,
if (add_mark)
perf_callchain_store_context(&ctx, 
PERF_CONTEXT_USER);
 
+   start_entry_idx = entry->nr;
perf_callchain_user(&ctx, regs);
+   fixup_uretprobe_trampoline_entries(entry, 
start_entry_idx);
}
}
 
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index d60d24f0f2f4..1c99380dc89d 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2149,6 +2149,15 @@ static void handle_trampoline(struct pt_regs *regs)
 
instruction_pointer_set(regs, ri->orig_ret_vaddr);
do {
+   /* pop current instance from the stack of pending 
return instances,
+* as it's not pending anymore: we just fixed up 
original
+* instruction pointer in regs and are about to call 
handlers;
+* this allows fixup_uretprobe_trampoline_entries() to 
properly fix up
+* captured stack traces from uretprobe handlers, in 
which pending
+* trampoline addresses on the stack are replaced with 
correct
+* original

[PATCH v2 1/4] uprobes: rename get_trampoline_vaddr() and make it global

2024-05-21 Thread Andrii Nakryiko

This helper is needed in another file, so make it a bit more uniquely
named and expose it internally.

Signed-off-by: Andrii Nakryiko 
---
 include/linux/uprobes.h | 1 +
 kernel/events/uprobes.c | 6 +++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index f46e0ca0169c..0c57eec85339 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -138,6 +138,7 @@ extern bool arch_uretprobe_is_alive(struct return_instance 
*ret, enum rp_check c
 extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs);
 extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
 void *src, unsigned long len);
+extern unsigned long uprobe_get_trampoline_vaddr(void);
 #else /* !CONFIG_UPROBES */
 struct uprobes_state {
 };
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 8ae0eefc3a34..d60d24f0f2f4 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1827,7 +1827,7 @@ void uprobe_copy_process(struct task_struct *t, unsigned 
long flags)
  *
  * Returns -1 in case the xol_area is not allocated.
  */
-static unsigned long get_trampoline_vaddr(void)
+unsigned long uprobe_get_trampoline_vaddr(void)
 {
struct xol_area *area;
unsigned long trampoline_vaddr = -1;
@@ -1878,7 +1878,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, 
struct pt_regs *regs)
if (!ri)
return;
 
-   trampoline_vaddr = get_trampoline_vaddr();
+   trampoline_vaddr = uprobe_get_trampoline_vaddr();
orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, 
regs);
if (orig_ret_vaddr == -1)
goto fail;
@@ -2187,7 +2187,7 @@ static void handle_swbp(struct pt_regs *regs)
int is_swbp;
 
bp_vaddr = uprobe_get_swbp_addr(regs);
-   if (bp_vaddr == get_trampoline_vaddr())
+   if (bp_vaddr == uprobe_get_trampoline_vaddr())
return handle_trampoline(regs);
 
uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
-- 
2.43.0

[PATCH v2 0/4] Fix user stack traces captured from uprobes

2024-05-21 Thread Andrii Nakryiko

This patch set reports two issues with captured stack traces.

First issue, fixed in patch #2, deals with fixing up uretprobe trampoline
addresses in captured stack trace. This issue happens when there are pending
return probes, for which kernel hijacks some of the return addresses on user
stacks. The code is matching those special uretprobe trampoline addresses with
the list of pending return probe instances and replaces them with actual
return addresses. This is the same fixup logic that fprobe/kretprobe has for
kernel stack traces.

Second issue, which patch #3 is fixing with the help of heuristic, is having
to do with capturing user stack traces in entry uprobes. At the very entrance
to user function, frame pointer in rbp register is not yet setup, so actual
caller return address is still pointed to by rsp. Patch is using a simple
heuristic, looking for `push %rbp` instruction, to fetch this extra direct
caller return address, before proceeding to unwind the stack using rbp.

Patch #4 adds tests into BPF selftests, that validate that captured stack
traces at various points is what we expect to get. This patch, while being BPF
selftests, is isolated from any other BPF selftests changes and can go in
through non-BPF tree without the risk of merge conflicts.

Patches are based on latest linux-trace/probes/for-next.

v1->v2:
  - fixed GCC aggressively inlining test_uretprobe_stack() function (BPF CI);
  - fixed comments (Peter).

Andrii Nakryiko (4):
  uprobes: rename get_trampoline_vaddr() and make it global
  perf,uprobes: fix user stack traces in the presence of pending
uretprobes
  perf,x86: avoid missing caller address in stack traces captured in
uprobe
  selftests/bpf: add test validating uprobe/uretprobe stack traces

 arch/x86/events/core.c|  20 ++
 include/linux/uprobes.h   |   3 +
 kernel/events/callchain.c |  43 +++-
 kernel/events/uprobes.c   |  17 +-
 .../bpf/prog_tests/uretprobe_stack.c  | 186 ++
 .../selftests/bpf/progs/uretprobe_stack.c |  96 +
 6 files changed, 361 insertions(+), 4 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/uretprobe_stack.c
 create mode 100644 tools/testing/selftests/bpf/progs/uretprobe_stack.c

-- 
2.43.0

Re: [PATCHv6 bpf-next 0/9] uprobe: uretprobe speed up

2024-05-21 Thread Alexei Starovoitov

On Tue, May 21, 2024 at 1:49 PM Deepak Gupta  wrote:
>
> On Tue, May 21, 2024 at 12:48:16PM +0200, Jiri Olsa wrote:
> >hi,
> >as part of the effort on speeding up the uprobes [0] coming with
> >return uprobe optimization by using syscall instead of the trap
> >on the uretprobe trampoline.
>
> I understand this provides an optimization on x86. I believe primary reason
> is syscall is straight-line microcode and short sequence while trap delivery
> still does all the GDT / IDT and segmentation checks and it makes delivery
> of the trap slow.
>
> So doing syscall improves that. Although it seems x86 is going to get rid of
> that as part of FRED [1, 2]. And linux kernel support for FRED is already 
> upstream [2].
> So I am imagining x86 hardware already exists with FRED support.
>
> On other architectures, I believe trap delivery for breakpoint instruction
> is same as syscall instruction.
>
> Given that x86 trap delivery is pretty much going following the suit here and
> intend to make trap delivery cost similar to syscall delivery.
>
> Sorry for being buzzkill here but ...
> Is it worth introducing this syscall which otherwise has no use on other 
> arches
> and x86 (and x86 kernel) has already taken steps to match trap delivery 
> latency with
> syscall latency would have similar cost?
>
> Did you do any study of this on FRED enabled x86 CPUs?

afaik CPUs with FRED do not exist on the market and it's
not clear when they will be available.
And when they finally will be on the shelves
the overhead of FRED vs int3 would still have to be measured.
int3 with FRED might still be higher than syscall with FRED.

>
> [1] - 
> https://www.intel.com/content/www/us/en/content-details/780121/flexible-return-and-event-delivery-fred-specification.html
> [2] - https://docs.kernel.org/arch/x86/x86_64/fred.html
>
> >
> >The speed up depends on instruction type that uprobe is installed
> >and depends on specific HW type, please check patch 1 for details.
> >

Re: [PATCHv6 9/9] man2: Add uretprobe syscall page

2024-05-21 Thread Alejandro Colomar

Hi Jirka,

On Tue, May 21, 2024 at 10:24:30PM GMT, Jiri Olsa wrote:
> how about the change below?

Much better.  I still have a few comments below.  :-)

> 
> thanks,
> jirka
> 
> 
> ---
> diff --git a/man/man2/uretprobe.2 b/man/man2/uretprobe.2
> new file mode 100644
> index ..959b7a47102b
> --- /dev/null
> +++ b/man/man2/uretprobe.2
> @@ -0,0 +1,55 @@
> +.\" Copyright (C) 2024, Jiri Olsa 
> +.\"
> +.\" SPDX-License-Identifier: Linux-man-pages-copyleft
> +.\"
> +.TH uretprobe 2 (date) "Linux man-pages (unreleased)"
> +.SH NAME
> +uretprobe \- execute pending return uprobes
> +.SH SYNOPSIS
> +.nf
> +.B int uretprobe(void)
> +.fi
> +.SH DESCRIPTION
> +The
> +.BR uretprobe ()
> +system call is an alternative to breakpoint instructions for triggering 
> return
> +uprobe consumers.
> +.P
> +Calls to
> +.BR uretprobe ()
> +system call are only made from the user-space trampoline provided by the 
> kernel.
> +Calls from any other place result in a
> +.BR SIGILL .
> +.SH RETURN VALUE
> +The
> +.BR uretprobe ()
> +system call return value is architecture-specific.
> +.SH ERRORS
> +.BR SIGILL

This should be a tagged paragraph, preceded with '.TP'.  See any manual
page with an ERRORS section for an example.

Also, BR is Bold alternating with Roman, but this is just bold, so it
should use '.B'.

.TP
.B SIGILL

> +The
> +.BR uretprobe ()
> +system call was called by user.
> +.SH VERSIONS
> +Details of the
> +.BR uretprobe ()
> +system call behavior vary across systems.
> +.SH STANDARDS
> +None.
> +.SH HISTORY
> +TBD
> +.SH NOTES
> +The
> +.BR uretprobe ()
> +system call was initially introduced for the x86_64 architecture where it 
> was shown

We have a strong-ish limit at column 80.  Please break after
'architecture', which is a clause boundary.

Have a lovely night!
Alex

> +to be faster than breakpoint traps.
> +It might be extended to other architectures.
> +.P
> +The
> +.BR uretprobe ()
> +system call exists only to allow the invocation of return uprobe consumers.
> +It should
> +.B never
> +be called directly.
> +Details of the arguments (if any) passed to
> +.BR uretprobe ()
> +and the return value are architecture-specific.
> 

-- 



signature.asc
Description: PGP signature

Re: [PATCHv6 bpf-next 0/9] uprobe: uretprobe speed up

2024-05-21 Thread Deepak Gupta

On Tue, May 21, 2024 at 12:48:16PM +0200, Jiri Olsa wrote:

hi,
as part of the effort on speeding up the uprobes [0] coming with
return uprobe optimization by using syscall instead of the trap
on the uretprobe trampoline.

I understand this provides an optimization on x86. I believe primary reason
is syscall is straight-line microcode and short sequence while trap delivery
still does all the GDT / IDT and segmentation checks and it makes delivery
of the trap slow.

So doing syscall improves that. Although it seems x86 is going to get rid of
that as part of FRED [1, 2]. And linux kernel support for FRED is already upstream [2].

So I am imagining x86 hardware already exists with FRED support.

On other architectures, I believe trap delivery for breakpoint instruction
is same as syscall instruction.

Given that, x86 trap delivery is pretty much going to follow suit here, with the
intent of making trap delivery cost similar to syscall delivery.

Sorry for being buzzkill here but ...
Is it worth introducing this syscall, which otherwise has no use on other arches,
when x86 (and the x86 kernel) has already taken steps to make trap delivery
latency match syscall latency, so that both would have similar cost?

Did you do any study of this on FRED enabled x86 CPUs?

[1] -
https://www.intel.com/content/www/us/en/content-details/780121/flexible-return-and-event-delivery-fred-specification.html
[2] - https://docs.kernel.org/arch/x86/x86_64/fred.html

The speed up depends on instruction type that uprobe is installed
and depends on specific HW type, please check patch 1 for details.

Re: [PATCH RFC 1/2] dt-bindings: soc: qcom,smsm: Allow specifying mboxes instead of qcom,ipc

2024-05-21 Thread Luca Weiss

On Dienstag, 21. Mai 2024 10:58:07 MESZ Krzysztof Kozlowski wrote:
> On 20/05/2024 17:11, Luca Weiss wrote:
> > Hi Krzysztof
> > 
> > Ack, sounds good.
> > 
> > Maybe also from you, any opinion between these two binding styles?
> > 
> > So first using index of mboxes for the numbering, where for the known
> > usages the first element (and sometimes the 3rd - ipc-2) are empty <>.
> > 
> > The second variant is using mbox-names to get the correct channel-mbox
> > mapping.
> > 
> > -   qcom,ipc-1 = < 8 13>;
> > -   qcom,ipc-2 = < 8 9>;
> > -   qcom,ipc-3 = < 8 19>;
> > +   mboxes = <0>, < 13>, < 9>, < 19>;
> > 
> > vs.
> > 
> > -   qcom,ipc-1 = < 8 13>;
> > -   qcom,ipc-2 = < 8 9>;
> > -   qcom,ipc-3 = < 8 19>;
> > +   mboxes = < 13>, < 9>, < 19>;
> > +   mbox-names = "ipc-1", "ipc-2", "ipc-3";
> 
> Sorry, don't get, ipc-1 is the first mailbox, so why would there be <0>
> in first case?

Actually not, ipc-0 would be permissible by the driver, used for the 0th host

e.g. from:

/* Iterate over all hosts to check whom wants a kick */
for (host = 0; host < smsm->num_hosts; host++) {
hostp = &smsm->hosts[host];

Even though no mailbox is specified in any upstream dts for this 0th host I
didn't want the bindings to restrict that, that's why in the first example
there's an empty element (<0>) for the 0th smsm host

> Anyway, the question is if you need to know that some
> mailbox is missing. But then it is weird to name them "ipc-1" etc.

In either case we'd just query the mbox (either by name or index) and then
see if it's there? Not quite sure I understand the sentence..
Pretty sure either binding would work the same way.

Regards
Luca

> 
> Best regards,
> Krzysztof
> 
>

Re: [PATCHv6 9/9] man2: Add uretprobe syscall page

2024-05-21 Thread Jiri Olsa

On Tue, May 21, 2024 at 01:48:59PM +0200, Jiri Olsa wrote:
> On Tue, May 21, 2024 at 01:36:25PM +0200, Alejandro Colomar wrote:
> > Hi Jiri,
> > 
> > On Tue, May 21, 2024 at 12:48:25PM GMT, Jiri Olsa wrote:
> > > Adding man page for new uretprobe syscall.
> > > 
> > > Signed-off-by: Jiri Olsa 
> > > ---
> > >  man2/uretprobe.2 | 50 
> > >  1 file changed, 50 insertions(+)
> > >  create mode 100644 man2/uretprobe.2
> > > 
> > > diff --git a/man2/uretprobe.2 b/man2/uretprobe.2
> > > new file mode 100644
> > > index ..690fe3b1a44f
> > > --- /dev/null
> > > +++ b/man2/uretprobe.2
> > > @@ -0,0 +1,50 @@
> > > +.\" Copyright (C) 2024, Jiri Olsa 
> > > +.\"
> > > +.\" SPDX-License-Identifier: Linux-man-pages-copyleft
> > > +.\"
> > > +.TH uretprobe 2 (date) "Linux man-pages (unreleased)"
> > > +.SH NAME
> > > +uretprobe \- execute pending return uprobes
> > > +.SH SYNOPSIS
> > > +.nf
> > > +.B int uretprobe(void)
> > > +.fi
> > 
> > What header file provides this system call?
> 
> there's no header, it's used/called only by user space trampoline
> provided by kernel, it's not expected to be called by user
> 
> > 
> > > +.SH DESCRIPTION
> > > +The
> > > +.BR uretprobe ()
> > > +syscall is an alternative to breakpoint instructions for
> > > +triggering return uprobe consumers.
> > > +.P
> > > +Calls to
> > > +.BR uretprobe ()
> > > +suscall are only made from the user-space trampoline provided by the 
> > > kernel.
> > 
> > s/suscall/system call/
> 
> ugh leftover sry
> 
> > 
> > > +Calls from any other place result in a
> > > +.BR SIGILL .
> > 
> > Maybe add an ERRORS section?
> > 
> > > +
> > 
> > We don't use blank lines; it causes a groff(1) warning, and other
> > problems.  Instead, use '.P'.
> > 
> > > +.SH RETURN VALUE
> > > +The
> > > +.BR uretprobe ()
> > > +syscall return value is architecture-specific.
> > > +
> > 
> > .P
> > 
> > > +.SH VERSIONS
> > > +This syscall is not specified in POSIX,
> > 
> > Redundant with "STANDARDS: None.".
> > 
> > > +and details of its behavior vary across systems.
> > 
> > Keep this.
> 
> ok
> 
> > 
> > > +.SH STANDARDS
> > > +None.
> > > +.SH HISTORY
> > > +TBD
> > > +.SH NOTES
> > > +The
> > > +.BR uretprobe ()
> > > +syscall was initially introduced for the x86_64 architecture where it 
> > > was shown
> > > +to be faster than breakpoint traps. It might be extended to other 
> > > architectures.
> > 
> > Please use semantic newlines.
> > 
> > $ MANWIDTH=72 man man-pages | sed -n '/Use semantic newlines/,/^$/p'
> >Use semantic newlines
> >  In the source of a manual page, new sentences should be started on
> >  new lines, long sentences should be split  into  lines  at  clause
> >  breaks  (commas,  semicolons, colons, and so on), and long clauses
> >  should be split at phrase boundaries.  This convention,  sometimes
> >  known as "semantic newlines", makes it easier to see the effect of
> >  patches, which often operate at the level of individual sentences,
> >  clauses, or phrases.
> 

how about the change below?

thanks,
jirka


---
diff --git a/man/man2/uretprobe.2 b/man/man2/uretprobe.2
new file mode 100644
index ..959b7a47102b
--- /dev/null
+++ b/man/man2/uretprobe.2
@@ -0,0 +1,55 @@
+.\" Copyright (C) 2024, Jiri Olsa 
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH uretprobe 2 (date) "Linux man-pages (unreleased)"
+.SH NAME
+uretprobe \- execute pending return uprobes
+.SH SYNOPSIS
+.nf
+.B int uretprobe(void)
+.fi
+.SH DESCRIPTION
+The
+.BR uretprobe ()
+system call is an alternative to breakpoint instructions for triggering return
+uprobe consumers.
+.P
+Calls to
+.BR uretprobe ()
+system call are only made from the user-space trampoline provided by the 
kernel.
+Calls from any other place result in a
+.BR SIGILL .
+.SH RETURN VALUE
+The
+.BR uretprobe ()
+system call return value is architecture-specific.
+.SH ERRORS
+.BR SIGILL
+The
+.BR uretprobe ()
+system call was called by user.
+.SH VERSIONS
+Details of the
+.BR uretprobe ()
+system call behavior vary across systems.
+.SH STANDARDS
+None.
+.SH HISTORY
+TBD
+.SH NOTES
+The
+.BR uretprobe ()
+system call was initially introduced for the x86_64 architecture where it was 
shown
+to be faster than breakpoint traps.
+It might be extended to other architectures.
+.P
+The
+.BR uretprobe ()
+system call exists only to allow the invocation of return uprobe consumers.
+It should
+.B never
+be called directly.
+Details of the arguments (if any) passed to
+.BR uretprobe ()
+and the return value are architecture-specific.

Re: [PATCH 06/12] remoteproc: qcom_q6v5_pas: switch to mbn files by default

2024-05-21 Thread Bjorn Andersson

On Tue, May 21, 2024 at 11:49:42AM +0200, neil.armstr...@linaro.org wrote:
> On 21/05/2024 11:45, Dmitry Baryshkov wrote:
> > We have been pushing userspace to use mbn files by default for ages.
> > As a preparation for making the firmware-name optional, make the driver
> > use .mbn instead of .mdt files by default.
> 
> I think we should have a mechanism to fallback to .mdt since downstream
> uses split mdt on the devices filesystem.
> 

Let's ignore and continue to move away from the split .mdt files.

Combining split files is trivial and removes a class of problems where
people mix and match their parts. (And worst case you can rename/symlink
your downstream firmware to match the requested filename)

Regards,
Bjorn

Re: [GIT PULL] remoteproc updates for v6.10

2024-05-21 Thread pr-tracker-bot

The pull request you sent on Mon, 20 May 2024 20:12:20 -0700:

> https://git.kernel.org/pub/scm/linux/kernel/git/remoteproc/linux.git 
> tags/rproc-v6.10

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/ab7b884a34ffda718cb93c772f575e45e8241c62

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html

Re: [GIT PULL] rpmsg updates for v6.10

2024-05-21 Thread pr-tracker-bot

The pull request you sent on Mon, 20 May 2024 19:58:46 -0700:

> https://git.kernel.org/pub/scm/linux/kernel/git/remoteproc/linux.git 
> tags/rpmsg-v6.10

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/e66128fa8e7e38ebd0b0c95578f8020aec6c0dee

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html

Re: [PATCH v2 1/2] drivers: remoteproc: xlnx: add attach detach support

2024-05-21 Thread Mathieu Poirier

Hi Tanmay,

On Fri, May 10, 2024 at 05:51:25PM -0700, Tanmay Shah wrote:
> It is possible that remote processor is already running before
> linux boot or remoteproc platform driver probe. Implement required
> remoteproc framework ops to provide resource table address and
> connect or disconnect with remote processor in such case.
> 
> Signed-off-by: Tanmay Shah 
> ---
> 
> Changes in v2:
>   - Fix following sparse warnings
> 
> drivers/remoteproc/xlnx_r5_remoteproc.c:827:21: sparse:expected struct 
> rsc_tbl_data *rsc_data_va
> drivers/remoteproc/xlnx_r5_remoteproc.c:844:18: sparse:expected struct 
> resource_table *rsc_addr
> drivers/remoteproc/xlnx_r5_remoteproc.c:898:24: sparse:expected void 
> volatile [noderef] __iomem *addr
> 
>  drivers/remoteproc/xlnx_r5_remoteproc.c | 164 +++-
>  1 file changed, 160 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/remoteproc/xlnx_r5_remoteproc.c 
> b/drivers/remoteproc/xlnx_r5_remoteproc.c
> index 84243d1dff9f..039370cffa32 100644
> --- a/drivers/remoteproc/xlnx_r5_remoteproc.c
> +++ b/drivers/remoteproc/xlnx_r5_remoteproc.c
> @@ -25,6 +25,10 @@
>  /* RX mailbox client buffer max length */
>  #define MBOX_CLIENT_BUF_MAX  (IPI_BUF_LEN_MAX + \
>sizeof(struct zynqmp_ipi_message))
> +
> +#define RSC_TBL_XLNX_MAGIC   ((uint32_t)'x' << 24 | (uint32_t)'a' << 16 | \
> +  (uint32_t)'m' << 8 | (uint32_t)'p')
> +
>  /*
>   * settings for RPU cluster mode which
>   * reflects possible values of xlnx,cluster-mode dt-property
> @@ -73,6 +77,15 @@ struct mbox_info {
>   struct mbox_chan *rx_chan;
>  };
>  
> +/* Xilinx Platform specific data structure */
> +struct rsc_tbl_data {
> + const int version;
> + const u32 magic_num;
> + const u32 comp_magic_num;

Why is a complement magic number needed?

> + const u32 rsc_tbl_size;
> + const uintptr_t rsc_tbl;
> +} __packed;
> +
>  /*
>   * Hardcoded TCM bank values. This will stay in driver to maintain backward
>   * compatibility with device-tree that does not have TCM information.
> @@ -95,20 +108,24 @@ static const struct mem_bank_data 
> zynqmp_tcm_banks_lockstep[] = {
>  /**
>   * struct zynqmp_r5_core
>   *
> + * @rsc_tbl_va: resource table virtual address
>   * @dev: device of RPU instance
>   * @np: device node of RPU instance
>   * @tcm_bank_count: number TCM banks accessible to this RPU
>   * @tcm_banks: array of each TCM bank data
>   * @rproc: rproc handle
> + * @rsc_tbl_size: resource table size retrieved from remote
>   * @pm_domain_id: RPU CPU power domain id
>   * @ipi: pointer to mailbox information
>   */
>  struct zynqmp_r5_core {
> + struct resource_table *rsc_tbl_va;

Shouldn't this be of type "void __iomem *"?  Did sparse give you trouble on that
one?

>   struct device *dev;
>   struct device_node *np;
>   int tcm_bank_count;
>   struct mem_bank_data **tcm_banks;
>   struct rproc *rproc;
> + u32 rsc_tbl_size;
>   u32 pm_domain_id;
>   struct mbox_info *ipi;
>  };
> @@ -621,10 +638,19 @@ static int zynqmp_r5_rproc_prepare(struct rproc *rproc)
>  {
>   int ret;
>  
> - ret = add_tcm_banks(rproc);
> - if (ret) {
> - dev_err(&rproc->dev, "failed to get TCM banks, err %d\n", ret);
> - return ret;
> + /**

Using "/**" is for comments that will end up in the documentation, which I don't
think is needed here.  Please correct throughout the patch.

> +  * For attach/detach use case, Firmware is already loaded so
> +  * TCM isn't really needed at all. Also, for security TCM can be
> +  * locked in such case and linux may not have access at all.
> +  * So avoid adding TCM banks. TCM power-domains requested during attach
> +  * callback.
> +  */
> + if (rproc->state != RPROC_DETACHED) {
> + ret = add_tcm_banks(rproc);
> + if (ret) {
> + dev_err(&rproc->dev, "failed to get TCM banks, err 
> %d\n", ret);
> + return ret;
> + }
>   }
>  
>   ret = add_mem_regions_carveout(rproc);
> @@ -662,6 +688,123 @@ static int zynqmp_r5_rproc_unprepare(struct rproc 
> *rproc)
>   return 0;
>  }
>  
> +static struct resource_table *zynqmp_r5_get_loaded_rsc_table(struct rproc 
> *rproc,
> +  size_t *size)
> +{
> + struct zynqmp_r5_core *r5_core;
> +
> + r5_core = rproc->priv;
> +
> + *size = r5_core->rsc_tbl_size;
> +
> + return r5_core->rsc_tbl_va;
> +}
> +
> +static int zynqmp_r5_get_rsc_table_va(struct zynqmp_r5_core *r5_core)
> +{
> + struct device *dev = r5_core->dev;
> + struct rsc_tbl_data *rsc_data_va;
> + struct resource_table *rsc_addr;
> + struct resource res_mem;
> + struct device_node *np;
> + int ret;
> +
> + /**
> +  * It is expected from remote processor firmware to provide resource
> +  * table address via struct

Re: [PATCHv6 bpf-next 1/9] x86/shstk: Make return uprobe work with shadow stack

2024-05-21 Thread Jiri Olsa

On Tue, May 21, 2024 at 04:22:21PM +0200, Oleg Nesterov wrote:
> On 05/21, Jiri Olsa wrote:
> >
> > Currently the application with enabled shadow stack will crash
> > if it sets up return uprobe. The reason is the uretprobe kernel
> > code changes the user space task's stack, but does not update
> > shadow stack accordingly.
> >
> > Adding new functions to update values on shadow stack and using
> > them in uprobe code to keep shadow stack in sync with uretprobe
> > changes to user stack.
> 
> I don't think my ack has any value in this area but looks good to me.
> 
> Reviewed-by: Oleg Nesterov 
> 
> 
> > Fixes: 8b1c23543436 ("x86/shstk: Add return uprobe support")
> 
> Hmm... Was this commit ever applied?

should have been:
  488af8ea7131 x86/shstk: Wire in shadow stack interface

will send new version

thanks,
jirka

> 
> Oleg.
>

Re: [PATCH] rpmsg: char: fix rpmsg_eptdev structure documentation

2024-05-21 Thread Mathieu Poirier

On Fri, May 17, 2024 at 06:56:54PM +0200, Arnaud Pouliquen wrote:
> Add missing @ tags for some rpmsg_eptdev structure parameters.
> 
> This fixes warning messages on build:
> drivers/rpmsg/rpmsg_char.c:75: warning: Function parameter or struct member 
> 'remote_flow_restricted' not described in 'rpmsg_eptdev'
> drivers/rpmsg/rpmsg_char.c:75: warning: Function parameter or struct member 
> 'remote_flow_updated' not described in 'rpmsg_eptdev'
> 
> Fixes: 5550201c0fe2 ("rpmsg: char: Add RPMSG GET/SET FLOWCONTROL IOCTL 
> support")
> 
> Signed-off-by: Arnaud Pouliquen 
> ---
>  drivers/rpmsg/rpmsg_char.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/rpmsg/rpmsg_char.c b/drivers/rpmsg/rpmsg_char.c
> index 1cb8d7474428..98d95ce5b6fb 100644
> --- a/drivers/rpmsg/rpmsg_char.c
> +++ b/drivers/rpmsg/rpmsg_char.c
> @@ -52,8 +52,8 @@ static DEFINE_IDA(rpmsg_minor_ida);
>   * @readq:   wait object for incoming queue
>   * @default_ept: set to channel default endpoint if the default endpoint 
> should be re-used
>   *  on device open to prevent endpoint address update.
> - * remote_flow_restricted: to indicate if the remote has requested for flow 
> to be limited
> - * remote_flow_updated: to indicate if the flow control has been requested
> + * @remote_flow_restricted: to indicate if the remote has requested for flow 
> to be limited
> + * @remote_flow_updated: to indicate if the flow control has been requested

I will apply this patch next week when rc1 comes out.

Thanks,
Mathieu

>   */
>  struct rpmsg_eptdev {
>   struct device dev;
> -- 
> 2.25.1
>

Re: [PATCH] remoteproc: mediatek: Zero out only remaining bytes of IPI buffer

2024-05-21 Thread Mathieu Poirier

On Mon, May 20, 2024 at 01:27:24PM +0200, AngeloGioacchino Del Regno wrote:
> In scp_ipi_handler(), instead of zeroing out the entire shared
> buffer, which may be as large as 600 bytes, overwrite it with the
> received data, then zero out only the remaining bytes.
> 
> Signed-off-by: AngeloGioacchino Del Regno 
> 
> ---
>  drivers/remoteproc/mtk_scp.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/remoteproc/mtk_scp.c b/drivers/remoteproc/mtk_scp.c
> index e5214d43181e..dc70cf7db44d 100644
> --- a/drivers/remoteproc/mtk_scp.c
> +++ b/drivers/remoteproc/mtk_scp.c
> @@ -117,8 +117,8 @@ static void scp_ipi_handler(struct mtk_scp *scp)
>   return;
>   }
>  
> - memset(scp->share_buf, 0, scp_sizes->ipi_share_buffer_size);
>   memcpy_fromio(scp->share_buf, &rcv_obj->share_buf, len);
> + memset(&scp->share_buf[len], 0, scp_sizes->ipi_share_buffer_size - len);

I will apply this patch when rc1 comes out next week.

Thanks,
Mathieu

>   handler(scp->share_buf, len, ipi_desc[id].priv);
>   scp_ipi_unlock(scp, id);
>  
> -- 
> 2.45.1
>

[PATCH] remoteproc: stm32_rproc: Fix mailbox interrupts queuing

2024-05-21 Thread Gwenael Treuveur

Manage interrupt coming from coprocessor also when state is
ATTACHED.

Fixes: 35bdafda40cc ("remoteproc: stm32_rproc: Add mutex protection for 
workqueue")
Signed-off-by: Gwenael Treuveur 
Acked-by: Arnaud Pouliquen 
---
 drivers/remoteproc/stm32_rproc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/remoteproc/stm32_rproc.c b/drivers/remoteproc/stm32_rproc.c
index 88623df7d0c3..8c7f7950b80e 100644
--- a/drivers/remoteproc/stm32_rproc.c
+++ b/drivers/remoteproc/stm32_rproc.c
@@ -294,7 +294,7 @@ static void stm32_rproc_mb_vq_work(struct work_struct *work)
 
mutex_lock(&rproc->lock);
 
-   if (rproc->state != RPROC_RUNNING)
+   if (rproc->state != RPROC_RUNNING && rproc->state != RPROC_ATTACHED)
goto unlock_mutex;
 
if (rproc_vq_interrupt(rproc, mb->vq_id) == IRQ_NONE)

base-commit: 4d5ba6ead1dc9fa298d727e92db40cd98564d1ac
-- 
2.25.1

Re: [PATCH] tools/latency-collector: fix -Wformat-security compile warns

2024-05-21 Thread Steven Rostedt

On Tue, 21 May 2024 09:11:08 -0600
Shuah Khan  wrote:

> Any thoughts on this patch?

Sorry, this one fell through the cracks. Daniel Bristot has been
maintaining his tools and I thought this was one of his changes.

I'll take a look at it.

-- Steve

Re: [PATCH v3 2/9] riscv: mm: Pre-allocate vmemmap/direct map PGD entries

2024-05-21 Thread Björn Töpel

Björn Töpel  writes:

> From: Björn Töpel 
>
> The RISC-V port copies the PGD table from init_mm/swapper_pg_dir to
> all userland page tables, which means that if the PGD level table is
> changed, other page tables has to be updated as well.
>
> Instead of having the PGD changes ripple out to all tables, the
> synchronization can be avoided by pre-allocating the PGD entries/pages
> at boot, avoiding the synchronization all together.
>
> This is currently done for the bpf/modules, and vmalloc PGD regions.
> Extend this scheme for the PGD regions touched by memory hotplugging.
>
> Prepare the RISC-V port for memory hotplug by pre-allocate
> vmemmap/direct map entries at the PGD level. This will roughly waste
> ~128 worth of 4K pages when memory hotplugging is enabled in the
> kernel configuration.
>
> Reviewed-by: Alexandre Ghiti 
> Signed-off-by: Björn Töpel 
> ---
>  arch/riscv/include/asm/kasan.h | 4 ++--
>  arch/riscv/mm/init.c   | 7 +++
>  2 files changed, 9 insertions(+), 2 deletions(-)
>
> diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h
> index 0b85e363e778..e6a0071bdb56 100644
> --- a/arch/riscv/include/asm/kasan.h
> +++ b/arch/riscv/include/asm/kasan.h
> @@ -6,8 +6,6 @@
>  
>  #ifndef __ASSEMBLY__
>  
> -#ifdef CONFIG_KASAN
> -
>  /*
>   * The following comment was copied from arm64:
>   * KASAN_SHADOW_START: beginning of the kernel virtual addresses.
> @@ -34,6 +32,8 @@
>   */
>  #define KASAN_SHADOW_START   ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & 
> PGDIR_MASK)
>  #define KASAN_SHADOW_END MODULES_LOWEST_VADDR
> +
> +#ifdef CONFIG_KASAN
>  #define KASAN_SHADOW_OFFSET  _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
>  
>  void kasan_init(void);
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index b66f846e7634..c98010ede810 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -27,6 +27,7 @@
>  
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -1488,10 +1489,16 @@ static void __init 
> preallocate_pgd_pages_range(unsigned long start, unsigned lon
>   panic("Failed to pre-allocate %s pages for %s area\n", lvl, area);
>  }
>  
> +#define PAGE_END KASAN_SHADOW_START
> +
>  void __init pgtable_cache_init(void)
>  {
>   preallocate_pgd_pages_range(VMALLOC_START, VMALLOC_END, "vmalloc");
>   if (IS_ENABLED(CONFIG_MODULES))
>   preallocate_pgd_pages_range(MODULES_VADDR, MODULES_END, 
> "bpf/modules");
> + if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
> + preallocate_pgd_pages_range(VMEMMAP_START, VMEMMAP_END, 
> "vmemmap");
> + preallocate_pgd_pages_range(PAGE_OFFSET, PAGE_END, "direct 
> map");

Alex pointed out that KASAN PGDs should be preallocated as well! I'll
address this in the next revision.


Björn

Re: [PATCH] tools/latency-collector: fix -Wformat-security compile warns

2024-05-21 Thread Shuah Khan


On 4/3/24 19:10, Shuah Khan wrote:

Fix the following -Wformat-security compile warnings adding missing
format arguments:

latency-collector.c: In function ‘show_available’:
latency-collector.c:938:17: warning: format not a string literal and
no format arguments [-Wformat-security]
   938 | warnx(no_tracer_msg);
   | ^

latency-collector.c:943:17: warning: format not a string literal and
no format arguments [-Wformat-security]
   943 | warnx(no_latency_tr_msg);
   | ^

latency-collector.c: In function ‘find_default_tracer’:
latency-collector.c:986:25: warning: format not a string literal and
no format arguments [-Wformat-security]
   986 | errx(EXIT_FAILURE, no_tracer_msg);
   |
  ^~~~
latency-collector.c: In function ‘scan_arguments’:
latency-collector.c:1881:33: warning: format not a string literal and
no format arguments [-Wformat-security]
  1881 | errx(EXIT_FAILURE, no_tracer_msg);
   | ^~~~

Signed-off-by: Shuah Khan 
---
  tools/tracing/latency/latency-collector.c | 8 
  1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/tracing/latency/latency-collector.c 
b/tools/tracing/latency/latency-collector.c
index 0fd9c747d396..cf263fe9deaf 100644
--- a/tools/tracing/latency/latency-collector.c
+++ b/tools/tracing/latency/latency-collector.c
@@ -935,12 +935,12 @@ static void show_available(void)
}
  
  	if (!tracers) {

-   warnx(no_tracer_msg);
+   warnx("%s", no_tracer_msg);
return;
}
  
  	if (!found) {

-   warnx(no_latency_tr_msg);
+   warnx("%s", no_latency_tr_msg);
tracefs_list_free(tracers);
return;
}
@@ -983,7 +983,7 @@ static const char *find_default_tracer(void)
for (i = 0; relevant_tracers[i]; i++) {
valid = tracer_valid(relevant_tracers[i], &notracer);
if (notracer)
-   errx(EXIT_FAILURE, no_tracer_msg);
+   errx(EXIT_FAILURE, "%s", no_tracer_msg);
if (valid)
return relevant_tracers[i];
}
@@ -1878,7 +1878,7 @@ static void scan_arguments(int argc, char *argv[])
}
valid = tracer_valid(current_tracer, &notracer);
if (notracer)
-   errx(EXIT_FAILURE, no_tracer_msg);
+   errx(EXIT_FAILURE, "%s", no_tracer_msg);
if (!valid)
errx(EXIT_FAILURE,
  "The tracer %s is not supported by your kernel!\n", current_tracer);


Any thoughts on this patch?

thanks,
-- Shuah

Re: [PATCH] uprobes: prevent mutex_lock() under rcu_read_lock()

2024-05-21 Thread Breno Leitao

On Mon, May 20, 2024 at 10:30:17PM -0700, Andrii Nakryiko wrote:
> Recent changes made uprobe_cpu_buffer preparation lazy, and moved it
> deeper into __uprobe_trace_func(). This is problematic because
> __uprobe_trace_func() is called inside rcu_read_lock()/rcu_read_unlock()
> block, which then calls prepare_uprobe_buffer() -> uprobe_buffer_get() ->
> mutex_lock(&ucb->mutex), leading to a splat about using mutex under
> non-sleepable RCU:
> 
>   BUG: sleeping function called from invalid context at 
> kernel/locking/mutex.c:585
>in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 98231, name: 
> stress-ng-sigq
>preempt_count: 0, expected: 0
>RCU nest depth: 1, expected: 0
>...
>Call Trace:
> 
> dump_stack_lvl+0x3d/0xe0
> __might_resched+0x24c/0x270
> ? prepare_uprobe_buffer+0xd5/0x1d0
> __mutex_lock+0x41/0x820
> ? ___perf_sw_event+0x206/0x290
> ? __perf_event_task_sched_in+0x54/0x660
> ? __perf_event_task_sched_in+0x54/0x660
> prepare_uprobe_buffer+0xd5/0x1d0
> __uprobe_trace_func+0x4a/0x140
> uprobe_dispatcher+0x135/0x280
> ? uprobe_dispatcher+0x94/0x280
> uprobe_notify_resume+0x650/0xec0
> ? atomic_notifier_call_chain+0x21/0x110
> ? atomic_notifier_call_chain+0xf8/0x110
> irqentry_exit_to_user_mode+0xe2/0x1e0
> asm_exc_int3+0x35/0x40
>RIP: 0033:0x7f7e1d4da390
>Code: 33 04 00 0f 1f 80 00 00 00 00 f3 0f 1e fa b9 01 00 00 00 e9 b2 fc ff 
> ff 66 90 f3 0f 1e fa 31 c9 e9 a5 fc ff ff 0f 1f 44 00 00  0f 1e fa b8 27 
> 00 00 00 0f 05 c3 0f 1f 40 00 f3 0f 1e fa b8 6e
>RSP: 002b:7ffd2abc3608 EFLAGS: 0246
>RAX:  RBX: 76d325f1 RCX: 
>RDX: 76d325f1 RSI: 000a RDI: 7ffd2abc3690
>RBP: 000a R08: 00017fb7 R09: 00017fb7
>R10: 00017fb7 R11: 0246 R12: 00017ff2
>R13: 7ffd2abc3610 R14:  R15: 7ffd2abc3780
> 
> 
> Luckily, it's easy to fix by moving prepare_uprobe_buffer() to be called
> slightly earlier: into uprobe_trace_func() and uretprobe_trace_func(), outside
> of RCU locked section. This still keeps this buffer preparation lazy and helps
> avoid the overhead when it's not needed. E.g., if there is only BPF uprobe
> handler installed on a given uprobe, buffer won't be initialized.
> 
> Note, the other user of prepare_uprobe_buffer(), __uprobe_perf_func(), is not
> affected, as it doesn't prepare buffer under RCU read lock.
> 
> Fixes: 1b8f85defbc8 ("uprobes: prepare uprobe args buffer lazily")
> Reported-by: Breno Leitao 
> Signed-off-by: Andrii Nakryiko 

Tested-by: Breno Leitao

Re: [PATCH] uprobes: prevent mutex_lock() under rcu_read_lock()

2024-05-21 Thread Oleg Nesterov

On 05/20, Andrii Nakryiko wrote:
>
> Fixes: 1b8f85defbc8 ("uprobes: prepare uprobe args buffer lazily")
> Reported-by: Breno Leitao 
> Signed-off-by: Andrii Nakryiko 
> ---
>  kernel/trace/trace_uprobe.c | 14 +-
>  1 file changed, 9 insertions(+), 5 deletions(-)

Reviewed-by: Oleg Nesterov

Re: [PATCHv6 bpf-next 1/9] x86/shstk: Make return uprobe work with shadow stack

2024-05-21 Thread Oleg Nesterov

On 05/21, Jiri Olsa wrote:
>
> Currently the application with enabled shadow stack will crash
> if it sets up return uprobe. The reason is the uretprobe kernel
> code changes the user space task's stack, but does not update
> shadow stack accordingly.
>
> Adding new functions to update values on shadow stack and using
> them in uprobe code to keep shadow stack in sync with uretprobe
> changes to user stack.

I don't think my ack has any value in this area but looks good to me.

Reviewed-by: Oleg Nesterov 


> Fixes: 8b1c23543436 ("x86/shstk: Add return uprobe support")

Hmm... Was this commit ever applied?

Oleg.

Re: [PATCH v3 5/9] riscv: mm: Add memory hotplugging support

2024-05-21 Thread Oscar Salvador

On Tue, May 21, 2024 at 03:19:37PM +0200, Alexandre Ghiti wrote:
> On Tue, May 21, 2024 at 1:49 PM Björn Töpel  wrote:
> > +   if (PageReserved(page)) {
> > +   __ClearPageReserved(page);
> 
> What's the difference between __ClearPageReserved() and
> ClearPageReserved()? Because it seems like free_reserved_page() calls
> the latter already, so why would you need to call
> __ClearPageReserved() on the first page?

__{Set,Clear}Page are the non-atomic versions.
Usually used when you know that no one else can fiddle with the page, which
should be the case here since we are removing the memory.

As to why we have __ClearPageReserved and then having
free_reserved_page() call ClearPageReserved I do not really know.
Looking at the history, it has always been like this.

I remember I looked at this a few years ago but I cannot remember the outcome
of that.

Maybe David remembers better, but I think we could remove that
__ClearPageReserved.
Looking at powerpc implementation code, it does not do the
__ClearPageReserved and relies only on free_reserved_page().

I will have a look.

-- 
Oscar Salvador
SUSE Labs

Re: [PATCH v3 5/9] riscv: mm: Add memory hotplugging support

2024-05-21 Thread Björn Töpel

Alexandre Ghiti  writes:

> On Tue, May 21, 2024 at 1:49 PM Björn Töpel  wrote:
>>
>> From: Björn Töpel 
>>
>> For an architecture to support memory hotplugging, a couple of
>> callbacks needs to be implemented:
>>
>>  arch_add_memory()
>>   This callback is responsible for adding the physical memory into the
>>   direct map, and call into the memory hotplugging generic code via
>>   __add_pages() that adds the corresponding struct page entries, and
>>   updates the vmemmap mapping.
>>
>>  arch_remove_memory()
>>   This is the inverse of the callback above.
>>
>>  vmemmap_free()
>>   This function tears down the vmemmap mappings (if
>>   CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the
>>   backing vmemmap pages. Note that for persistent memory, an
>>   alternative allocator for the backing pages can be used; The
>>   vmem_altmap. This means that when the backing pages are cleared,
>>   extra care is needed so that the correct deallocation method is
>>   used.
>>
>>  arch_get_mappable_range()
>>   This functions returns the PA range that the direct map can map.
>>   Used by the MHP internals for sanity checks.
>>
>> The page table unmap/teardown functions are heavily based on code from
>> the x86 tree. The same remove_pgd_mapping() function is used in both
>> vmemmap_free() and arch_remove_memory(), but in the latter function
>> the backing pages are not removed.
>>
>> Signed-off-by: Björn Töpel 
>> ---
>>  arch/riscv/mm/init.c | 261 +++
>>  1 file changed, 261 insertions(+)
>>
>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>> index 6f72b0b2b854..6693b742bf2f 100644
>> --- a/arch/riscv/mm/init.c
>> +++ b/arch/riscv/mm/init.c
>> @@ -1493,3 +1493,264 @@ void __init pgtable_cache_init(void)
>> }
>>  }
>>  #endif
>> +
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +static void __meminit free_pagetable(struct page *page, int order)
>> +{
>> +   unsigned int nr_pages = 1 << order;
>> +
>> +   /*
>> +* vmemmap/direct page tables can be reserved, if added at
>> +* boot.
>> +*/
>> +   if (PageReserved(page)) {
>> +   __ClearPageReserved(page);
>
> What's the difference between __ClearPageReserved() and
> ClearPageReserved()? Because it seems like free_reserved_page() calls
> the latter already, so why would you need to call
> __ClearPageReserved() on the first page?

Indeed! x86 copy pasta (which uses bootmem info page that RV doesn't).

>> +   while (nr_pages--)
>> +   free_reserved_page(page++);
>> +   return;
>> +   }
>> +
>> +   free_pages((unsigned long)page_address(page), order);
>> +}
>> +
>> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
>> +{
>> +   pte_t *pte;
>> +   int i;
>> +
>> +   for (i = 0; i < PTRS_PER_PTE; i++) {
>> +   pte = pte_start + i;
>> +   if (!pte_none(*pte))
>> +   return;
>> +   }
>> +
>> +   free_pagetable(pmd_page(*pmd), 0);
>> +   pmd_clear(pmd);
>> +}
>> +
>> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
>> +{
>> +   pmd_t *pmd;
>> +   int i;
>> +
>> +   for (i = 0; i < PTRS_PER_PMD; i++) {
>> +   pmd = pmd_start + i;
>> +   if (!pmd_none(*pmd))
>> +   return;
>> +   }
>> +
>> +   free_pagetable(pud_page(*pud), 0);
>> +   pud_clear(pud);
>> +}
>> +
>> +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
>> +{
>> +   pud_t *pud;
>> +   int i;
>> +
>> +   for (i = 0; i < PTRS_PER_PUD; i++) {
>> +   pud = pud_start + i;
>> +   if (!pud_none(*pud))
>> +   return;
>> +   }
>> +
>> +   free_pagetable(p4d_page(*p4d), 0);
>> +   p4d_clear(p4d);
>> +}
>> +
>> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
>> +  struct vmem_altmap *altmap)
>> +{
>> +   if (altmap)
>> +   vmem_altmap_free(altmap, size >> PAGE_SHIFT);
>> +   else
>> +   free_pagetable(page, get_order(size));
>> +}
>> +
>> +static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long 
>> addr, unsigned long end,
>> +bool is_vmemmap, struct vmem_altmap 
>> *altmap)
>> +{
>> +   unsigned long next;
>> +   pte_t *ptep, pte;
>> +
>> +   for (; addr < end; addr = next) {
>> +   next = (addr + PAGE_SIZE) & PAGE_MASK;
>
> Nit: use ALIGN() instead.
>
>> +   if (next > end)
>> +   next = end;
>> +
>> +   ptep = pte_base + pte_index(addr);
>> +   pte = READ_ONCE(*ptep);
>
> Nit: Use ptep_get()
>
>> +
>> +   if (!pte_present(*ptep))
>> +   continue;
>> +
>> +   pte_clear(&init_mm, addr, ptep);
>> +   if (is_vmemmap)
>> +

Re: [PATCH v3 9/9] riscv: mm: Add support for ZONE_DEVICE

2024-05-21 Thread Björn Töpel

Alexandre Ghiti  writes:

> On Tue, May 21, 2024 at 1:49 PM Björn Töpel  wrote:
>>
>> From: Björn Töpel 
>>
>> ZONE_DEVICE pages need DEVMAP PTEs support to function
>> (ARCH_HAS_PTE_DEVMAP). Claim another RSW (reserved for software) bit
>> in the PTE for DEVMAP mark, add the corresponding helpers, and enable
>> ARCH_HAS_PTE_DEVMAP for riscv64.
>>
>> Signed-off-by: Björn Töpel 
>> ---
>>  arch/riscv/Kconfig|  1 +
>>  arch/riscv/include/asm/pgtable-64.h   | 20 
>>  arch/riscv/include/asm/pgtable-bits.h |  1 +
>>  arch/riscv/include/asm/pgtable.h  | 17 +
>>  4 files changed, 39 insertions(+)
>>
>> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
>> index 2724dc2af29f..0b74698c63c7 100644
>> --- a/arch/riscv/Kconfig
>> +++ b/arch/riscv/Kconfig
>> @@ -36,6 +36,7 @@ config RISCV
>> select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
>> select ARCH_HAS_PMEM_API
>> select ARCH_HAS_PREPARE_SYNC_CORE_CMD
>> +   select ARCH_HAS_PTE_DEVMAP if 64BIT && MMU
>> select ARCH_HAS_PTE_SPECIAL
>> select ARCH_HAS_SET_DIRECT_MAP if MMU
>> select ARCH_HAS_SET_MEMORY if MMU
>> diff --git a/arch/riscv/include/asm/pgtable-64.h 
>> b/arch/riscv/include/asm/pgtable-64.h
>> index 221a5c1ee287..c67a9bbfd010 100644
>> --- a/arch/riscv/include/asm/pgtable-64.h
>> +++ b/arch/riscv/include/asm/pgtable-64.h
>> @@ -400,4 +400,24 @@ static inline struct page *pgd_page(pgd_t pgd)
>>  #define p4d_offset p4d_offset
>>  p4d_t *p4d_offset(pgd_t *pgd, unsigned long address);
>>
>> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
>> +static inline int pte_devmap(pte_t pte);
>> +static inline pte_t pmd_pte(pmd_t pmd);
>> +
>> +static inline int pmd_devmap(pmd_t pmd)
>> +{
>> +   return pte_devmap(pmd_pte(pmd));
>> +}
>> +
>> +static inline int pud_devmap(pud_t pud)
>> +{
>> +   return 0;
>> +}
>> +
>> +static inline int pgd_devmap(pgd_t pgd)
>> +{
>> +   return 0;
>> +}
>> +#endif
>> +
>>  #endif /* _ASM_RISCV_PGTABLE_64_H */
>> diff --git a/arch/riscv/include/asm/pgtable-bits.h 
>> b/arch/riscv/include/asm/pgtable-bits.h
>> index 179bd4afece4..a8f5205cea54 100644
>> --- a/arch/riscv/include/asm/pgtable-bits.h
>> +++ b/arch/riscv/include/asm/pgtable-bits.h
>> @@ -19,6 +19,7 @@
>>  #define _PAGE_SOFT  (3 << 8)/* Reserved for software */
>>
>>  #define _PAGE_SPECIAL   (1 << 8)/* RSW: 0x1 */
>> +#define _PAGE_DEVMAP(1 << 9)/* RSW, devmap */
>>  #define _PAGE_TABLE _PAGE_PRESENT
>>
>>  /*
>> diff --git a/arch/riscv/include/asm/pgtable.h 
>> b/arch/riscv/include/asm/pgtable.h
>> index 7933f493db71..02fadc276064 100644
>> --- a/arch/riscv/include/asm/pgtable.h
>> +++ b/arch/riscv/include/asm/pgtable.h
>> @@ -387,6 +387,13 @@ static inline int pte_special(pte_t pte)
>> return pte_val(pte) & _PAGE_SPECIAL;
>>  }
>>
>> +#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
>> +static inline int pte_devmap(pte_t pte)
>> +{
>> +   return pte_val(pte) & _PAGE_DEVMAP;
>> +}
>> +#endif
>
> Not sure you need the #ifdef here.

Without it, 32-bit builds break (when CONFIG_ARCH_HAS_PTE_DEVMAP is not
defined, a default implementation is provided). Maybe it's cleaner just
to use that instead?

>> +
>>  /* static inline pte_t pte_rdprotect(pte_t pte) */
>>
>>  static inline pte_t pte_wrprotect(pte_t pte)
>> @@ -428,6 +435,11 @@ static inline pte_t pte_mkspecial(pte_t pte)
>> return __pte(pte_val(pte) | _PAGE_SPECIAL);
>>  }
>>
>> +static inline pte_t pte_mkdevmap(pte_t pte)
>> +{
>> +   return __pte(pte_val(pte) | _PAGE_DEVMAP);
>> +}
>> +
>>  static inline pte_t pte_mkhuge(pte_t pte)
>>  {
>> return pte;
>> @@ -711,6 +723,11 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
>> return pte_pmd(pte_mkdirty(pmd_pte(pmd)));
>>  }
>>
>> +static inline pmd_t pmd_mkdevmap(pmd_t pmd)
>> +{
>> +   return pte_pmd(pte_mkdevmap(pmd_pte(pmd)));
>> +}
>> +
>>  static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
>> pmd_t *pmdp, pmd_t pmd)
>>  {
>> --
>> 2.40.1
>>
>
> Otherwise, you can add:
>
> Reviewed-by: Alexandre Ghiti 

Thank you!


Björn

Re: [PATCH v3 9/9] riscv: mm: Add support for ZONE_DEVICE

2024-05-21 Thread Alexandre Ghiti

On Tue, May 21, 2024 at 1:49 PM Björn Töpel  wrote:
>
> From: Björn Töpel 
>
> ZONE_DEVICE pages need DEVMAP PTEs support to function
> (ARCH_HAS_PTE_DEVMAP). Claim another RSW (reserved for software) bit
> in the PTE for DEVMAP mark, add the corresponding helpers, and enable
> ARCH_HAS_PTE_DEVMAP for riscv64.
>
> Signed-off-by: Björn Töpel 
> ---
>  arch/riscv/Kconfig|  1 +
>  arch/riscv/include/asm/pgtable-64.h   | 20 
>  arch/riscv/include/asm/pgtable-bits.h |  1 +
>  arch/riscv/include/asm/pgtable.h  | 17 +
>  4 files changed, 39 insertions(+)
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 2724dc2af29f..0b74698c63c7 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -36,6 +36,7 @@ config RISCV
> select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
> select ARCH_HAS_PMEM_API
> select ARCH_HAS_PREPARE_SYNC_CORE_CMD
> +   select ARCH_HAS_PTE_DEVMAP if 64BIT && MMU
> select ARCH_HAS_PTE_SPECIAL
> select ARCH_HAS_SET_DIRECT_MAP if MMU
> select ARCH_HAS_SET_MEMORY if MMU
> diff --git a/arch/riscv/include/asm/pgtable-64.h 
> b/arch/riscv/include/asm/pgtable-64.h
> index 221a5c1ee287..c67a9bbfd010 100644
> --- a/arch/riscv/include/asm/pgtable-64.h
> +++ b/arch/riscv/include/asm/pgtable-64.h
> @@ -400,4 +400,24 @@ static inline struct page *pgd_page(pgd_t pgd)
>  #define p4d_offset p4d_offset
>  p4d_t *p4d_offset(pgd_t *pgd, unsigned long address);
>
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +static inline int pte_devmap(pte_t pte);
> +static inline pte_t pmd_pte(pmd_t pmd);
> +
> +static inline int pmd_devmap(pmd_t pmd)
> +{
> +   return pte_devmap(pmd_pte(pmd));
> +}
> +
> +static inline int pud_devmap(pud_t pud)
> +{
> +   return 0;
> +}
> +
> +static inline int pgd_devmap(pgd_t pgd)
> +{
> +   return 0;
> +}
> +#endif
> +
>  #endif /* _ASM_RISCV_PGTABLE_64_H */
> diff --git a/arch/riscv/include/asm/pgtable-bits.h 
> b/arch/riscv/include/asm/pgtable-bits.h
> index 179bd4afece4..a8f5205cea54 100644
> --- a/arch/riscv/include/asm/pgtable-bits.h
> +++ b/arch/riscv/include/asm/pgtable-bits.h
> @@ -19,6 +19,7 @@
>  #define _PAGE_SOFT  (3 << 8)/* Reserved for software */
>
>  #define _PAGE_SPECIAL   (1 << 8)/* RSW: 0x1 */
> +#define _PAGE_DEVMAP(1 << 9)/* RSW, devmap */
>  #define _PAGE_TABLE _PAGE_PRESENT
>
>  /*
> diff --git a/arch/riscv/include/asm/pgtable.h 
> b/arch/riscv/include/asm/pgtable.h
> index 7933f493db71..02fadc276064 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -387,6 +387,13 @@ static inline int pte_special(pte_t pte)
> return pte_val(pte) & _PAGE_SPECIAL;
>  }
>
> +#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
> +static inline int pte_devmap(pte_t pte)
> +{
> +   return pte_val(pte) & _PAGE_DEVMAP;
> +}
> +#endif

Not sure you need the #ifdef here.

> +
>  /* static inline pte_t pte_rdprotect(pte_t pte) */
>
>  static inline pte_t pte_wrprotect(pte_t pte)
> @@ -428,6 +435,11 @@ static inline pte_t pte_mkspecial(pte_t pte)
> return __pte(pte_val(pte) | _PAGE_SPECIAL);
>  }
>
> +static inline pte_t pte_mkdevmap(pte_t pte)
> +{
> +   return __pte(pte_val(pte) | _PAGE_DEVMAP);
> +}
> +
>  static inline pte_t pte_mkhuge(pte_t pte)
>  {
> return pte;
> @@ -711,6 +723,11 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
> return pte_pmd(pte_mkdirty(pmd_pte(pmd)));
>  }
>
> +static inline pmd_t pmd_mkdevmap(pmd_t pmd)
> +{
> +   return pte_pmd(pte_mkdevmap(pmd_pte(pmd)));
> +}
> +
>  static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
> pmd_t *pmdp, pmd_t pmd)
>  {
> --
> 2.40.1
>

Otherwise, you can add:

Reviewed-by: Alexandre Ghiti 

Thanks,

Alex

Re: [PATCH V3 3/3] vdpa_sim: flush workers on suspend

2024-05-21 Thread Steven Sistare


On 5/20/2024 10:32 PM, Jason Wang wrote:

On Mon, May 20, 2024 at 11:21 PM Steve Sistare
 wrote:


Flush to guarantee no workers are running when suspend returns.
Add a lock to enforce ordering between clearing running, flushing,
and posting new work in vdpasim_kick_vq.  It must be a spin lock
because vdpasim_kick_vq may be reached via eventfd_write.

Signed-off-by: Steve Sistare 
---
  drivers/vdpa/vdpa_sim/vdpa_sim.c | 16 ++--
  drivers/vdpa/vdpa_sim/vdpa_sim.h |  1 +
  2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c
index 8ffea8430f95..67ed49d95bf0 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
@@ -322,7 +322,7 @@ static u16 vdpasim_get_vq_size(struct vdpa_device *vdpa, 
u16 idx)
 return VDPASIM_QUEUE_MAX;
  }

-static void vdpasim_kick_vq(struct vdpa_device *vdpa, u16 idx)
+static void vdpasim_do_kick_vq(struct vdpa_device *vdpa, u16 idx)
  {
 struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
 struct vdpasim_virtqueue *vq = &vdpasim->vqs[idx];
@@ -337,6 +337,15 @@ static void vdpasim_kick_vq(struct vdpa_device *vdpa, u16 
idx)
 vdpasim_schedule_work(vdpasim);
  }

+static void vdpasim_kick_vq(struct vdpa_device *vdpa, u16 idx)
+{
+   struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
+
+   spin_lock(&vdpasim->kick_lock);
+   vdpasim_do_kick_vq(vdpa, idx);
+   spin_unlock(&vdpasim->kick_lock);
+}
+
  static void vdpasim_set_vq_cb(struct vdpa_device *vdpa, u16 idx,
   struct vdpa_callback *cb)
  {
@@ -520,8 +529,11 @@ static int vdpasim_suspend(struct vdpa_device *vdpa)
 struct vdpasim *vdpasim = vdpa_to_sim(vdpa);

 mutex_lock(&vdpasim->mutex);
+   spin_lock(&vdpasim->kick_lock);
 vdpasim->running = false;
+   spin_unlock(&vdpasim->kick_lock);
 mutex_unlock(&vdpasim->mutex);
+   kthread_flush_work(&vdpasim->work);

 return 0;
  }
@@ -537,7 +549,7 @@ static int vdpasim_resume(struct vdpa_device *vdpa)
 if (vdpasim->pending_kick) {
 /* Process pending descriptors */
 for (i = 0; i < vdpasim->dev_attr.nvqs; ++i)
-   vdpasim_kick_vq(vdpa, i);
+   vdpasim_do_kick_vq(vdpa, i);

 vdpasim->pending_kick = false;
 }
diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h
index bb137e479763..5eb6ca9c5ec5 100644
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.h
+++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h
@@ -75,6 +75,7 @@ struct vdpasim {
 bool pending_kick;
 /* spinlock to synchronize iommu table */
 spinlock_t iommu_lock;
+   spinlock_t kick_lock;


It looks to me this is not initialized?


Yup, I lost that line while fiddling with different locking schemes.
Thanks, will fix in V4.

@@ -236,6 +236,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr 
*dev_attr,


mutex_init(&vdpasim->mutex);
spin_lock_init(&vdpasim->iommu_lock);
+   spin_lock_init(&vdpasim->kick_lock);

With that fix, does this patch earn your RB?

- Steve


  };

  struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *attr,
--
2.39.3

Re: [PATCH V3 2/3] vduse: suspend

2024-05-21 Thread Steven Sistare


On 5/20/2024 10:30 PM, Jason Wang wrote:

On Mon, May 20, 2024 at 11:21 PM Steve Sistare
 wrote:


Support the suspend operation.  There is little to do, except flush to
guarantee no workers are running when suspend returns.

Signed-off-by: Steve Sistare 
---
  drivers/vdpa/vdpa_user/vduse_dev.c | 24 
  1 file changed, 24 insertions(+)

diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c 
b/drivers/vdpa/vdpa_user/vduse_dev.c
index 73c89701fc9d..7dc46f771f12 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -472,6 +472,18 @@ static void vduse_dev_reset(struct vduse_dev *dev)
 up_write(&dev->rwsem);
  }

+static void vduse_flush_work(struct vduse_dev *dev)
+{
+   flush_work(&dev->inject);
+
+   for (int i = 0; i < dev->vq_num; i++) {
+   struct vduse_virtqueue *vq = dev->vqs[i];
+
+   flush_work(&vq->inject);
+   flush_work(&vq->kick);
+   }
+}
+
  static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx,
 u64 desc_area, u64 driver_area,
 u64 device_area)
@@ -724,6 +736,17 @@ static int vduse_vdpa_reset(struct vdpa_device *vdpa)
 return ret;
  }

+static int vduse_vdpa_suspend(struct vdpa_device *vdpa)
+{
+   struct vduse_dev *dev = vdpa_to_vduse(vdpa);
+
+   down_write(&dev->rwsem);
+   vduse_flush_work(dev);
+   up_write(&dev->rwsem);


Can this forbid the new work to be scheduled?


Are you suggesting I return an error below if the dev is suspended?
I can do that.

However, I now suspect this implementation of vduse_vdpa_suspend is not
complete in other ways, so I withdraw this patch pending future work.
Thanks for looking at it.

- Steve


static int vduse_dev_queue_irq_work(struct vduse_dev *dev,
 struct work_struct *irq_work,
 int irq_effective_cpu)
{
 int ret = -EINVAL;

 down_read(&dev->rwsem);
 if (!(dev->status & VIRTIO_CONFIG_S_DRIVER_OK))
 goto unlock;

 ret = 0;
 if (irq_effective_cpu == IRQ_UNBOUND)
 queue_work(vduse_irq_wq, irq_work);
 else
 queue_work_on(irq_effective_cpu,
   vduse_irq_bound_wq, irq_work);
unlock:
 up_read(&dev->rwsem);

 return ret;
}

Thanks


+
+   return 0;
+}
+
  static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa)
  {
 struct vduse_dev *dev = vdpa_to_vduse(vdpa);
@@ -806,6 +829,7 @@ static const struct vdpa_config_ops vduse_vdpa_config_ops = 
{
 .set_vq_affinity= vduse_vdpa_set_vq_affinity,
 .get_vq_affinity= vduse_vdpa_get_vq_affinity,
 .reset  = vduse_vdpa_reset,
+   .suspend= vduse_vdpa_suspend,
 .set_map= vduse_vdpa_set_map,
 .free   = vduse_vdpa_free,
  };
--
2.39.3

Re: [PATCH V3 1/3] vhost-vdpa: flush workers on suspend

2024-05-21 Thread Steven Sistare


On 5/20/2024 10:28 PM, Jason Wang wrote:

On Mon, May 20, 2024 at 11:21 PM Steve Sistare
 wrote:


Flush to guarantee no workers are running when suspend returns.

Fixes: f345a0143b4d ("vhost-vdpa: uAPI to suspend the device")
Signed-off-by: Steve Sistare 
Acked-by: Eugenio Pérez 
---
  drivers/vhost/vdpa.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index ba52d128aeb7..189596caaec9 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -594,6 +594,7 @@ static long vhost_vdpa_suspend(struct vhost_vdpa *v)
 struct vdpa_device *vdpa = v->vdpa;
 const struct vdpa_config_ops *ops = vdpa->config;
 int ret;
+   struct vhost_dev *vdev = &v->vdev;

 if (!(ops->get_status(vdpa) & VIRTIO_CONFIG_S_DRIVER_OK))
 return 0;
@@ -601,6 +602,8 @@ static long vhost_vdpa_suspend(struct vhost_vdpa *v)
 if (!ops->suspend)
 return -EOPNOTSUPP;

+   vhost_dev_flush(vdev);


vhost-vDPA doesn't use workers, see:

 vhost_dev_init(dev, vqs, nvqs, 0, 0, 0, false,
vhost_vdpa_process_iotlb_msg);

So I wonder if this is a must.


True, but I am adding this to be future proof.  I could instead log a warning
or an error message if vhost_vdpa_suspend is called and v->vdev.use_worker=true,
but IMO we should just fix it, given that the fix is trivial.

- Steve

Re: [PATCH v3 7/9] riscv: Enable memory hotplugging for RISC-V

2024-05-21 Thread Alexandre Ghiti

On Tue, May 21, 2024 at 1:49 PM Björn Töpel  wrote:
>
> From: Björn Töpel 
>
> Enable ARCH_ENABLE_MEMORY_HOTPLUG and ARCH_ENABLE_MEMORY_HOTREMOVE for
> RISC-V.
>
> Signed-off-by: Björn Töpel 
> ---
>  arch/riscv/Kconfig | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index fe5281398543..2724dc2af29f 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -16,6 +16,8 @@ config RISCV
> select ACPI_REDUCED_HARDWARE_ONLY if ACPI
> select ARCH_DMA_DEFAULT_COHERENT
> select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION
> +   select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM_VMEMMAP && 64BIT && MMU

Not sure you need 64BIT && MMU here since ARCH_SPARSEMEM_ENABLE
depends on MMU and SPARSEMEM_VMEMMAP_ENABLE is only enabled on 64BIT.

> +   select ARCH_ENABLE_MEMORY_HOTREMOVE if MEMORY_HOTPLUG
> select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2
> select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
> select ARCH_HAS_BINFMT_FLAT
> --
> 2.40.1
>

But anyway, to me that does not require a new version so you can add:

Reviewed-by: Alexandre Ghiti 

Thanks,

Alex

Re: [PATCH v3 5/9] riscv: mm: Add memory hotplugging support

2024-05-21 Thread Alexandre Ghiti

On Tue, May 21, 2024 at 1:49 PM Björn Töpel  wrote:
>
> From: Björn Töpel 
>
> For an architecture to support memory hotplugging, a couple of
> callbacks need to be implemented:
>
>  arch_add_memory()
>   This callback is responsible for adding the physical memory into the
>   direct map, and call into the memory hotplugging generic code via
>   __add_pages() that adds the corresponding struct page entries, and
>   updates the vmemmap mapping.
>
>  arch_remove_memory()
>   This is the inverse of the callback above.
>
>  vmemmap_free()
>   This function tears down the vmemmap mappings (if
>   CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the
>   backing vmemmap pages. Note that for persistent memory, an
>   alternative allocator for the backing pages can be used; The
>   vmem_altmap. This means that when the backing pages are cleared,
>   extra care is needed so that the correct deallocation method is
>   used.
>
>  arch_get_mappable_range()
>   This function returns the PA range that the direct map can map.
>   Used by the MHP internals for sanity checks.
>
> The page table unmap/teardown functions are heavily based on code from
> the x86 tree. The same remove_pgd_mapping() function is used in both
> vmemmap_free() and arch_remove_memory(), but in the latter function
> the backing pages are not removed.
>
> Signed-off-by: Björn Töpel 
> ---
>  arch/riscv/mm/init.c | 261 +++
>  1 file changed, 261 insertions(+)
>
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 6f72b0b2b854..6693b742bf2f 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -1493,3 +1493,264 @@ void __init pgtable_cache_init(void)
> }
>  }
>  #endif
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +static void __meminit free_pagetable(struct page *page, int order)
> +{
> +   unsigned int nr_pages = 1 << order;
> +
> +   /*
> +* vmemmap/direct page tables can be reserved, if added at
> +* boot.
> +*/
> +   if (PageReserved(page)) {
> +   __ClearPageReserved(page);

What's the difference between __ClearPageReserved() and
ClearPageReserved()? Because it seems like free_reserved_page() calls
the latter already, so why would you need to call
__ClearPageReserved() on the first page?

> +   while (nr_pages--)
> +   free_reserved_page(page++);
> +   return;
> +   }
> +
> +   free_pages((unsigned long)page_address(page), order);
> +}
> +
> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
> +{
> +   pte_t *pte;
> +   int i;
> +
> +   for (i = 0; i < PTRS_PER_PTE; i++) {
> +   pte = pte_start + i;
> +   if (!pte_none(*pte))
> +   return;
> +   }
> +
> +   free_pagetable(pmd_page(*pmd), 0);
> +   pmd_clear(pmd);
> +}
> +
> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
> +{
> +   pmd_t *pmd;
> +   int i;
> +
> +   for (i = 0; i < PTRS_PER_PMD; i++) {
> +   pmd = pmd_start + i;
> +   if (!pmd_none(*pmd))
> +   return;
> +   }
> +
> +   free_pagetable(pud_page(*pud), 0);
> +   pud_clear(pud);
> +}
> +
> +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
> +{
> +   pud_t *pud;
> +   int i;
> +
> +   for (i = 0; i < PTRS_PER_PUD; i++) {
> +   pud = pud_start + i;
> +   if (!pud_none(*pud))
> +   return;
> +   }
> +
> +   free_pagetable(p4d_page(*p4d), 0);
> +   p4d_clear(p4d);
> +}
> +
> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
> +  struct vmem_altmap *altmap)
> +{
> +   if (altmap)
> +   vmem_altmap_free(altmap, size >> PAGE_SHIFT);
> +   else
> +   free_pagetable(page, get_order(size));
> +}
> +
> +static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long 
> addr, unsigned long end,
> +bool is_vmemmap, struct vmem_altmap 
> *altmap)
> +{
> +   unsigned long next;
> +   pte_t *ptep, pte;
> +
> +   for (; addr < end; addr = next) {
> +   next = (addr + PAGE_SIZE) & PAGE_MASK;

Nit: use ALIGN() instead.

> +   if (next > end)
> +   next = end;
> +
> +   ptep = pte_base + pte_index(addr);
> +   pte = READ_ONCE(*ptep);

Nit: Use ptep_get()

> +
> +   if (!pte_present(*ptep))
> +   continue;
> +
> +   pte_clear(&init_mm, addr, ptep);
> +   if (is_vmemmap)
> +   free_vmemmap_storage(pte_page(pte), PAGE_SIZE, 
> altmap);
> +   }
> +}
> +
> +static void __meminit remove_pmd_mapping(pmd_t *pmd_base, unsigned long 
> addr, unsigned long end,
> +bool is_vmemmap, struct

1 2 >

1 - 100 of 104 matches

Mail list logo