Re: [PATCH] testing: nvdimm: Add MODULE_DESCRIPTION() macros

2024-06-11 Thread Jeff Johnson
On 6/11/2024 9:47 PM, Ira Weiny wrote:
> When building with W=1 the following warnings are seen:
> 
> WARNING: modpost: missing MODULE_DESCRIPTION() in tools/testing/nvdimm/test/nfit_test.o
> WARNING: modpost: missing MODULE_DESCRIPTION() in tools/testing/nvdimm/test/ndtest.o
> 
> Add the required MODULE_DESCRIPTION() to the test platform device
> drivers.
> 
> Suggested-by: Jeff Johnson 
> Signed-off-by: Ira Weiny 
> ---
> Jeff, I'm not seeing a patch to cover these cases for the missing module
> descriptions you have been sending out.  If you have an outstanding
> patch I missed could you point me to it?  Otherwise I believe this
> cleans up the nvdimm tree.
> ---
>  tools/testing/nvdimm/test/ndtest.c | 1 +
>  tools/testing/nvdimm/test/nfit.c   | 1 +
>  2 files changed, 2 insertions(+)
> 
> diff --git a/tools/testing/nvdimm/test/ndtest.c b/tools/testing/nvdimm/test/ndtest.c
> index b438f3d053ee..892e990c034a 100644
> --- a/tools/testing/nvdimm/test/ndtest.c
> +++ b/tools/testing/nvdimm/test/ndtest.c
> @@ -987,5 +987,6 @@ static __exit void ndtest_exit(void)
>  
>  module_init(ndtest_init);
>  module_exit(ndtest_exit);
> +MODULE_DESCRIPTION("Test non-NFIT devices");
>  MODULE_LICENSE("GPL");
>  MODULE_AUTHOR("IBM Corporation");
> diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
> index a61df347a33d..cfd4378e2129 100644
> --- a/tools/testing/nvdimm/test/nfit.c
> +++ b/tools/testing/nvdimm/test/nfit.c
> @@ -3382,5 +3382,6 @@ static __exit void nfit_test_exit(void)
>  
>  module_init(nfit_test_init);
>  module_exit(nfit_test_exit);
> +MODULE_DESCRIPTION("Test ACPI NFIT devices");
>  MODULE_LICENSE("GPL v2");
>  MODULE_AUTHOR("Intel Corporation");
> 
> ---
> base-commit: 2df0193e62cf887f373995fb8a91068562784adc
> change-id: 20240611-nvdimm-test-mod-warn-8cf773360b37
> 
> Best regards,

Not on my radar, so thanks for fixing!

Reviewed-by: Jeff Johnson 




[PATCH] testing: nvdimm: Add MODULE_DESCRIPTION() macros

2024-06-11 Thread Ira Weiny
When building with W=1 the following warnings are seen:

WARNING: modpost: missing MODULE_DESCRIPTION() in tools/testing/nvdimm/test/nfit_test.o
WARNING: modpost: missing MODULE_DESCRIPTION() in tools/testing/nvdimm/test/ndtest.o

Add the required MODULE_DESCRIPTION() to the test platform device
drivers.

Suggested-by: Jeff Johnson 
Signed-off-by: Ira Weiny 
---
Jeff, I'm not seeing a patch to cover these cases for the missing module
descriptions you have been sending out.  If you have an outstanding
patch I missed could you point me to it?  Otherwise I believe this
cleans up the nvdimm tree.
---
 tools/testing/nvdimm/test/ndtest.c | 1 +
 tools/testing/nvdimm/test/nfit.c   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tools/testing/nvdimm/test/ndtest.c b/tools/testing/nvdimm/test/ndtest.c
index b438f3d053ee..892e990c034a 100644
--- a/tools/testing/nvdimm/test/ndtest.c
+++ b/tools/testing/nvdimm/test/ndtest.c
@@ -987,5 +987,6 @@ static __exit void ndtest_exit(void)
 
 module_init(ndtest_init);
 module_exit(ndtest_exit);
+MODULE_DESCRIPTION("Test non-NFIT devices");
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("IBM Corporation");
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index a61df347a33d..cfd4378e2129 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -3382,5 +3382,6 @@ static __exit void nfit_test_exit(void)
 
 module_init(nfit_test_init);
 module_exit(nfit_test_exit);
+MODULE_DESCRIPTION("Test ACPI NFIT devices");
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Intel Corporation");

---
base-commit: 2df0193e62cf887f373995fb8a91068562784adc
change-id: 20240611-nvdimm-test-mod-warn-8cf773360b37

Best regards,
-- 
Ira Weiny 




[PATCH v5 13/13] tracing: Add last boot delta offset for stack traces

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

The addresses of a stack trace event are relative to the kallsyms of the
kernel that recorded them. As those addresses can change between boots,
when printing a stack trace from a buffer that came from the last boot,
each address needs the "text_delta" added to it, where the delta is the
difference between a function's address in the current boot and its
address in the last boot. The result can then be passed to kallsyms to
find the function name; otherwise the output is just a useless list of
addresses.

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/trace_output.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b9d2c64c0648..48de93598897 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1233,6 +1233,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
struct trace_seq *s = &iter->seq;
unsigned long *p;
unsigned long *end;
+   long delta = iter->tr->text_delta;
 
trace_assign_type(field, iter->ent);
end = (unsigned long *)((long)iter->ent + iter->ent_size);
@@ -1245,7 +1246,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
break;
 
trace_seq_puts(s, " => ");
-   seq_print_ip_sym(s, *p, flags);
+   seq_print_ip_sym(s, (*p) + delta, flags);
trace_seq_putc(s, '\n');
}
 
-- 
2.43.0





[PATCH v5 10/13] tracing/ring-buffer: Add last_boot_info file to boot instance

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

If an instance is mapped to memory on boot up, create a new file called
"last_boot_info" that will hold information that can be used to properly
parse the raw data in the ring buffer.

It will export the delta of the text and data addresses from what they
were on the last boot. It does not expose actual addresses (unless you
already knew what the address was on the last boot).

The output will look like:

 # cat last_boot_info
 text delta:   -268435456
 data delta:   -268435456

The text and data are kept separate in case they are ever made different.
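
A consumer only needs these two values to rebase pointers recorded by the
previous boot. A minimal user-space sketch (assuming the "boot_mapped"
instance name from the cover letter; error handling trimmed and the old
address purely hypothetical):

	#include <stdio.h>

	int main(void)
	{
		long text_delta = 0, data_delta = 0;
		FILE *f = fopen("/sys/kernel/tracing/instances/boot_mapped/last_boot_info", "r");

		if (!f)
			return 1;
		/* matches the "text delta:\t%ld\n" format the kernel writes */
		fscanf(f, "text delta:\t%ld\n", &text_delta);
		fscanf(f, "data delta:\t%ld\n", &data_delta);
		fclose(f);

		/* a function address recorded by the last boot... */
		unsigned long old_addr = 0xffffffff91000000UL;
		/* ...is found at old address plus delta in the current boot */
		printf("%#lx\n", old_addr + text_delta);
		return 0;
	}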

Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/ring_buffer.h |  3 +++
 kernel/trace/ring_buffer.c  | 23 ++
 kernel/trace/trace.c| 47 -
 kernel/trace/trace.h|  2 ++
 4 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index a50b0223b1d3..55de3798a9b9 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -94,6 +94,9 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long 
size, unsigned flag
   unsigned long range_size,
   struct lock_class_key *key);
 
+bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text,
+long *data);
+
 /*
  * Because the ring buffer is generic, if other users of the ring buffer get
  * traced by ftrace, it can produce lockdep warnings. We need to keep each
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 195e47ef730d..ccb2101a2e38 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2396,6 +2396,29 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned 
long size, unsigned flag
return alloc_buffer(size, flags, order, start, start + range_size, key);
 }
 
+/**
+ * ring_buffer_last_boot_delta - return the delta offset from last boot
+ * @buffer: The buffer to return the delta from
+ * @text: Return text delta
+ * @data: Return data delta
+ *
+ * Returns: true if the delta is non-zero
+ */
+bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text,
+long *data)
+{
+   if (!buffer)
+   return false;
+
+   if (!buffer->last_text_delta)
+   return false;
+
+   *text = buffer->last_text_delta;
+   *data = buffer->last_data_delta;
+
+   return true;
+}
+
 /**
  * ring_buffer_free - free a ring buffer.
  * @buffer: the buffer to free.
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dfde26aa3211..dc4eee33d920 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6041,6 +6041,18 @@ ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
return ret;
 }
 
+static void update_last_data(struct trace_array *tr)
+{
+   if (!tr->text_delta && !tr->data_delta)
+   return;
+
+   /* Clear old data */
+   tracing_reset_online_cpus(&tr->array_buffer);
+
+   /* Using current data now */
+   tr->text_delta = 0;
+   tr->data_delta = 0;
+}
 
 /**
  * tracing_update_buffers - used by tracing facility to expand ring buffers
@@ -6058,6 +6070,9 @@ int tracing_update_buffers(struct trace_array *tr)
int ret = 0;
 
mutex_lock(&trace_types_lock);
+
+   update_last_data(tr);
+
if (!tr->ring_buffer_expanded)
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
@@ -6113,6 +6128,8 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
 
mutex_lock(&trace_types_lock);
 
+   update_last_data(tr);
+
if (!tr->ring_buffer_expanded) {
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
@@ -6860,6 +6877,21 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
+static ssize_t
+tracing_last_boot_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+   struct trace_array *tr = filp->private_data;
+   struct seq_buf seq;
+   char buf[64];
+
+   seq_buf_init(&seq, buf, 64);
+
+   seq_buf_printf(&seq, "text delta:\t%ld\n", tr->text_delta);
+   seq_buf_printf(&seq, "data delta:\t%ld\n", tr->data_delta);
+
+   return simple_read_from_buffer(ubuf, cnt, ppos, buf, seq_buf_used(&seq));
+}
+
 static int tracing_buffer_meta_open(struct inode *inode, struct file *filp)
 {
struct trace_array *tr = inode->i_private;
@@ -7499,6 +7531,13 @@ static const struct file_operations trace_time_stamp_mode_fops = {
.release= tracing_single_release_tr,
 };
 
+static const struct file_operations last_boot_fops = {
+   .open   = tracing_open_generic_tr,
+

[PATCH v5 12/13] tracing: Update function tracing output for previous boot buffer

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

For a persistent ring buffer that is saved across boots, if function
tracing was performed in the previous boot, it only saves the address of
the functions and uses "%pS" to print their names. But in the current boot,
those functions may be at different locations. The persistent meta-data
saves the text delta between the two boots, which can be used to find the
current address of a function that was saved in the previous boot.

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/trace_output.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d8b302d01083..b9d2c64c0648 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -990,8 +990,11 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
 }
 
 static void print_fn_trace(struct trace_seq *s, unsigned long ip,
-  unsigned long parent_ip, int flags)
+  unsigned long parent_ip, long delta, int flags)
 {
+   ip += delta;
+   parent_ip += delta;
+
seq_print_ip_sym(s, ip, flags);
 
if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) {
@@ -1009,7 +1012,7 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
 
trace_assign_type(field, iter->ent);
 
-   print_fn_trace(s, field->ip, field->parent_ip, flags);
+   print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags);
trace_seq_putc(s, '\n');
 
return trace_handle_return(s);
@@ -1674,7 +1677,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int flags,
 
trace_assign_type(field, iter->ent);
 
-   print_fn_trace(s, field->ip, field->parent_ip, flags);
+   print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, flags);
trace_seq_printf(s, " (repeats: %u, last_ts:", field->count);
trace_print_time(s, iter,
 iter->ts - FUNC_REPEATS_GET_DELTA_TS(field));
-- 
2.43.0





[PATCH v5 11/13] tracing: Handle old buffer mappings for event strings and functions

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Use the saved text_delta and data_delta of a persistent memory-mapped ring
buffer from a previous boot, and apply the deltas in the trace event print
output so that strings and functions show up normally.

That is, for an event like trace_kmalloc() that prints the callsite via
"%pS", if it used the address saved in the ring buffer it will not match
the function that was saved in the previous boot if the kernel remaps
itself between boots.

For RCU events that point to saved static strings where only the address
of the string is saved in the ring buffer, it too will be adjusted to
point to where the string is on the current boot.

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/trace.c | 42 +++---
 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dc4eee33d920..71cca10581d6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3671,8 +3671,11 @@ static void test_can_verify(void)
 void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
 va_list ap)
 {
+   long text_delta = iter->tr->text_delta;
+   long data_delta = iter->tr->data_delta;
const char *p = fmt;
const char *str;
+   bool good;
int i, j;
 
if (WARN_ON_ONCE(!fmt))
@@ -3691,7 +3694,10 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
 
j = 0;
 
-   /* We only care about %s and variants */
+   /*
+* We only care about %s and variants
+* as well as %p[sS] if delta is non-zero
+*/
for (i = 0; p[i]; i++) {
if (i + 1 >= iter->fmt_size) {
/*
@@ -3720,6 +3726,11 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
}
if (p[i+j] == 's')
break;
+
+   if (text_delta && p[i+1] == 'p' &&
+   ((p[i+2] == 's' || p[i+2] == 'S')))
+   break;
+
star = false;
}
j = 0;
@@ -3733,6 +3744,24 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
iter->fmt[i] = '\0';
trace_seq_vprintf(&iter->seq, iter->fmt, ap);
 
+   /* Add delta to %pS pointers */
+   if (p[i+1] == 'p') {
+   unsigned long addr;
+   char fmt[4];
+
+   fmt[0] = '%';
+   fmt[1] = 'p';
+   fmt[2] = p[i+2]; /* Either %ps or %pS */
+   fmt[3] = '\0';
+
+   addr = va_arg(ap, unsigned long);
+   addr += text_delta;
+   trace_seq_printf(&iter->seq, fmt, (void *)addr);
+
+   p += i + 3;
+   continue;
+   }
+
/*
 * If iter->seq is full, the above call no longer guarantees
 * that ap is in sync with fmt processing, and further calls
@@ -3751,6 +3780,14 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
/* The ap now points to the string data of the %s */
str = va_arg(ap, const char *);
 
+   good = trace_safe_str(iter, str, star, len);
+
+   /* Could be from the last boot */
+   if (data_delta && !good) {
+   str += data_delta;
+   good = trace_safe_str(iter, str, star, len);
+   }
+
/*
 * If you hit this warning, it is likely that the
 * trace event in question used %s on a string that
@@ -3760,8 +3797,7 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
 * instead. See samples/trace_events/trace-events-sample.h
 * for reference.
 */
-   if (WARN_ONCE(!trace_safe_str(iter, str, star, len),
- "fmt: '%s' current_buffer: '%s'",
+   if (WARN_ONCE(!good, "fmt: '%s' current_buffer: '%s'",
  fmt, seq_buf_str(&iter->seq.seq))) {
int ret;
 
-- 
2.43.0





[PATCH v5 09/13] ring-buffer: Save text and data locations in mapped meta data

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

When a ring buffer is mapped to a specific address, save the address of a
text function and some data. This will be used to determine the delta
between the last boot and the current boot for pointers to functions as
well as to data.
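
The delta itself comes from two anchors that exist in both kernels. As a
sketch of the scheme (the assignments mirror rb_range_meta_init() in the
diff below):

	/* boot N: remember where one text symbol and one data symbol live */
	meta->text_addr = (unsigned long)rb_range_meta_init;
	meta->data_addr = (unsigned long)rb_data_ptr;

	/* boot N+1: the same symbols may have moved; the difference is the delta */
	buffer->last_text_delta = (unsigned long)rb_range_meta_init - meta->text_addr;
	buffer->last_data_delta = (unsigned long)rb_data_ptr - meta->data_addr;

	/* any pointer saved by boot N is then rebased as: saved + delta */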

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/ring_buffer.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2118c478e42b..195e47ef730d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -45,6 +45,8 @@
 static void update_pages_handler(struct work_struct *work);
 
 struct ring_buffer_meta {
+   unsigned long   text_addr;
+   unsigned long   data_addr;
unsigned long   first_buffer;
unsigned long   head_buffer;
unsigned long   commit_buffer;
@@ -542,6 +544,9 @@ struct trace_buffer {
unsigned long   range_addr_start;
unsigned long   range_addr_end;
 
+   longlast_text_delta;
+   longlast_data_delta;
+
unsigned intsubbuf_size;
unsigned intsubbuf_order;
unsigned intmax_data_size;
@@ -1821,10 +1826,15 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
}
 }
 
+/* Used to calculate data delta */
+static char rb_data_ptr[] = "";
+
 static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
 {
struct ring_buffer_meta *meta;
unsigned long delta;
+   unsigned long this_text = (unsigned long)rb_range_meta_init;
+   unsigned long this_data = (unsigned long)rb_data_ptr;
void *subbuf;
int cpu;
int i;
@@ -1841,6 +1851,10 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
meta->first_buffer += delta;
meta->head_buffer += delta;
meta->commit_buffer += delta;
+   buffer->last_text_delta = this_text - meta->text_addr;
+   buffer->last_data_delta = this_data - meta->data_addr;
+   meta->text_addr = this_text;
+   meta->data_addr = this_data;
continue;
}
 
@@ -1857,6 +1871,8 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
subbuf = rb_subbufs_from_meta(meta);
 
meta->first_buffer = (unsigned long)subbuf;
+   meta->text_addr = this_text;
+   meta->data_addr = this_data;
 
/*
 * The buffers[] array holds the order of the sub-buffers
-- 
2.43.0





[PATCH v5 08/13] tracing: Add option to use memmapped memory for trace boot instance

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Add an option to the trace_instance kernel command line parameter that
allows it to use the reserved memory from memmap boot parameter.

  memmap=12M$0x28450 trace_instance=boot_mapped@0x28450:12M

The above reserves 12 megs at the physical address 0x28450.
The second parameter will create a "boot_mapped" instance and use the
memory reserved as the memory for the ring buffer.

That will create an instance called "boot_mapped":

  /sys/kernel/tracing/instances/boot_mapped

Note, because the ring buffer is using a defined memory range, it will
act just like a memory mapped ring buffer. It will not have a snapshot
buffer, as it can't swap out the buffer. The snapshot files as well as any
tracers that use a snapshot will not be present in the boot_mapped
instance.

Cc: linux...@kvack.org
Signed-off-by: Steven Rostedt (Google) 
---
 .../admin-guide/kernel-parameters.txt |  9 +++
 kernel/trace/trace.c  | 75 +--
 2 files changed, 78 insertions(+), 6 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index b600df82669d..ff26b6094e79 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6754,6 +6754,15 @@
the same thing would happen if it was left off). The irq_handler_entry
event, and all events under the "initcall" system.
 
+   If memory has been reserved (see memmap for x86), the instance
+   can use that memory:
+
+   memmap=12M$0x28450 trace_instance=boot_map@0x28450:12M
+
+   The above will create a "boot_map" instance that uses the physical
+   memory at 0x28450 that is 12Megs. The per CPU buffers of that
+   instance will be split up accordingly.
+
trace_options=[option-list]
[FTRACE] Enable or disable tracer options at boot.
The option-list is a comma delimited list of options
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 622fe670949d..dfde26aa3211 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9504,6 +9504,31 @@ static int instance_mkdir(const char *name)
return ret;
 }
 
+static u64 map_pages(u64 start, u64 size)
+{
+   struct page **pages;
+   phys_addr_t page_start;
+   unsigned int page_count;
+   unsigned int i;
+   void *vaddr;
+
+   page_count = DIV_ROUND_UP(size, PAGE_SIZE);
+
+   page_start = start;
+   pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL);
+   if (!pages)
+   return 0;
+
+   for (i = 0; i < page_count; i++) {
+   phys_addr_t addr = page_start + i * PAGE_SIZE;
+   pages[i] = pfn_to_page(addr >> PAGE_SHIFT);
+   }
+   vaddr = vmap(pages, page_count, VM_MAP, PAGE_KERNEL);
+   kfree(pages);
+
+   return (u64)(unsigned long)vaddr;
+}
+
 /**
  * trace_array_get_by_name - Create/Lookup a trace array, given its name.
  * @name: The name of the trace array to be looked up/created.
@@ -10350,6 +10375,7 @@ __init static void enable_instances(void)
 {
struct trace_array *tr;
char *curr_str;
+   char *name;
char *str;
char *tok;
 
@@ -10358,19 +10384,56 @@ __init static void enable_instances(void)
str = boot_instance_info;
 
while ((curr_str = strsep(&str, "\t"))) {
+   unsigned long start = 0;
+   unsigned long size = 0;
+   unsigned long addr = 0;

tok = strsep(&curr_str, ",");
+   name = strsep(&tok, "@");
+   if (tok) {
+   start = memparse(tok, &tok);
+   if (!start) {
+   pr_warn("Tracing: Invalid boot instance address for %s\n",
+   name);
+   continue;
+   }
+   }
 
-   if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
-   do_allocate_snapshot(tok);
+   if (start) {
+   if (*tok != ':') {
+   pr_warn("Tracing: No size specified for instance %s\n", name);
+   continue;
+   }
+   tok++;
+   size = memparse(tok, &tok);
+   if (!size) {
+   pr_warn("Tracing: Invalid boot instance size for %s\n",
+   name);
+   continue;
+   }
+   addr = map_pages(start, size);
+   if (addr) {
+   pr_info("Tracing: mapped boot instance %s at 

[PATCH v5 07/13] ring-buffer: Validate boot range memory events

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Make sure all the events in each of the sub-buffers that were mapped in a
memory region are valid. This moves the code that walks the buffers for
time-stamp validation out of the CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS
ifdef block and is used to validate the content. Only the ring buffer
event meta data and time stamps are checked, not the event data itself.

This also has a second purpose. The buffer_page structure that points to
the data sub-buffers has accounting that keeps track of the number of
events that are on the sub-buffer. This updates that counter as well. That
counter is used in reading the buffer and knowing if the ring buffer is
empty or not.

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/ring_buffer.c | 190 +
 1 file changed, 152 insertions(+), 38 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c746ec12b7cd..2118c478e42b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1675,10 +1675,152 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
subbuf = (void *)subbuf + subbuf_size;
}
 
-   pr_info("Ring buffer meta is from previous boot!\n");
return true;
 }
 
+static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf);
+
+static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int cpu,
+  unsigned long long *timestamp, u64 *delta_ptr)
+{
+   struct ring_buffer_event *event;
+   u64 ts, delta;
+   int events = 0;
+   int e;
+
+   *delta_ptr = 0;
+   *timestamp = 0;
+
+   ts = dpage->time_stamp;
+
+   for (e = 0; e < tail; e += rb_event_length(event)) {
+
+   event = (struct ring_buffer_event *)(dpage->data + e);
+
+   switch (event->type_len) {
+
+   case RINGBUF_TYPE_TIME_EXTEND:
+   delta = rb_event_time_stamp(event);
+   ts += delta;
+   break;
+
+   case RINGBUF_TYPE_TIME_STAMP:
+   delta = rb_event_time_stamp(event);
+   delta = rb_fix_abs_ts(delta, ts);
+   if (delta < ts) {
+   *delta_ptr = delta;
+   *timestamp = ts;
+   return -1;
+   }
+   ts = delta;
+   break;
+
+   case RINGBUF_TYPE_PADDING:
+   if (event->time_delta == 1)
+   break;
+   fallthrough;
+   case RINGBUF_TYPE_DATA:
+   events++;
+   ts += event->time_delta;
+   break;
+
+   default:
+   return -1;
+   }
+   }
+   *timestamp = ts;
+   return events;
+}
+
+static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
+{
+   unsigned long long ts;
+   u64 delta;
+   int tail;
+
+   tail = local_read(&dpage->commit);
+   return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
+}
+
+/* If the meta data has been validated, now validate the events */
+static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
+{
+   struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+   struct buffer_page *head_page;
+   unsigned long entry_bytes = 0;
+   unsigned long entries = 0;
+   int ret;
+   int i;
+
+   if (!meta || !meta->head_buffer)
+   return;
+
+   /* Do the reader page first */
+   ret = rb_validate_buffer(cpu_buffer->reader_page->page, cpu_buffer->cpu);
+   if (ret < 0) {
+   pr_info("Ring buffer reader page is invalid\n");
+   goto invalid;
+   }
+   entries += ret;
+   entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
+   local_set(&cpu_buffer->reader_page->entries, ret);
+
+   head_page = cpu_buffer->head_page;
+
+   /* If both the head and commit are on the reader_page then we are done. */
+   if (head_page == cpu_buffer->reader_page &&
+   head_page == cpu_buffer->commit_page)
+   goto done;
+
+   /* Iterate until finding the commit page */
+   for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) {
+
+   /* Reader page has already been done */
+   if (head_page == cpu_buffer->reader_page)
+   continue;
+
+   ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+   if (ret < 0) {
+   pr_info("Ring buffer meta [%d] invalid buffer page\n",
+   cpu_buffer->cpu);
+   goto invalid;
+   }
+   entries += ret;
+   entry_bytes += local_read(&head_page->page->commit);
+   

[PATCH v5 06/13] ring-buffer: Add test if range of boot buffer is valid

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Add a test against the ring buffer memory range to see if it has valid
data. The ring_buffer_meta structure is given a new field called
"first_buffer" which holds the address of the first sub-buffer. This is
used to both determine if the other fields are valid as well as finding
the offset between the old addresses of the sub-buffer from the previous
boot to the new addresses of the current boot.

Since the values for nr_subbufs and subbuf_size are expected to be the
same, check if the values in the meta page match the values calculated.

Take the range of the first_buffer and the total size of all the buffers
and make sure the saved head_buffer and commit_buffer fall in the range.

Iterate through all the sub-buffers to make sure that the values in the
sub-buffer "commit" field (the field that holds the amount of data on the
sub-buffer) is within the end of the sub-buffer. Also check the index
array to make sure that all the indexes are within nr_subbufs.

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/ring_buffer.c | 143 ++---
 1 file changed, 135 insertions(+), 8 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 076e7135b9ef..c746ec12b7cd 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -45,6 +45,7 @@
 static void update_pages_handler(struct work_struct *work);
 
 struct ring_buffer_meta {
+   unsigned long   first_buffer;
unsigned long   head_buffer;
unsigned long   commit_buffer;
__u32   subbuf_size;
@@ -1618,21 +1619,103 @@ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx)
return (void *)ptr;
 }
 
+/*
+ * See if the existing memory contains valid ring buffer data.
+ * As the previous kernel must be the same as this kernel, all
+ * the calculations (size of buffers and number of buffers)
+ * must be the same.
+ */
+static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
+ struct trace_buffer *buffer, int nr_pages)
+{
+   int subbuf_size = PAGE_SIZE;
+   struct buffer_data_page *subbuf;
+   unsigned long buffers_start;
+   unsigned long buffers_end;
+   int i;
+
+   /* The subbuffer's size and number of subbuffers must match */
+   if (meta->subbuf_size != subbuf_size ||
+   meta->nr_subbufs != nr_pages + 1) {
+   pr_info("Ring buffer boot meta [%d] mismatch of 
subbuf_size/nr_pages\n", cpu);
+   return false;
+   }
+
+   buffers_start = meta->first_buffer;
+   buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs);
+
+   /* Are the head and commit buffers within the range of buffers? */
+   if (meta->head_buffer < buffers_start ||
+   meta->head_buffer >= buffers_end) {
+   pr_info("Ring buffer boot meta [%d] head buffer out of 
range\n", cpu);
+   return false;
+   }
+
+   if (meta->commit_buffer < buffers_start ||
+   meta->commit_buffer >= buffers_end) {
+   pr_info("Ring buffer boot meta [%d] commit buffer out of 
range\n", cpu);
+   return false;
+   }
+
+   subbuf = rb_subbufs_from_meta(meta);
+
+   /* Do the meta buffers and the subbufs themselves have correct data? */
+   for (i = 0; i < meta->nr_subbufs; i++) {
+   if (meta->buffers[i] < 0 ||
+   meta->buffers[i] >= meta->nr_subbufs) {
+   pr_info("Ring buffer boot meta [%d] array out of 
range\n", cpu);
+   return false;
+   }
+
+   if ((unsigned)local_read(&subbuf->commit) > subbuf_size) {
+   pr_info("Ring buffer boot meta [%d] buffer invalid commit\n", cpu);
+   return false;
+   }
+
+   subbuf = (void *)subbuf + subbuf_size;
+   }
+
+   pr_info("Ring buffer meta is from previous boot!\n");
+   return true;
+}
+
 static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
 {
struct ring_buffer_meta *meta;
+   unsigned long delta;
void *subbuf;
int cpu;
int i;
 
for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+   void *next_meta;
+
meta = rb_range_meta(buffer, nr_pages, cpu);
 
+   if (rb_meta_valid(meta, cpu, buffer, nr_pages)) {
+   /* Make the mappings match the current address */
+   subbuf = rb_subbufs_from_meta(meta);
+   delta = (unsigned long)subbuf - meta->first_buffer;
+   meta->first_buffer += delta;
+   meta->head_buffer += delta;
+   meta->commit_buffer += delta;
+   continue;
+   }
+
+   if (cpu < nr_cpu_ids - 1)
+   next_meta = rb_range_meta(buffer, nr_pages, cpu + 1);
+   

[PATCH v5 05/13] ring-buffer: Add output of ring buffer meta page

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Add a buffer_meta per-cpu file for the trace instance that is mapped to
boot memory. This shows the current meta-data and can be used by user
space tools to record off the current mappings to help reconstruct the
ring buffer after a reboot.

It does not expose any virtual addresses, just indexes into the sub-buffer
pages.
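
Going by rbm_show() in the diff below, reading a per-CPU buffer_meta file
produces output along these lines (values illustrative):

	head_buffer:   4
	commit_buffer: 4
	subbuf_size:   4096
	nr_subbufs:    13
	buffer[0]:     12
	buffer[1]:     0
	...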

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/ring_buffer.c | 77 ++
 kernel/trace/trace.c   | 30 ++-
 kernel/trace/trace.h   |  2 +
 3 files changed, 107 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 17818b9eab2a..076e7135b9ef 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -32,6 +32,8 @@
 #include 
 #include 
 
+#include "trace.h"
+
 /*
  * The "absolute" timestamp in the buffer is only 59 bits.
  * If a clock has the 5 MSBs set, it needs to be saved and
@@ -1647,6 +1649,81 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
}
 }
 
+static void *rbm_start(struct seq_file *m, loff_t *pos)
+{
+   struct ring_buffer_per_cpu *cpu_buffer = m->private;
+   struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+   unsigned long val;
+
+   if (!meta)
+   return NULL;
+
+   if (*pos > meta->nr_subbufs)
+   return NULL;
+
+   val = *pos;
+   val++;
+
+   return (void *)val;
+}
+
+static void *rbm_next(struct seq_file *m, void *v, loff_t *pos)
+{
+   (*pos)++;
+
+   return rbm_start(m, pos);
+}
+
+static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf);
+
+static int rbm_show(struct seq_file *m, void *v)
+{
+   struct ring_buffer_per_cpu *cpu_buffer = m->private;
+   struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+   unsigned long val = (unsigned long)v;
+
+   if (val == 1) {
+   seq_printf(m, "head_buffer:   %d\n",
+  rb_meta_subbuf_idx(meta, (void *)meta->head_buffer));
+   seq_printf(m, "commit_buffer: %d\n",
+  rb_meta_subbuf_idx(meta, (void *)meta->commit_buffer));
+   seq_printf(m, "subbuf_size:   %d\n", meta->subbuf_size);
+   seq_printf(m, "nr_subbufs:%d\n", meta->nr_subbufs);
+   return 0;
+   }
+
+   val -= 2;
+   seq_printf(m, "buffer[%ld]:%d\n", val, meta->buffers[val]);
+
+   return 0;
+}
+
+static void rbm_stop(struct seq_file *m, void *p)
+{
+}
+
+static const struct seq_operations rb_meta_seq_ops = {
+   .start  = rbm_start,
+   .next   = rbm_next,
+   .show   = rbm_show,
+   .stop   = rbm_stop,
+};
+
+int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, int cpu)
+{
+   struct seq_file *m;
+   int ret;
+
+   ret = seq_open(file, &rb_meta_seq_ops);
+   if (ret)
+   return ret;
+
+   m = file->private_data;
+   m->private = buffer->buffers[cpu];
+
+   return 0;
+}
+
 static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
long nr_pages, struct list_head *pages)
 {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ff2b504fbe00..622fe670949d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5018,7 +5018,7 @@ static int show_traces_open(struct inode *inode, struct file *file)
return 0;
 }
 
-static int show_traces_release(struct inode *inode, struct file *file)
+static int tracing_seq_release(struct inode *inode, struct file *file)
 {
struct trace_array *tr = inode->i_private;
 
@@ -5059,7 +5059,7 @@ static const struct file_operations show_traces_fops = {
.open   = show_traces_open,
.read   = seq_read,
.llseek = seq_lseek,
-   .release= show_traces_release,
+   .release= tracing_seq_release,
 };
 
 static ssize_t
@@ -6860,6 +6860,22 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
+static int tracing_buffer_meta_open(struct inode *inode, struct file *filp)
+{
+   struct trace_array *tr = inode->i_private;
+   int cpu = tracing_get_cpu(inode);
+   int ret;
+
+   ret = tracing_check_open_get_tr(tr);
+   if (ret)
+   return ret;
+
+   ret = ring_buffer_meta_seq_init(filp, tr->array_buffer.buffer, cpu);
+   if (ret < 0)
+   __trace_array_put(tr);
+   return ret;
+}
+
 static ssize_t
 tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
  size_t cnt, loff_t *ppos)
@@ -7436,6 +7452,13 @@ static const struct file_operations tracing_entries_fops = {
.release= tracing_release_generic_tr,
 };
 
+static const struct file_operations tracing_buffer_meta_fops = {
+   .open   = 

[PATCH v5 04/13] tracing: Implement creating an instance based on a given memory region

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Allow for creating a new instance by passing in an address and size to map
the ring buffer for the instance to.

This will allow features like a pstore memory mapped region to be used for
a tracing instance ring buffer that can be retrieved from one boot to the
next.

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/trace.c | 50 +++-
 kernel/trace/trace.h |  4 
 2 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 578a49ff5c32..ff2b504fbe00 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4921,6 +4921,11 @@ static int tracing_open(struct inode *inode, struct file *file)
 static bool
 trace_ok_for_array(struct tracer *t, struct trace_array *tr)
 {
+#ifdef CONFIG_TRACER_SNAPSHOT
+   /* arrays with mapped buffer range do not have snapshots */
+   if (tr->range_addr_start && t->use_max_tr)
+   return false;
+#endif
return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances;
 }
 
@@ -8664,11 +8669,13 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
tr, cpu, _entries_fops);
 
 #ifdef CONFIG_TRACER_SNAPSHOT
-   trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu,
-   tr, cpu, _fops);
+   if (!tr->range_addr_start) {
+   trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu,
+ tr, cpu, _fops);
 
-   trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu,
-   tr, cpu, _raw_fops);
+   trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu,
+ tr, cpu, _raw_fops);
+   }
 #endif
 }
 
@@ -9205,7 +9212,18 @@ allocate_trace_buffer(struct trace_array *tr, struct array_buffer *buf, int size
 
buf->tr = tr;
 
-   buf->buffer = ring_buffer_alloc(size, rb_flags);
+   if (tr->range_addr_start && tr->range_addr_size) {
+   buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0,
+ tr->range_addr_start,
+ tr->range_addr_size);
+   /*
+* This is basically the same as a mapped buffer,
+* with the same restrictions.
+*/
+   tr->mapped++;
+   } else {
+   buf->buffer = ring_buffer_alloc(size, rb_flags);
+   }
if (!buf->buffer)
return -ENOMEM;
 
@@ -9242,6 +9260,10 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
return ret;
 
 #ifdef CONFIG_TRACER_MAX_TRACE
+   /* Fix mapped buffer trace arrays do not have snapshot buffers */
+   if (tr->range_addr_start)
+   return 0;
+
+   ret = allocate_trace_buffer(tr, &tr->max_buffer,
allocate_snapshot ? size : 1);
if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) {
@@ -9342,7 +9364,9 @@ static int trace_array_create_dir(struct trace_array *tr)
 }
 
 static struct trace_array *
-trace_array_create_systems(const char *name, const char *systems)
+trace_array_create_systems(const char *name, const char *systems,
+  unsigned long range_addr_start,
+  unsigned long range_addr_size)
 {
struct trace_array *tr;
int ret;
@@ -9368,6 +9392,10 @@ trace_array_create_systems(const char *name, const char *systems)
goto out_free_tr;
}
 
+   /* Only for boot up memory mapped ring buffers */
+   tr->range_addr_start = range_addr_start;
+   tr->range_addr_size = range_addr_size;
+
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
 
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9425,7 +9453,7 @@ trace_array_create_systems(const char *name, const char *systems)
 
 static struct trace_array *trace_array_create(const char *name)
 {
-   return trace_array_create_systems(name, NULL);
+   return trace_array_create_systems(name, NULL, 0, 0);
 }
 
 static int instance_mkdir(const char *name)
@@ -9479,7 +9507,7 @@ struct trace_array *trace_array_get_by_name(const char *name, const char *system
goto out_unlock;
}
 
-   tr = trace_array_create_systems(name, systems);
+   tr = trace_array_create_systems(name, systems, 0, 0);
 
if (IS_ERR(tr))
tr = NULL;
@@ -9672,8 +9700,10 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
MEM_FAIL(1, "Could not allocate function filter files");
 
 #ifdef CONFIG_TRACER_SNAPSHOT
-   trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
- tr, _fops);
+   if (!tr->range_addr_start) {
+   

[PATCH v5 03/13] ring-buffer: Add ring_buffer_meta data

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Populate the ring_buffer_meta array. It holds the pointer to the
head_buffer (next to read), the commit_buffer (next to write), the size of
the sub-buffers, the number of sub-buffers, and an array that keeps track
of the order of the sub-buffers.

This information will be stored in the persistent memory to help on reboot
to reconstruct the ring buffer.

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/ring_buffer.c | 209 -
 1 file changed, 184 insertions(+), 25 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index fd52cad34b0f..17818b9eab2a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -43,6 +43,11 @@
 static void update_pages_handler(struct work_struct *work);
 
 struct ring_buffer_meta {
+   unsigned long   head_buffer;
+   unsigned long   commit_buffer;
+   __u32   subbuf_size;
+   __u32   nr_subbufs;
+   int buffers[];
 };
 
 /*
@@ -501,6 +506,7 @@ struct ring_buffer_per_cpu {
struct mutexmapping_lock;
unsigned long   *subbuf_ids;/* ID to subbuf VA */
struct trace_buffer_meta*meta_page;
+   struct ring_buffer_meta *ring_meta;
 
/* ring buffer pages to update, > 0 to add, < 0 to remove */
longnr_pages_to_update;
@@ -1261,6 +1267,11 @@ static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer)
 * Set the previous list pointer to have the HEAD flag.
 */
rb_set_list_to_head(head->list.prev);
+
+   if (cpu_buffer->ring_meta) {
+   struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+   meta->head_buffer = (unsigned long)head->page;
+   }
 }
 
 static void rb_list_head_clear(struct list_head *list)
@@ -1515,51 +1526,127 @@ rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs)
 }
 
 /*
- * Return a specific sub-buffer for a given @cpu defined by @idx.
+ * Return the ring_buffer_meta for a given @cpu.
  */
-static void *rb_range_buffer(struct trace_buffer *buffer, int cpu, int nr_pages, int idx)
+static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu)
 {
-   unsigned long ptr;
int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+   unsigned long ptr = buffer->range_addr_start;
+   struct ring_buffer_meta *meta;
int nr_subbufs;
 
-   /* Include the reader page */
-   nr_subbufs = nr_pages + 1;
+   if (!ptr)
+   return NULL;
+
+   /* When nr_pages passed in is zero, the first meta has already been initialized */
+   if (!nr_pages) {
+   meta = (struct ring_buffer_meta *)ptr;
+   nr_subbufs = meta->nr_subbufs;
+   } else {
+   meta = NULL;
+   /* Include the reader page */
+   nr_subbufs = nr_pages + 1;
+   }
 
/*
 * The first chunk may not be subbuffer aligned, where as
 * the rest of the chunks are.
 */
-   ptr = buffer->range_addr_start;
-   ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
if (cpu) {
-   unsigned long p;
-
-   ptr += subbuf_size * nr_subbufs;
-
-   /* Save the beginning of this CPU chunk */
-   p = ptr;
-
ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
+   ptr += subbuf_size * nr_subbufs;
 
/* We can use multiplication to find chunks greater than 1 */
if (cpu > 1) {
unsigned long size;
+   unsigned long p;
 
+   /* Save the beginning of this CPU chunk */
+   p = ptr;
+   ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
ptr += subbuf_size * nr_subbufs;
 
/* Now all chunks after this are the same size */
size = ptr - p;
ptr += size * (cpu - 2);
-
-   ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
}
}
-   if (ptr + subbuf_size * nr_subbufs > buffer->range_addr_end)
+   return (void *)ptr;
+}
+
+/* Return the start of subbufs given the meta pointer */
+static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta)
+{
+   int subbuf_size = meta->subbuf_size;
+   unsigned long ptr;
+
+   ptr = (unsigned long)meta;
+   ptr = rb_range_align_subbuf(ptr, subbuf_size, meta->nr_subbufs);
+
+   return (void *)ptr;
+}
+
+/*
+ * Return a specific sub-buffer for a given @cpu defined by @idx.
+ */
+static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx)
+{
+   struct ring_buffer_meta *meta;
+   unsigned long ptr;
+   int subbuf_size;
+
+   meta = 

[PATCH v5 01/13] ring-buffer: Allow mapped field to be set without mapping

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

In preparation for having the ring buffer mapped to a dedicated location,
which will have the same restrictions as user space memory mapped buffers,
allow it to use the "mapped" field of the ring_buffer_per_cpu structure
without having the user space meta page mapping.

When this starts using the mapped field, it will need to handle adding a
user space mapping (and removing it) from a ring buffer that is using a
dedicated memory range.

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/ring_buffer.c | 24 +---
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 28853966aa9a..aa8eb878e0d4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -491,6 +491,7 @@ struct ring_buffer_per_cpu {
unsigned long   pages_removed;
 
unsigned intmapped;
+   unsigned int    user_mapped;    /* first user space mapping */
struct mutexmapping_lock;
unsigned long   *subbuf_ids;/* ID to subbuf VA */
struct trace_buffer_meta*meta_page;
@@ -5224,6 +5225,9 @@ static void rb_update_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
 {
struct trace_buffer_meta *meta = cpu_buffer->meta_page;
 
+   if (!meta)
+   return;
+
meta->reader.read = cpu_buffer->reader_page->read;
meta->reader.id = cpu_buffer->reader_page->id;
meta->reader.lost_events = cpu_buffer->lost_events;
@@ -6167,7 +6171,7 @@ rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu)
 
mutex_lock(&cpu_buffer->mapping_lock);
 
-   if (!cpu_buffer->mapped) {
+   if (!cpu_buffer->mapped || !cpu_buffer->meta_page) {
mutex_unlock(&cpu_buffer->mapping_lock);
return ERR_PTR(-ENODEV);
}
@@ -6194,7 +6198,7 @@ static int __rb_inc_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer,
if (inc && cpu_buffer->mapped == UINT_MAX)
return -EBUSY;
 
-   if (WARN_ON(!inc && cpu_buffer->mapped == 0))
+   if (WARN_ON(!inc && cpu_buffer->mapped < cpu_buffer->user_mapped))
return -EINVAL;
 
mutex_lock(&cpu_buffer->buffer->mutex);
@@ -6328,7 +6332,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
 
mutex_lock(&cpu_buffer->mapping_lock);
 
-   if (cpu_buffer->mapped) {
+   if (cpu_buffer->user_mapped) {
err = __rb_map_vma(cpu_buffer, vma);
if (!err)
err = __rb_inc_dec_mapped(cpu_buffer, true);
@@ -6359,12 +6363,15 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
 */
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
rb_setup_ids_meta_page(cpu_buffer, subbuf_ids);
+
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
err = __rb_map_vma(cpu_buffer, vma);
if (!err) {
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
-   cpu_buffer->mapped = 1;
+   /* This is the first time it is mapped externally */
+   cpu_buffer->mapped++;
+   cpu_buffer->user_mapped = cpu_buffer->mapped;
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
} else {
kfree(cpu_buffer->subbuf_ids);
@@ -6392,10 +6399,10 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
 
mutex_lock(&cpu_buffer->mapping_lock);
 
-   if (!cpu_buffer->mapped) {
+   if (!cpu_buffer->user_mapped) {
err = -ENODEV;
goto out;
-   } else if (cpu_buffer->mapped > 1) {
+   } else if (cpu_buffer->mapped > cpu_buffer->user_mapped) {
__rb_inc_dec_mapped(cpu_buffer, false);
goto out;
}
@@ -6403,7 +6410,10 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
mutex_lock(&buffer->mutex);
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
-   cpu_buffer->mapped = 0;
+   /* This is the last user space mapping */
+   if (!WARN_ON_ONCE(cpu_buffer->mapped != cpu_buffer->user_mapped))
+   cpu_buffer->mapped--;
+   cpu_buffer->user_mapped = 0;
 
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-- 
2.43.0





[PATCH v5 00/13] tracing: Persistent traces across a reboot or crash

2024-06-11 Thread Steven Rostedt


This is a way to map a ring buffer instance across reboots.
The requirement is that you have a memory region that is not erased.
I tested this on a Debian VM running on qemu on a Debian server,
and even tested it on a baremetal box running Fedora. I was
surprised that it worked on the baremetal box, but it does so
surprisingly consistently.

This series does not require the ring buffer mapping, but simply
takes a physical address that has been reserved via memmap (on x86 only).
An example of the kernel command line is:

  memmap=12M$0x28540  trace_instance=boot_mapped@0x28540:12M

The above will reserve 12M at physical address 0x28540 (done by the
existing memmap command line option), and then the trace_instance option was
extended to take an address and size (@0x28540:12M). It will then vmap()
that address and allocate a ring buffer in it. If a ring buffer already
exists, it will use it and expose the contents to user space.

The memory reserved is used by the ring buffer of this instance.
It acts like a memory mapped instance so it has some limitations. It does not
allow snapshots nor does it allow tracers which use a snapshot buffer (like
irqsoff and wakeup tracers).

On boot up, when setting up the ring buffer, it looks at the current
content and does a vigorous test to see if the content is valid.
It even walks the events in all the sub-buffers to make sure the
ring buffer meta data is correct. If it determines that the content
is valid, it will reconstruct the ring buffer to use the content
it has found.

If the buffer is valid, on the next boot, the boot_mapped instance
will contain the data from the previous boot. You can cat the
trace or trace_pipe file, or even run trace-cmd extract on it to
make a trace.dat file that holds the data. This is much better than
dealing with a ftrace_dump_on_oops (I wish I had this a decade ago!)

There are still some limitations of this buffer. One is that it assumes
that the kernel you are booting back into is the same one that crashed.
At a minimum, the trace_events (like sched_switch and friends) must all
have the same ids. This holds for the same kernel, as the ids are
determined at link time.

Module events could possibly be a problem as the ids may not match.

This version of the patch series saves a text function and a data
string address in the persistent memory, and this is used to calculate
the delta between text and data addresses of the new boot up. Now
function tracing and "%pS" still work across boots. Even the RCU
trace events that point to static strings work as well!

The delta is exported by a new file in the instance called "last_boot_info"
that has something like this:

 # cat last_boot_info
 text delta:   -268435456
 data delta:   -268435456

This can be used by trace-cmd that reads the trace_pipe_raw data and
now can figure out how to map the print_formats and kallsyms to the raw
data in the buffers.
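
As a worked example, -268435456 is -0x10000000, so a function recorded at,
say, 0xffffffff91000000 by the last boot (address hypothetical) resolves
in the current boot as:

	0xffffffff91000000 + (-0x10000000) = 0xffffffff81000000

Consumers simply add the text delta to every saved text address before
looking it up.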

This can be used to debug kernel shutdown. I ran the following:

  # trace-cmd start -B boot_mapped -p function
  # reboot

[after reboot]

  # trace-cmd show -B boot_mapped | tail -20
   swapper/0-1   [000] d..1.    63.479667: preempt_count_add <-delay_tsc
   swapper/0-1   [000] d..2.    63.479669: preempt_count_sub <-delay_tsc
   swapper/0-1   [000] d..1.    63.479671: disable_local_APIC <-native_stop_other_cpus
   swapper/0-1   [000] d..1.    63.479673: clear_local_APIC.part.0 <-disable_local_APIC
   swapper/0-1   [000] d..1.    63.479716: mcheck_cpu_clear <-native_stop_other_cpus
   swapper/0-1   [000] d..1.    63.479718: mce_intel_feature_clear <-native_stop_other_cpus
   swapper/0-1   [000] d..1.    63.479720: lmce_supported <-mce_intel_feature_clear
   swapper/0-1   [000] d..1.    63.479732: lapic_shutdown <-native_machine_shutdown
   swapper/0-1   [000] d..1.    63.479735: disable_local_APIC <-native_machine_shutdown
   swapper/0-1   [000] d..1.    63.479736: clear_local_APIC.part.0 <-disable_local_APIC
   swapper/0-1   [000] d..1.    63.479763: restore_boot_irq_mode <-native_machine_shutdown
   swapper/0-1   [000] d..1.    63.479763: native_restore_boot_irq_mode <-native_machine_shutdown
   swapper/0-1   [000] d..1.    63.479764: disconnect_bsp_APIC <-native_machine_shutdown
   swapper/0-1   [000] d..1.    63.479777: hpet_disable <-native_machine_shutdown
   swapper/0-1   [000] d..1.    63.479778: iommu_shutdown_noop <-native_machine_restart
   swapper/0-1   [000] d..1.    63.479779: native_machine_emergency_restart <-__do_sys_reboot
   swapper/0-1   [000] d..1.    63.479779: tboot_shutdown <-native_machine_emergency_restart
   swapper/0-1   [000] d..1.    63.479790: acpi_reboot <-native_machine_emergency_restart
   swapper/0-1   [000] d..1.    63.479791: acpi_reset <-acpi_reboot
   swapper/0-1   [000] d..1.    63.479791: 
[PATCH v5 02/13] ring-buffer: Add ring_buffer_alloc_range()

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

In preparation to allowing the trace ring buffer to be allocated in a
range of memory that is persistent across reboots, add
ring_buffer_alloc_range(). It takes a contiguous range of memory and will
split it up evenly for the per CPU ring buffers.

If there's not enough memory to handle all CPUs with the minimum size, it
will fail to allocate the ring buffer.
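
For reference, patch 04 of this series ends up calling it like this when
creating a boot-mapped instance:

	buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0,
					      tr->range_addr_start,
					      tr->range_addr_size);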

Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/ring_buffer.h |  17 +++
 kernel/trace/ring_buffer.c  | 239 ++--
 2 files changed, 220 insertions(+), 36 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 96d2140b471e..a50b0223b1d3 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -89,6 +89,11 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer,
 struct trace_buffer *
__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key);
 
+struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned flags,
+  int order, unsigned long start,
+  unsigned long range_size,
+  struct lock_class_key *key);
+
 /*
  * Because the ring buffer is generic, if other users of the ring buffer get
  * traced by ftrace, it can produce lockdep warnings. We need to keep each
@@ -100,6 +105,18 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k
__ring_buffer_alloc((size), (flags), &__key);   \
 })
 
+/*
+ * Because the ring buffer is generic, if other users of the ring buffer get
+ * traced by ftrace, it can produce lockdep warnings. We need to keep each
+ * ring buffer's lock class separate.
+ */
+#define ring_buffer_alloc_range(size, flags, order, start, range_size) \
+({ \
+   static struct lock_class_key __key; \
+   __ring_buffer_alloc_range((size), (flags), (order), (start),\
+ (range_size), &__key);\
+})
+
 typedef bool (*ring_buffer_cond_fn)(void *data);
 int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full,
 ring_buffer_cond_fn cond, void *data);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index aa8eb878e0d4..fd52cad34b0f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -42,6 +42,9 @@
 
 static void update_pages_handler(struct work_struct *work);
 
+struct ring_buffer_meta {
+};
+
 /*
  * The ring buffer header is special. We must manually up keep it.
  */
@@ -342,7 +345,8 @@ struct buffer_page {
local_t  entries;   /* entries on this page */
unsigned longreal_end;  /* real end of data */
unsigned order; /* order of the page */
-   u32  id;/* ID for external mapping */
+   u32  id:30; /* ID for external mapping */
+   u32  range:1;   /* Mapped via a range */
struct buffer_data_page *page;  /* Actual data page */
 };
 
@@ -373,7 +377,9 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
 
 static void free_buffer_page(struct buffer_page *bpage)
 {
-   free_pages((unsigned long)bpage->page, bpage->order);
+   /* Range pages are not to be freed */
+   if (!bpage->range)
+   free_pages((unsigned long)bpage->page, bpage->order);
kfree(bpage);
 }
 
@@ -524,6 +530,9 @@ struct trace_buffer {
struct rb_irq_work  irq_work;
booltime_stamp_abs;
 
+   unsigned long   range_addr_start;
+   unsigned long   range_addr_end;
+
unsigned intsubbuf_size;
unsigned intsubbuf_order;
unsigned intmax_data_size;
@@ -1491,9 +1500,70 @@ static void rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
}
 }
 
+/*
+ * Take an address, add the meta data size as well as the array of
+ * array subbuffer indexes, then align it to a subbuffer size.
+ *
+ * This is used to help find the next per cpu subbuffer within a mapped range.
+ */
+static unsigned long
+rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs)
+{
+   addr += sizeof(struct ring_buffer_meta) +
+   sizeof(int) * nr_subbufs;
+   return ALIGN(addr, subbuf_size);
+}
+
+/*
+ * Return a specific sub-buffer for a given @cpu defined by @idx.
+ */
+static void *rb_range_buffer(struct trace_buffer *buffer, int cpu, int nr_pages, int idx)
+{
+   unsigned long ptr;
+   int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+   int nr_subbufs;
+
+   /* Include the reader page */
+   nr_subbufs = 

[PATCH v2 2/2] virt: pvmemcontrol: add Yuanchu and Pasha as maintainers

2024-06-11 Thread Yuanchu Xie
The pvmemcontrol driver lives under drivers/virt/pvmemcontrol. We
specify maintainers for the driver.

Signed-off-by: Yuanchu Xie 
---
 MAINTAINERS | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index aacccb376c28..5e661f39e07d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -18125,6 +18125,13 @@ L: linux-wirel...@vger.kernel.org
 S: Supported
 F: drivers/net/wireless/purelifi/plfxlc/
 
+PVMEMCONTROL GUEST DRIVER
+M: Yuanchu Xie 
+M: Pasha Tatashin 
+L: linux-kernel@vger.kernel.org
+S: Supported
+F: drivers/virt/pvmemcontrol/
+
 PVRUSB2 VIDEO4LINUX DRIVER
 M: Mike Isely 
 L: pvru...@isely.net   (subscribers-only)
-- 
2.45.2.505.gda0bf45e8d-goog




[PATCH v2 1/2] virt: pvmemcontrol: control guest physical memory properties

2024-06-11 Thread Yuanchu Xie
Pvmemcontrol provides a way for the guest to control its physical memory
properties, and enables optimizations and security features. For
example, the guest can tell the host which parts of a hugepage may be
left unbacked, or that sensitive data must not be swapped out.

Pvmemcontrol allows a guest to manipulate its gPTE entries in the SLAT,
and also some other properties of the host memory that backs its memory
map.
This is achieved by using the KVM_CAP_SYNC_MMU capability. When this
capability is available, the changes in the backing of the memory region
on the host are automatically reflected into the guest. For example, an
mmap() or madvise() that affects the region will be made visible
immediately.

There are two components of the implementation: the guest Linux driver
and Virtual Machine Monitor (VMM) device. A guest-allocated shared
buffer is negotiated per-cpu through a few PCI MMIO registers, the VMM
device assigns a unique command for each per-cpu buffer. The guest
writes its pvmemcontrol request in the per-cpu buffer, then writes the
corresponding command into the command register, calling into the VMM
device to perform the pvmemcontrol request.

The synchronous per-cpu shared buffer approach avoids the kick and busy
waiting that the guest would have to do with virtio virtqueue transport.
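
For illustration, the guest-side submit path described above might look
roughly like this (a sketch only; every name below is an assumption, not
the actual driver code):

```
/* Sketch: write a request into this CPU's shared buffer, then call into
 * the device by writing the per-cpu command to the MMIO command register. */
static int pvmemcontrol_submit(struct pvmemcontrol_dev *dev,
			       const struct pvmemcontrol_req *req)
{
	struct pvmemcontrol_buf *buf = this_cpu_ptr(dev->pcpu_bufs);
	u32 cmd = this_cpu_read(*dev->pcpu_cmds);

	buf->call = *req;		/* request in shared memory */
	iowrite32(cmd, dev->base + PVMEMCONTROL_COMMAND); /* sync call */
	return le32_to_cpu(buf->ret_errno);	/* device fills the reply */
}
```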

User API
From userland, the pvmemcontrol guest driver is controlled via the
ioctl(2) call. It requires CAP_SYS_ADMIN.

ioctl(fd, PVMEMCONTROL_IOCTL, struct pvmemcontrol_buf *buf);

Guest userland applications can tag VMAs and guest hugepages, or advise
the host on how to handle sensitive guest pages.

Supported function codes and their use cases:
PVMEMCONTROL_FREE/REMOVE/DONTNEED/PAGEOUT: for the guest, one can reduce
the struct page and page table lookup overhead by using hugepages backed
by smaller pages on the host. These pvmemcontrol commands can allow for
partial freeing of private guest hugepages to save memory. They also
allow kernel memory, such as kernel stacks and task_structs to be
paravirtualized if we expose kernel APIs.

PVMEMCONTROL_UNMERGEABLE is useful for security, when the VM does not
want to share its backing pages.
The same goes for PVMEMCONTROL_DONTDUMP, so sensitive pages are not included
in a dump.
MLOCK/UNLOCK can advise the host that sensitive information is not
swapped out on the host.

PVMEMCONTROL_MPROTECT_NONE/R/W/RW: for guest stacks backed by hugepages,
stack guard pages can be handled in the host and memory can be saved in
the hugepage.

PVMEMCONTROL_SET_VMA_ANON_NAME is useful for observability and debugging
how guest memory is being mapped on the host.

Sample program making use of PVMEMCONTROL_DONTNEED:
https://github.com/Dummyc0m/pvmemcontrol-user
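
For reference, a minimal userspace sketch in the spirit of that sample
program (the struct field names and device path below are assumptions;
the real layout lives in include/uapi/linux/pvmemcontrol.h):

```
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/pvmemcontrol.h>

int main(void)
{
	/* Field names here are illustrative assumptions. */
	struct pvmemcontrol_buf buf = {
		.call.func_code = PVMEMCONTROL_DONTNEED,
		.call.addr = 0x7f0000000000ULL,	/* guest VA to advise */
		.call.length = 2UL << 20,	/* one 2 MiB hugepage */
	};
	int fd = open("/dev/pvmemcontrol", O_RDWR); /* assumed node path */

	if (fd < 0 || ioctl(fd, PVMEMCONTROL_IOCTL, &buf) < 0) {
		perror("pvmemcontrol");
		return 1;
	}
	return 0;
}
```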

The VMM implementation is being proposed for Cloud Hypervisor:
https://github.com/Dummyc0m/cloud-hypervisor/

Cloud Hypervisor issue:
https://github.com/cloud-hypervisor/cloud-hypervisor/issues/6318

-
Changelog
PATCH v1 -> v2
- fixed byte order sparse warning. ioread/write already does
  little-endian.
- add include for linux/percpu.h
RFC v1 -> PATCH v1
- renamed memctl to pvmemcontrol
- defined device endianness as little endian

Signed-off-by: Yuanchu Xie 
---
 .../userspace-api/ioctl/ioctl-number.rst  |   2 +
 drivers/virt/Kconfig  |   2 +
 drivers/virt/Makefile |   1 +
 drivers/virt/pvmemcontrol/Kconfig |  10 +
 drivers/virt/pvmemcontrol/Makefile|   2 +
 drivers/virt/pvmemcontrol/pvmemcontrol.c  | 459 ++
 include/uapi/linux/pvmemcontrol.h |  75 +++
 7 files changed, 551 insertions(+)
 create mode 100644 drivers/virt/pvmemcontrol/Kconfig
 create mode 100644 drivers/virt/pvmemcontrol/Makefile
 create mode 100644 drivers/virt/pvmemcontrol/pvmemcontrol.c
 create mode 100644 include/uapi/linux/pvmemcontrol.h

diff --git a/Documentation/userspace-api/ioctl/ioctl-number.rst 
b/Documentation/userspace-api/ioctl/ioctl-number.rst
index a141e8e65c5d..34a9954cafc7 100644
--- a/Documentation/userspace-api/ioctl/ioctl-number.rst
+++ b/Documentation/userspace-api/ioctl/ioctl-number.rst
@@ -372,6 +372,8 @@ Code  Seq#    Include File                                           Comments
0xCD  01             linux/reiserfs_fs.h
0xCE  01-02          uapi/linux/cxl_mem.h                        Compute Express Link Memory Devices
 0xCF  02 fs/smb/client/cifs_ioctl.h
+0xDA  00             uapi/linux/pvmemcontrol.h                   Pvmemcontrol Device
+

 0xDB  00-0F  drivers/char/mwave/mwavepub.h
0xDD  00-3F                                                       ZFCP device driver see drivers/s390/scsi/

diff --git a/drivers/virt/Kconfig b/drivers/virt/Kconfig
index d8c848cf09a6..454e347a90cf 100644
--- a/drivers/virt/Kconfig
+++ b/drivers/virt/Kconfig

Re: [PATCH 1/2] vdpa: support set mac address from vdpa tool

2024-06-11 Thread Jakub Kicinski
On Tue, 11 Jun 2024 13:32:32 +0800 Cindy Lu wrote:
> Add a new UAPI to support setting the MAC address from the vdpa tool.
> The function vdpa_nl_cmd_dev_config_set_doit() will get the
> MAC address from the vdpa tool and then set it on the device.
> 
> The usage is: vdpa dev set name vdpa_name mac **:**:**:**:**:**

Why don't you use devlink?



Re: [PATCH v4 01/13] ring-buffer: Allow mapped field to be set without mapping

2024-06-11 Thread Steven Rostedt
On Tue, 11 Jun 2024 21:39:37 -0400
Steven Rostedt  wrote:

> > 
> > Maybe explain why sometimes __rb_inc_dec_mapped() is called to
> > increment or decrement ->mapped, and sometimes it is done directly?
> > I can see that the function also acquires the buffer mutex, which
> > isn't needed at the places where mapped is incremented/decremented
> > directly, but common code would still be nice, and it is odd to see
> > over/underflows handled sometimes but not always.  
> 
> Sure. I'll add comments explaining more.

And I found a bug with this code. It assumes that mapped will be equal
to 1 if it's the last mapping. That will no longer be the case.

-- Steve



Re: [PATCH v4 01/13] ring-buffer: Allow mapped field to be set without mapping

2024-06-11 Thread Steven Rostedt
On Tue, 11 Jun 2024 16:53:43 -0700
Guenter Roeck  wrote:

> >>> @@ -6403,7 +6407,8 @@ int ring_buffer_unmap(struct trace_buffer *buffer, 
> >>> int cpu)
> >>>   mutex_lock(&buffer->mutex);
> >>>   raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
> >>>
> >>> - cpu_buffer->mapped = 0;
> >>> + WARN_ON_ONCE(!cpu_buffer->mapped);
> >>> + cpu_buffer->mapped--;  
> >>
> >> This will wrap to UINT_MAX if it was 0. Is that intentional ?  
> > 
> > If mapped is non zero, it limits what it can do. If it enters here as zero,
> > we are really in a unknown state, so yeah, wrapping will just keep it
> > limited. Which is a good thing.
> > 
> > Do you want me to add a comment there?
> >   
> 
> Maybe. I just wondered if something like
>   if (!WARN_ON_ONCE(!cpu_buffer->mapped))
>   cpu_buffer->mapped--;
> 
> would be better than wrapping because 'mapped' is used as flag elsewhere,
> but then I can see that it is also manipulated in __rb_inc_dec_mapped(),
> and that it is checked against UINT_MAX there (and not decremented if it is 
> 0).

Yeah, the __rb_inc_dec_mapped() is used as it is called when external
sources map the ring buffer. 

This is incremented and decremented internally. That is, we increment
it the first time the ring buffer is mapped, and decrement it again when
it is unmapped for the last time.

I could add the above logic as well. I hit a bug in my more vigorous
testing so I need to make another revision anyway.

> 
> Maybe explain why sometimes __rb_inc_dec_mapped() is called to
> increment or decrement ->mapped, and sometimes it is done directly?
> I can see that the function also acquires the buffer mutex, which
> isn't needed at the places where mapped is incremented/decremented
> directly, but common code would still be nice, and it is odd to see
> over/underflows handled sometimes but not always.

Sure. I'll add comments explaining more.
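
For illustration, a common helper along the lines Guenter suggests could
look like this (a sketch only, not the actual fix):

```
/*
 * Sketch: one underflow-checked decrement shared by both paths. The
 * internal caller already holds the reader_lock; the helper used by
 * external mappers would keep taking buffer->mutex around it.
 */
static void rb_dec_mapped(struct ring_buffer_per_cpu *cpu_buffer)
{
	if (!WARN_ON_ONCE(!cpu_buffer->mapped))
		cpu_buffer->mapped--;
}
```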

Thanks,

-- Steve



Re: [PATCH 3/3] tracing/kprobe: Remove cleanup code unrelated to selftest

2024-06-11 Thread Steven Rostedt
On Tue, 11 Jun 2024 06:26:53 +0900
"Masami Hiramatsu (Google)"  wrote:

> From: Masami Hiramatsu (Google) 
> 
> Cleaning up all kprobe events is not related to the selftest
> itself, and it can fail for reasons unrelated to this test.
> If the test is successful, the generated events are cleaned up.
> And if not, we cannot guarantee that the kprobe events will work
> correctly. So, anyway, there is no need to clean them up.
> 
> Signed-off-by: Masami Hiramatsu (Google) 

Reviewed-by: Steven Rostedt (Google) 

-- Steve



[PATCH 1/2] hugetlbfs: support tracepoint

2024-06-11 Thread Hongbo Li
Add basic tracepoints for {alloc, evict, free}_inode, setattr and
fallocate. These can help users to debug hugetlbfs more conveniently.

Signed-off-by: Hongbo Li 
---
 MAINTAINERS  |   1 +
 include/trace/events/hugetlbfs.h | 164 +++
 2 files changed, 165 insertions(+)
 create mode 100644 include/trace/events/hugetlbfs.h

diff --git a/MAINTAINERS b/MAINTAINERS
index aacccb376c28..df6fe4aa0f50 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10187,6 +10187,7 @@ F:  Documentation/mm/hugetlbfs_reserv.rst
 F: Documentation/mm/vmemmap_dedup.rst
 F: fs/hugetlbfs/
 F: include/linux/hugetlb.h
+F: include/trace/events/hugetlbfs.h
 F: mm/hugetlb.c
 F: mm/hugetlb_vmemmap.c
 F: mm/hugetlb_vmemmap.h
diff --git a/include/trace/events/hugetlbfs.h b/include/trace/events/hugetlbfs.h
new file mode 100644
index ..a4d785c87155
--- /dev/null
+++ b/include/trace/events/hugetlbfs.h
@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hugetlbfs
+
+#if !defined(_TRACE_HUGETLBFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_HUGETLBFS_H
+
+#include 
+
+TRACE_EVENT(hugetlbfs_alloc_inode,
+
+   TP_PROTO(struct inode *inode, struct inode *dir, int mode),
+
+   TP_ARGS(inode, dir, mode),
+
+   TP_STRUCT__entry(
+   __field(dev_t,  dev)
+   __field(ino_t,  ino)
+   __field(ino_t,  dir)
+   __field(__u16,  mode)
+   ),
+
+   TP_fast_assign(
+   __entry->dev= inode->i_sb->s_dev;
+   __entry->ino= inode->i_ino;
+   __entry->dir= dir->i_ino;
+   __entry->mode   = mode;
+   ),
+
+   TP_printk("dev = (%d,%d), ino = %lu, dir = %lu, mode = 0%o",
+   MAJOR(__entry->dev), MINOR(__entry->dev),
+   (unsigned long) __entry->ino,
+   (unsigned long) __entry->dir, __entry->mode)
+);
+
+DECLARE_EVENT_CLASS(hugetlbfs__inode,
+
+   TP_PROTO(struct inode *inode),
+
+   TP_ARGS(inode),
+
+   TP_STRUCT__entry(
+   __field(dev_t,  dev)
+   __field(ino_t,  ino)
+   __field(__u16,  mode)
+   __field(loff_t, size)
+   __field(unsigned int,   nlink)
+   __field(unsigned int,   seals)
+   __field(blkcnt_t,   blocks)
+   ),
+
+   TP_fast_assign(
+   __entry->dev= inode->i_sb->s_dev;
+   __entry->ino= inode->i_ino;
+   __entry->mode   = inode->i_mode;
+   __entry->size   = inode->i_size;
+   __entry->nlink  = inode->i_nlink;
+   __entry->seals  = HUGETLBFS_I(inode)->seals;
+   __entry->blocks = inode->i_blocks;
+   ),
+
+   TP_printk("dev = (%d,%d), ino = %lu, i_mode = 0%o, i_size = %lld, 
i_nlink = %u, seals = %u, i_blocks = %llu",
+   MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) 
__entry->ino,
+   __entry->mode, __entry->size, __entry->nlink, __entry->seals,
+   (unsigned long long)__entry->blocks)
+);
+
+DEFINE_EVENT(hugetlbfs__inode, hugetlbfs_evict_inode,
+
+   TP_PROTO(struct inode *inode),
+
+   TP_ARGS(inode)
+);
+
+DEFINE_EVENT(hugetlbfs__inode, hugetlbfs_free_inode,
+
+   TP_PROTO(struct inode *inode),
+
+   TP_ARGS(inode)
+);
+
+TRACE_EVENT(hugetlbfs_setattr,
+
+   TP_PROTO(struct inode *inode,
+   unsigned int d_len, const unsigned char *d_name,
+   unsigned int ia_valid, unsigned int ia_mode,
+   unsigned int ia_uid, unsigned int ia_gid,
+   loff_t old_size, loff_t ia_size),
+
+   TP_ARGS(inode, d_len, d_name,
+   ia_valid, ia_mode, ia_uid, ia_gid, old_size, ia_size),
+
+   TP_STRUCT__entry(
+   __field(dev_t,  dev)
+   __field(ino_t,  ino)
+   __field(unsigned int,   d_len)
+   __string(d_name,d_name)
+   __field(unsigned int,   ia_valid)
+   __field(unsigned int,   ia_mode)
+   __field(unsigned int,   ia_uid)
+   __field(unsigned int,   ia_gid)
+   __field(loff_t, old_size)
+   __field(loff_t, ia_size)
+   ),
+
+   TP_fast_assign(
+   __entry->dev= inode->i_sb->s_dev;
+   __entry->ino= inode->i_ino;
+   __entry->d_len  = d_len;
+   __assign_str(d_name);
+   __entry->ia_valid   = ia_valid;
+   __entry->ia_mode= ia_mode;
+   __entry->ia_uid = ia_uid;
+   __entry->ia_gid = ia_gid;
+   __entry->old_size   = old_size;
+

[PATCH 2/2] hugetlbfs: use tracepoints in hugetlbfs functions.

2024-06-11 Thread Hongbo Li
Here we hook the hugetlbfs tracepoints into the corresponding functions.
The output in the trace looks as follows:

```
touch-5307[004] .  1402.167607: hugetlbfs_alloc_inode: dev = (0,50), ino = 21380, dir = 16921, mode = 0100644
touch-5307[004] .  1402.167638: hugetlbfs_setattr: dev = (0,50), ino = 21380, name = testfile1, ia_valid = 131184, ia_mode = 0132434, ia_uid = 2863018275, ia_gid = 4294967295, old_size = 0, ia_size = 4064
truncate-5328[003] .  1436.031054: hugetlbfs_setattr: dev = (0,50), ino = 21380, name = testfile1, ia_valid = 8296, ia_mode = 017, ia_uid = 2862574544, ia_gid = 4294967295, old_size = 0, ia_size = 2097152
rm-5338[004] .  1484.426247: hugetlbfs_evict_inode: dev = (0,50), ino = 21380, i_mode = 0100644, i_size = 2097152, i_nlink = 0, seals = 1, i_blocks = 0
-0   [004] ..s1.  1484.446668: hugetlbfs_free_inode: dev = (0,50), ino = 21380, i_mode = 0100644, i_size = 2097152, i_nlink = 0, seals = 1, i_blocks = 0
```

Signed-off-by: Hongbo Li 
---
 fs/hugetlbfs/inode.c | 21 +++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 412f295acebe..f3399c6a02ca 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -39,6 +39,9 @@
 #include 
 #include 
 
+#define CREATE_TRACE_POINTS
+#include 
+
 static const struct address_space_operations hugetlbfs_aops;
 static const struct file_operations hugetlbfs_file_operations;
 static const struct inode_operations hugetlbfs_dir_inode_operations;
@@ -686,6 +689,7 @@ static void hugetlbfs_evict_inode(struct inode *inode)
 {
struct resv_map *resv_map;
 
+   trace_hugetlbfs_evict_inode(inode);
remove_inode_hugepages(inode, 0, LLONG_MAX);
 
/*
@@ -813,8 +817,10 @@ static long hugetlbfs_fallocate(struct file *file, int 
mode, loff_t offset,
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
 
-   if (mode & FALLOC_FL_PUNCH_HOLE)
-   return hugetlbfs_punch_hole(inode, offset, len);
+   if (mode & FALLOC_FL_PUNCH_HOLE) {
+   error = hugetlbfs_punch_hole(inode, offset, len);
+   goto out_nolock;
+   }
 
/*
 * Default preallocate case.
@@ -918,6 +924,9 @@ static long hugetlbfs_fallocate(struct file *file, int 
mode, loff_t offset,
inode_set_ctime_current(inode);
 out:
inode_unlock(inode);
+
+out_nolock:
+   trace_hugetlbfs_fallocate(inode, mode, offset, len, error);
return error;
 }
 
@@ -934,6 +943,12 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap,
if (error)
return error;
 
+   trace_hugetlbfs_setattr(inode, dentry->d_name.len, dentry->d_name.name,
+   attr->ia_valid, attr->ia_mode,
+   from_kuid(&init_user_ns, attr->ia_uid),
+   from_kgid(&init_user_ns, attr->ia_gid),
+   inode->i_size, attr->ia_size);
+
if (ia_valid & ATTR_SIZE) {
loff_t oldsize = inode->i_size;
loff_t newsize = attr->ia_size;
@@ -1032,6 +1047,7 @@ static struct inode *hugetlbfs_get_inode(struct 
super_block *sb,
break;
}
lockdep_annotate_inode_mutex_key(inode);
+   trace_hugetlbfs_alloc_inode(inode, dir, mode);
} else {
if (resv_map)
kref_put(&resv_map->refs, resv_map_release);
@@ -1274,6 +1290,7 @@ static struct inode *hugetlbfs_alloc_inode(struct 
super_block *sb)
 
 static void hugetlbfs_free_inode(struct inode *inode)
 {
+   trace_hugetlbfs_free_inode(inode);
kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
 }
 
-- 
2.34.1




[PATCH 0/2] Introduce tracepoint for hugetlbfs

2024-06-11 Thread Hongbo Li
Here we add some basic tracepoints for debugging hugetlbfs: {alloc, free,
evict}_inode, setattr and fallocate.

Hongbo Li (2):
  hugetlbfs: support tracepoint
  hugetlbfs: use tracepoints in hugetlbfs functions.

 MAINTAINERS  |   1 +
 fs/hugetlbfs/inode.c |  21 +++-
 include/trace/events/hugetlbfs.h | 164 +++
 3 files changed, 184 insertions(+), 2 deletions(-)
 create mode 100644 include/trace/events/hugetlbfs.h

-- 
2.34.1




Re: [PATCH v4 01/13] ring-buffer: Allow mapped field to be set without mapping

2024-06-11 Thread Guenter Roeck

On 6/11/24 15:53, Steven Rostedt wrote:

On Tue, 11 Jun 2024 15:43:59 -0700
Guenter Roeck  wrote:


On 6/11/24 12:28, Steven Rostedt wrote:

From: "Steven Rostedt (Google)" 

In preparation for having the ring buffer mapped to a dedicated location,
which will have the same restrictions as user space memory mapped buffers,
allow it to use the "mapped" field of the ring_buffer_per_cpu structure
without having the user space meta page mapping.

When this starts using the mapped field, it will need to handle adding a
user space mapping (and removing it) from a ring buffer that is using a
dedicated memory range.

Signed-off-by: Steven Rostedt (Google) 
---
   kernel/trace/ring_buffer.c | 11 ---
   1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 28853966aa9a..78beaccf9c8c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5224,6 +5224,9 @@ static void rb_update_meta_page(struct 
ring_buffer_per_cpu *cpu_buffer)
   {
struct trace_buffer_meta *meta = cpu_buffer->meta_page;
   
+	if (!meta)

+   return;
+
meta->reader.read = cpu_buffer->reader_page->read;
meta->reader.id = cpu_buffer->reader_page->id;
meta->reader.lost_events = cpu_buffer->lost_events;
@@ -6167,7 +6170,7 @@ rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu)
   
mutex_lock(&cpu_buffer->mapping_lock);
   
-	if (!cpu_buffer->mapped) {

+   if (!cpu_buffer->mapped || !cpu_buffer->meta_page) {
mutex_unlock(&cpu_buffer->mapping_lock);
return ERR_PTR(-ENODEV);
}
@@ -6359,12 +6362,13 @@ int ring_buffer_map(struct trace_buffer *buffer, int 
cpu,
 */
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
rb_setup_ids_meta_page(cpu_buffer, subbuf_ids);
+


Picky again. Is that a leftover from something ? I don't see an immediate reason
for the added newline.


Hmm, I could remove it.




raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
   
   	err = __rb_map_vma(cpu_buffer, vma);

if (!err) {
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
-   cpu_buffer->mapped = 1;
+   cpu_buffer->mapped++;
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
} else {
kfree(cpu_buffer->subbuf_ids);
@@ -6403,7 +6407,8 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int 
cpu)
mutex_lock(&buffer->mutex);
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
   
-	cpu_buffer->mapped = 0;

+   WARN_ON_ONCE(!cpu_buffer->mapped);
+   cpu_buffer->mapped--;


This will wrap to UINT_MAX if it was 0. Is that intentional ?


If mapped is non zero, it limits what it can do. If it enters here as zero,
we are really in a unknown state, so yeah, wrapping will just keep it
limited. Which is a good thing.

Do you want me to add a comment there?



Maybe. I just wondered if something like
if (!WARN_ON_ONCE(!cpu_buffer->mapped))
cpu_buffer->mapped--;

would be better than wrapping because 'mapped' is used as flag elsewhere,
but then I can see that it is also manipulated in __rb_inc_dec_mapped(),
and that it is checked against UINT_MAX there (and not decremented if it is 0).

Maybe explain why sometimes __rb_inc_dec_mapped() is called to increment
or decrement ->mapped, and sometimes it is done directly? I can see that
the function also acquires the buffer mutex, which isn't needed at the places
where mapped is incremented/decremented directly, but common code would
still be nice, and it is odd to see over/underflows handled sometimes but
not always.

Thanks,
Guenter




Re: [PATCH v3 3/3] tracing/kprobe: Remove cleanup code unrelated to selftest

2024-06-11 Thread Google
On Tue, 11 Jun 2024 10:25:00 -0400
Steven Rostedt  wrote:

> On Tue, 11 Jun 2024 22:30:56 +0900
> "Masami Hiramatsu (Google)"  wrote:
> 
> > From: Masami Hiramatsu (Google) 
> > 
> > Cleaning up all kprobe events is not related to the selftest
> > itself, and it can fail for reasons unrelated to this test.
> > If the test is successful, the generated events are cleaned up.
> > And if not, we cannot guarantee that the kprobe events will work
> > correctly. So, anyway, there is no need to clean them up.
> > 
> > Signed-off-by: Masami Hiramatsu (Google) 
> 
> Reviewed-by: Steven Rostedt (Google) 

Thanks for review!

> 
> -- Steve
> 
> > ---
> >  kernel/trace/trace_kprobe.c |4 
> >  1 file changed, 4 deletions(-)
> > 
> > diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> > index 8c5816c04bd2..7fd0f8576e4c 100644
> > --- a/kernel/trace/trace_kprobe.c
> > +++ b/kernel/trace/trace_kprobe.c
> > @@ -2114,10 +2114,6 @@ static __init int kprobe_trace_self_tests_init(void)
> >  
> >  
> >  end:
> > -   ret = dyn_events_release_all(&trace_kprobe_ops);
> > -   if (WARN_ONCE(ret, "error on cleaning up probes."))
> > -   warn++;
> > -
> > /*
> >  * Wait for the optimizer work to finish. Otherwise it might fiddle
> >  * with probes in already freed __init text.
> 
> 


-- 
Masami Hiramatsu (Google) 



Re: [PATCH V2 2/2] soc: qcom: smp2p: Introduce tracepoint support

2024-06-11 Thread Chris Lew




On 6/11/2024 5:33 AM, Sudeepgoud Patil wrote:

This commit introduces tracepoint support for smp2p,
enabling logging of communication between local and remote processors.
The tracepoints include information about the remote processor ID,
remote subsystem name, negotiation details, supported features,
bit change notifications, and SSR activity.
These tracepoints are valuable for debugging issues between subsystems.

Signed-off-by: Sudeepgoud Patil 
---

...

diff --git a/drivers/soc/qcom/trace-smp2p.h b/drivers/soc/qcom/trace-smp2p.h
new file mode 100644
index ..833782460b57
--- /dev/null
+++ b/drivers/soc/qcom/trace-smp2p.h
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM qcom_smp2p
+
+#if !defined(__QCOM_SMP2P_TRACE_H__) || defined(TRACE_HEADER_MULTI_READ)
+#define __QCOM_SMP2P_TRACE_H__
+
+#include 
+
+#define SMP2P_FEATURE_SSR_ACK 0x1


Now that I see it, redefining the feature flag here seems a bit out 
of place. I'm not sure if it's worth kicking off a header file for this 
single define though.



+
+TRACE_EVENT(smp2p_ssr_ack,
+   TP_PROTO(unsigned int remote_pid, char *irq_devname),
+   TP_ARGS(remote_pid, irq_devname),
+   TP_STRUCT__entry(
+   __field(u32, remote_pid)
+   __string(irq_devname, irq_devname)
+   ),
+   TP_fast_assign(
+   __entry->remote_pid = remote_pid;
+   __assign_str(irq_devname, irq_devname);
+   ),
+   TP_printk("%d: %s: SSR detected, doing SSR Handshake",
+   __entry->remote_pid,
+   __get_str(irq_devname)
+   )
+);
+


I don't think we need to pass remote_pid into all of the traces if we 
have a unique name "irq_devname" to identify the remote now. We could 
remove remote_pid from all the trace event arguments.


We can probably drop the "doing SSR Handshake" part of this print. I 
think it can be assumed that we're doing the handshake once we've 
detected SSR.
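
With both suggestions applied, the event could shrink to something like
this (sketch only):

```
TRACE_EVENT(smp2p_ssr_ack,
	TP_PROTO(char *irq_devname),
	TP_ARGS(irq_devname),
	TP_STRUCT__entry(
		__string(irq_devname, irq_devname)
	),
	TP_fast_assign(
		__assign_str(irq_devname, irq_devname);
	),
	TP_printk("%s: SSR detected", __get_str(irq_devname))
);
```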




Re: [PATCH v4 01/13] ring-buffer: Allow mapped field to be set without mapping

2024-06-11 Thread Steven Rostedt
On Tue, 11 Jun 2024 15:43:59 -0700
Guenter Roeck  wrote:

> On 6/11/24 12:28, Steven Rostedt wrote:
> > From: "Steven Rostedt (Google)" 
> > 
> > In preparation for having the ring buffer mapped to a dedicated location,
> > which will have the same restrictions as user space memory mapped buffers,
> > allow it to use the "mapped" field of the ring_buffer_per_cpu structure
> > without having the user space meta page mapping.
> > 
> > When this starts using the mapped field, it will need to handle adding a
> > user space mapping (and removing it) from a ring buffer that is using a
> > dedicated memory range.
> > 
> > Signed-off-by: Steven Rostedt (Google) 
> > ---
> >   kernel/trace/ring_buffer.c | 11 ---
> >   1 file changed, 8 insertions(+), 3 deletions(-)
> > 
> > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> > index 28853966aa9a..78beaccf9c8c 100644
> > --- a/kernel/trace/ring_buffer.c
> > +++ b/kernel/trace/ring_buffer.c
> > @@ -5224,6 +5224,9 @@ static void rb_update_meta_page(struct 
> > ring_buffer_per_cpu *cpu_buffer)
> >   {
> > struct trace_buffer_meta *meta = cpu_buffer->meta_page;
> >   
> > +   if (!meta)
> > +   return;
> > +
> > meta->reader.read = cpu_buffer->reader_page->read;
> > meta->reader.id = cpu_buffer->reader_page->id;
> > meta->reader.lost_events = cpu_buffer->lost_events;
> > @@ -6167,7 +6170,7 @@ rb_get_mapped_buffer(struct trace_buffer *buffer, int 
> > cpu)
> >   
> > mutex_lock(&cpu_buffer->mapping_lock);
> >   
> > -   if (!cpu_buffer->mapped) {
> > +   if (!cpu_buffer->mapped || !cpu_buffer->meta_page) {
> > mutex_unlock(&cpu_buffer->mapping_lock);
> > return ERR_PTR(-ENODEV);
> > }
> > @@ -6359,12 +6362,13 @@ int ring_buffer_map(struct trace_buffer *buffer, 
> > int cpu,
> >  */
> > raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
> > rb_setup_ids_meta_page(cpu_buffer, subbuf_ids);
> > +  
> 
> Picky again. Is that a leftover from something ? I don't see an immediate 
> reason
> for the added newline.

Hmm, I could remove it.

> 
> > raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
> >   
> > err = __rb_map_vma(cpu_buffer, vma);
> > if (!err) {
> > raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
> > -   cpu_buffer->mapped = 1;
> > +   cpu_buffer->mapped++;
> > raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
> > } else {
> > kfree(cpu_buffer->subbuf_ids);
> > @@ -6403,7 +6407,8 @@ int ring_buffer_unmap(struct trace_buffer *buffer, 
> > int cpu)
> > mutex_lock(&buffer->mutex);
> > raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
> >   
> > -   cpu_buffer->mapped = 0;
> > +   WARN_ON_ONCE(!cpu_buffer->mapped);
> > +   cpu_buffer->mapped--;  
> 
> This will wrap to UINT_MAX if it was 0. Is that intentional ?

If mapped is non zero, it limits what it can do. If it enters here as zero,
we are really in a unknown state, so yeah, wrapping will just keep it
limited. Which is a good thing.

Do you want me to add a comment there?

-- Steve


> 
> >   
> > 	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
> > 




Re: [PATCH v4 01/13] ring-buffer: Allow mapped field to be set without mapping

2024-06-11 Thread Guenter Roeck

On 6/11/24 12:28, Steven Rostedt wrote:

From: "Steven Rostedt (Google)" 

In preparation for having the ring buffer mapped to a dedicated location,
which will have the same restrictions as user space memory mapped buffers,
allow it to use the "mapped" field of the ring_buffer_per_cpu structure
without having the user space meta page mapping.

When this starts using the mapped field, it will need to handle adding a
user space mapping (and removing it) from a ring buffer that is using a
dedicated memory range.

Signed-off-by: Steven Rostedt (Google) 
---
  kernel/trace/ring_buffer.c | 11 ---
  1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 28853966aa9a..78beaccf9c8c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5224,6 +5224,9 @@ static void rb_update_meta_page(struct 
ring_buffer_per_cpu *cpu_buffer)
  {
struct trace_buffer_meta *meta = cpu_buffer->meta_page;
  
+	if (!meta)

+   return;
+
meta->reader.read = cpu_buffer->reader_page->read;
meta->reader.id = cpu_buffer->reader_page->id;
meta->reader.lost_events = cpu_buffer->lost_events;
@@ -6167,7 +6170,7 @@ rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu)
  
mutex_lock(&cpu_buffer->mapping_lock);
  
-	if (!cpu_buffer->mapped) {

+   if (!cpu_buffer->mapped || !cpu_buffer->meta_page) {
	mutex_unlock(&cpu_buffer->mapping_lock);
return ERR_PTR(-ENODEV);
}
@@ -6359,12 +6362,13 @@ int ring_buffer_map(struct trace_buffer *buffer, int 
cpu,
 */
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
rb_setup_ids_meta_page(cpu_buffer, subbuf_ids);
+


Picky again. Is that a leftover from something ? I don't see an immediate reason
for the added newline.


raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
  
  	err = __rb_map_vma(cpu_buffer, vma);

if (!err) {
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
-   cpu_buffer->mapped = 1;
+   cpu_buffer->mapped++;
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
} else {
kfree(cpu_buffer->subbuf_ids);
@@ -6403,7 +6407,8 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int 
cpu)
mutex_lock(&buffer->mutex);
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
  
-	cpu_buffer->mapped = 0;

+   WARN_ON_ONCE(!cpu_buffer->mapped);
+   cpu_buffer->mapped--;


This will wrap to UINT_MAX if it was 0. Is that intentional ?

  
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
  





Re: [PATCH v14 14/14] selftests/sgx: Add scripts for EPC cgroup testing

2024-06-11 Thread Huang, Kai
On Tue, 2024-06-11 at 07:57 -0500, Haitao Huang wrote:
> On Mon, 10 Jun 2024 17:39:53 -0500, Huang, Kai  wrote:
> 
> > 
> > > --- a/arch/x86/kernel/cpu/sgx/main.c
> > > +++ b/arch/x86/kernel/cpu/sgx/main.c
> > > @@ -1045,7 +1045,7 @@ static int __init sgx_init(void)
> > >if (!sgx_page_cache_init())
> > >return -ENOMEM;
> > >  -if (!sgx_page_reclaimer_init()) {
> > > +if (!sgx_page_reclaimer_init() || !sgx_cgroup_init()) {
> > >ret = -ENOMEM;
> > >goto err_page_cache;
> > >}
> > 
> > Does it make more sense to move the sgx_cgroup_init() to the  
> > sgx_drv_init()?  The SGX cgroup only works for the driver side anyway.  
> > In this case, if something went wrong in sgx_cgroup_init(), the  
> > sgx_vepc_init() could still have a chance to work.
> > 
> 
> vepc reclamation is not done by cgroup/ksgxd, but try_charge() won't work
> if the user expects the cgroup to limit vepc allocation.
> 

Oh ok.

> Would it be more consistent to just disable vepc, i.e., on a system with
> MISC, sgx/vepc always goes with cgroup enabled?
> 

Yes fine to me.
> 


[PATCH 1/1] s390/virtio_ccw: fix config change notifications

2024-06-11 Thread Halil Pasic
Commit e3e9bda38e6d ("s390/virtio_ccw: use DMA handle from DMA API")
broke configuration change notifications for virtio-ccw by putting the
DMA address of *indicatorp directly into ccw->cda disregarding the fact
that if !!(vcdev->is_thinint) then the function
virtio_ccw_register_adapter_ind() will overwrite that ccw->cda value
with the address of the virtio_thinint_area so it can actually set up
the adapter interrupts via CCW_CMD_SET_IND_ADAPTER.  Thus we end up
pointing to the wrong object for both CCW_CMD_SET_IND if setting up the
adapter interrupts fails, and for CCW_CMD_SET_CONF_IND regardless
whether it succeeds or fails.

To fix this, let us save away the dma address of *indicatorp in a local
variable, and copy it to ccw->cda after the "vcdev->is_thinint" branch.

Reported-by: Boqiao Fu 
Reported-by: Sebastian Mitterle 
Fixes: e3e9bda38e6d ("s390/virtio_ccw: use DMA handle from DMA API")
Signed-off-by: Halil Pasic 
---
I know that checkpatch.pl complains about a missing 'Closes' tag.
Unfortunately I don't have an appropriate URL at hand. @Sebastian,
@Boqiao: do you have any suggestions?
---
 drivers/s390/virtio/virtio_ccw.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c
index d7569f395559..d6491fc84e8c 100644
--- a/drivers/s390/virtio/virtio_ccw.c
+++ b/drivers/s390/virtio/virtio_ccw.c
@@ -698,6 +698,7 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, 
unsigned nvqs,
dma64_t *indicatorp = NULL;
int ret, i, queue_idx = 0;
struct ccw1 *ccw;
+   dma32_t indicatorp_dma = 0;
 
ccw = ccw_device_dma_zalloc(vcdev->cdev, sizeof(*ccw), NULL);
if (!ccw)
@@ -725,7 +726,7 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, 
unsigned nvqs,
*/
indicatorp = ccw_device_dma_zalloc(vcdev->cdev,
   sizeof(*indicatorp),
-   &ccw->cda);
+   &indicatorp_dma);
if (!indicatorp)
goto out;
*indicatorp = indicators_dma(vcdev);
@@ -735,6 +736,7 @@ static int virtio_ccw_find_vqs(struct virtio_device *vdev, 
unsigned nvqs,
/* no error, just fall back to legacy interrupts */
vcdev->is_thinint = false;
}
+   ccw->cda = indicatorp_dma;
if (!vcdev->is_thinint) {
/* Register queue indicators with host. */
*indicators(vcdev) = 0;

base-commit: 83a7eefedc9b56fe7bfeff13b6c7356688ffa670
-- 
2.40.1




[PATCH v4 net-next 7/7] af_packet: use sk_skb_reason_drop to free rx packets

2024-06-11 Thread Yan Zhai
Replace kfree_skb_reason with sk_skb_reason_drop and pass the receiving
socket to the tracepoint.

Reported-by: kernel test robot 
Closes: https://lore.kernel.org/r/202406011859.aacus8gv-...@intel.com/
Signed-off-by: Yan Zhai 
---
v2->v3: fixed uninitialized sk, added missing report tags.
---
 net/packet/af_packet.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index fce390887591..42d29b8a84fc 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2121,7 +2121,7 @@ static int packet_rcv(struct sk_buff *skb, struct 
net_device *dev,
  struct packet_type *pt, struct net_device *orig_dev)
 {
enum skb_drop_reason drop_reason = SKB_CONSUMED;
-   struct sock *sk;
+   struct sock *sk = NULL;
struct sockaddr_ll *sll;
struct packet_sock *po;
u8 *skb_head = skb->data;
@@ -2226,7 +2226,7 @@ static int packet_rcv(struct sk_buff *skb, struct 
net_device *dev,
skb->len = skb_len;
}
 drop:
-   kfree_skb_reason(skb, drop_reason);
+   sk_skb_reason_drop(sk, skb, drop_reason);
return 0;
 }
 
@@ -2234,7 +2234,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct 
net_device *dev,
   struct packet_type *pt, struct net_device *orig_dev)
 {
enum skb_drop_reason drop_reason = SKB_CONSUMED;
-   struct sock *sk;
+   struct sock *sk = NULL;
struct packet_sock *po;
struct sockaddr_ll *sll;
union tpacket_uhdr h;
@@ -2494,7 +2494,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct 
net_device *dev,
skb->len = skb_len;
}
 drop:
-   kfree_skb_reason(skb, drop_reason);
+   sk_skb_reason_drop(sk, skb, drop_reason);
return 0;
 
 drop_n_account:
@@ -2503,7 +2503,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct 
net_device *dev,
drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR;
 
sk->sk_data_ready(sk);
-   kfree_skb_reason(copy_skb, drop_reason);
+   sk_skb_reason_drop(sk, copy_skb, drop_reason);
goto drop_n_restore;
 }
 
-- 
2.30.2





[PATCH v4 net-next 6/7] udp: use sk_skb_reason_drop to free rx packets

2024-06-11 Thread Yan Zhai
Replace kfree_skb_reason with sk_skb_reason_drop and pass the receiving
socket to the tracepoint.

Reported-by: kernel test robot 
Closes: https://lore.kernel.org/r/202406011751.npvn0ssk-...@intel.com/
Signed-off-by: Yan Zhai 
---
v2->v3: added missing report tags
---
 net/ipv4/udp.c | 10 +-
 net/ipv6/udp.c | 10 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 189c9113fe9a..ecafb1695999 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2074,7 +2074,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct 
sk_buff *skb)
}
UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
trace_udp_fail_queue_rcv_skb(rc, sk, skb);
-   kfree_skb_reason(skb, drop_reason);
+   sk_skb_reason_drop(sk, skb, drop_reason);
return -1;
}
 
@@ -2196,7 +2196,7 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct 
sk_buff *skb)
 drop:
__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
atomic_inc(&sk->sk_drops);
-   kfree_skb_reason(skb, drop_reason);
+   sk_skb_reason_drop(sk, skb, drop_reason);
return -1;
 }
 
@@ -2383,7 +2383,7 @@ static int udp_unicast_rcv_skb(struct sock *sk, struct 
sk_buff *skb,
 int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
   int proto)
 {
-   struct sock *sk;
+   struct sock *sk = NULL;
struct udphdr *uh;
unsigned short ulen;
struct rtable *rt = skb_rtable(skb);
@@ -2460,7 +2460,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table 
*udptable,
 * Hmm.  We got an UDP packet to a port to which we
 * don't wanna listen.  Ignore it.
 */
-   kfree_skb_reason(skb, drop_reason);
+   sk_skb_reason_drop(sk, skb, drop_reason);
return 0;
 
 short_packet:
@@ -2485,7 +2485,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table 
*udptable,
__UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
 drop:
__UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
-   kfree_skb_reason(skb, drop_reason);
+   sk_skb_reason_drop(sk, skb, drop_reason);
return 0;
 }
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index c81a07ac0463..b56f0b9f4307 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -673,7 +673,7 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct 
sk_buff *skb)
}
UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
trace_udp_fail_queue_rcv_skb(rc, sk, skb);
-   kfree_skb_reason(skb, drop_reason);
+   sk_skb_reason_drop(sk, skb, drop_reason);
return -1;
}
 
@@ -776,7 +776,7 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct 
sk_buff *skb)
 drop:
__UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
atomic_inc(&sk->sk_drops);
-   kfree_skb_reason(skb, drop_reason);
+   sk_skb_reason_drop(sk, skb, drop_reason);
return -1;
 }
 
@@ -940,8 +940,8 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table 
*udptable,
enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
const struct in6_addr *saddr, *daddr;
struct net *net = dev_net(skb->dev);
+   struct sock *sk = NULL;
struct udphdr *uh;
-   struct sock *sk;
bool refcounted;
u32 ulen = 0;
 
@@ -1033,7 +1033,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table 
*udptable,
__UDP6_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
 
-   kfree_skb_reason(skb, reason);
+   sk_skb_reason_drop(sk, skb, reason);
return 0;
 
 short_packet:
@@ -1054,7 +1054,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table 
*udptable,
__UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
 discard:
__UDP6_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
-   kfree_skb_reason(skb, reason);
+   sk_skb_reason_drop(sk, skb, reason);
return 0;
 }
 
-- 
2.30.2





[PATCH v4 net-next 5/7] tcp: use sk_skb_reason_drop to free rx packets

2024-06-11 Thread Yan Zhai
Replace kfree_skb_reason with sk_skb_reason_drop and pass the receiving
socket to the tracepoint.

Reported-by: kernel test robot 
Closes: https://lore.kernel.org/r/202406011539.jhwbd7dx-...@intel.com/
Signed-off-by: Yan Zhai 
---
v2->v3: added missing report tags
---
 net/ipv4/syncookies.c | 2 +-
 net/ipv4/tcp_input.c  | 2 +-
 net/ipv4/tcp_ipv4.c   | 6 +++---
 net/ipv6/syncookies.c | 2 +-
 net/ipv6/tcp_ipv6.c   | 6 +++---
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index b61d36810fe3..1948d15f1f28 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -496,6 +496,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct 
sk_buff *skb)
 out_free:
reqsk_free(req);
 out_drop:
-   kfree_skb_reason(skb, reason);
+   sk_skb_reason_drop(sk, skb, reason);
return NULL;
 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5aadf64e554d..bedb079de1f0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4859,7 +4859,7 @@ static void tcp_drop_reason(struct sock *sk, struct 
sk_buff *skb,
enum skb_drop_reason reason)
 {
sk_drops_add(sk, skb);
-   kfree_skb_reason(skb, reason);
+   sk_skb_reason_drop(sk, skb, reason);
 }
 
 /* This one checks to see if we can put data from the
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 041c7eda9abe..f7a046bc4b27 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1939,7 +1939,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 reset:
tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
 discard:
-   kfree_skb_reason(skb, reason);
+   sk_skb_reason_drop(sk, skb, reason);
/* Be careful here. If this function gets more complicated and
 * gcc suffers from register pressure on the x86, sk (in %ebx)
 * might be destroyed here. This current version compiles correctly,
@@ -2176,8 +2176,8 @@ int tcp_v4_rcv(struct sk_buff *skb)
int dif = inet_iif(skb);
const struct iphdr *iph;
const struct tcphdr *th;
+   struct sock *sk = NULL;
bool refcounted;
-   struct sock *sk;
int ret;
u32 isn;
 
@@ -2376,7 +2376,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
 discard_it:
SKB_DR_OR(drop_reason, NOT_SPECIFIED);
/* Discard frame. */
-   kfree_skb_reason(skb, drop_reason);
+   sk_skb_reason_drop(sk, skb, drop_reason);
return 0;
 
 discard_and_relse:
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index bfad1e89b6a6..9d83eadd308b 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -275,6 +275,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct 
sk_buff *skb)
 out_free:
reqsk_free(req);
 out_drop:
-   kfree_skb_reason(skb, reason);
+   sk_skb_reason_drop(sk, skb, reason);
return NULL;
 }
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 1ac7502e1bf5..93967accc35d 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1678,7 +1678,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 discard:
if (opt_skb)
__kfree_skb(opt_skb);
-   kfree_skb_reason(skb, reason);
+   sk_skb_reason_drop(sk, skb, reason);
return 0;
 csum_err:
reason = SKB_DROP_REASON_TCP_CSUM;
@@ -1751,8 +1751,8 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
int dif = inet6_iif(skb);
const struct tcphdr *th;
const struct ipv6hdr *hdr;
+   struct sock *sk = NULL;
bool refcounted;
-   struct sock *sk;
int ret;
u32 isn;
struct net *net = dev_net(skb->dev);
@@ -1944,7 +1944,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff 
*skb)
 
 discard_it:
SKB_DR_OR(drop_reason, NOT_SPECIFIED);
-   kfree_skb_reason(skb, drop_reason);
+   sk_skb_reason_drop(sk, skb, drop_reason);
return 0;
 
 discard_and_relse:
-- 
2.30.2





[PATCH v4 net-next 4/7] net: raw: use sk_skb_reason_drop to free rx packets

2024-06-11 Thread Yan Zhai
Replace kfree_skb_reason with sk_skb_reason_drop and pass the receiving
socket to the tracepoint.

Signed-off-by: Yan Zhai 
---
 net/ipv4/raw.c | 4 ++--
 net/ipv6/raw.c | 8 
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 1a0953650356..474dfd263c8b 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -301,7 +301,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
 
ipv4_pktinfo_prepare(sk, skb, true);
if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
-   kfree_skb_reason(skb, reason);
+   sk_skb_reason_drop(sk, skb, reason);
return NET_RX_DROP;
}
 
@@ -312,7 +312,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
 {
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
atomic_inc(&sk->sk_drops);
-   kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY);
+   sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY);
return NET_RX_DROP;
}
nf_reset_ct(skb);
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index f838366e8256..608fa9d05b55 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -362,14 +362,14 @@ static inline int rawv6_rcv_skb(struct sock *sk, struct 
sk_buff *skb)
if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) &&
skb_checksum_complete(skb)) {
atomic_inc(&sk->sk_drops);
-   kfree_skb_reason(skb, SKB_DROP_REASON_SKB_CSUM);
+   sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM);
return NET_RX_DROP;
}
 
/* Charge it to the socket. */
skb_dst_drop(skb);
if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
-   kfree_skb_reason(skb, reason);
+   sk_skb_reason_drop(sk, skb, reason);
return NET_RX_DROP;
}
 
@@ -390,7 +390,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
 
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) {
atomic_inc(&sk->sk_drops);
-   kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY);
+   sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY);
return NET_RX_DROP;
}
nf_reset_ct(skb);
@@ -415,7 +415,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
if (inet_test_bit(HDRINCL, sk)) {
if (skb_checksum_complete(skb)) {
atomic_inc(&sk->sk_drops);
-   kfree_skb_reason(skb, SKB_DROP_REASON_SKB_CSUM);
+   sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM);
return NET_RX_DROP;
}
}
-- 
2.30.2





[PATCH v4 net-next 3/7] ping: use sk_skb_reason_drop to free rx packets

2024-06-11 Thread Yan Zhai
Replace kfree_skb_reason with sk_skb_reason_drop and pass the receiving
socket to the tracepoint.

Signed-off-by: Yan Zhai 
---
 net/ipv4/ping.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 823306487a82..619ddc087957 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -946,7 +946,7 @@ static enum skb_drop_reason __ping_queue_rcv_skb(struct 
sock *sk,
pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n",
 inet_sk(sk), inet_sk(sk)->inet_num, skb);
if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
-   kfree_skb_reason(skb, reason);
+   sk_skb_reason_drop(sk, skb, reason);
pr_debug("ping_queue_rcv_skb -> failed\n");
return reason;
}
-- 
2.30.2





[PATCH v4 net-next 2/7] net: introduce sk_skb_reason_drop function

2024-06-11 Thread Yan Zhai
The long-used destructors kfree_skb and kfree_skb_reason do not pass the
receiving socket to the packet drop tracepoint trace_kfree_skb.
This makes it hard to track packet drops of a certain netns (container)
or a socket (user application).

The naming of these destructors is also not consistent with most sk/skb
operating functions, i.e. functions named "sk_xxx" or "skb_xxx".
Introduce a new function, sk_skb_reason_drop, as a drop-in replacement for
kfree_skb_reason on the local receiving path. Callers can now pass the
receiving socket to the tracepoint.

kfree_skb and kfree_skb_reason are still usable but they are now just
inline helpers that call sk_skb_reason_drop.

Note it is not feasible to do the same to consume_skb. Packets not
dropped can flow through multiple receive handlers, and have multiple
receiving sockets. Leave it untouched for now.

Suggested-by: Eric Dumazet 
Signed-off-by: Yan Zhai 
---
v1->v2: changes function names to be more consistent with common sk/skb
operations
---
 include/linux/skbuff.h | 10 --
 net/core/skbuff.c  | 22 --
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index fe7d8dbef77e..c479a2515a62 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1251,8 +1251,14 @@ static inline bool skb_data_unref(const struct sk_buff 
*skb,
return true;
 }
 
-void __fix_address
-kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason);
+void __fix_address sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb,
+ enum skb_drop_reason reason);
+
+static inline void
+kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
+{
+   sk_skb_reason_drop(NULL, skb, reason);
+}
 
 /**
  * kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2854afdd713f..9def11fe42c4 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1190,7 +1190,8 @@ void __kfree_skb(struct sk_buff *skb)
 EXPORT_SYMBOL(__kfree_skb);
 
 static __always_inline
-bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
+bool __sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb,
+ enum skb_drop_reason reason)
 {
if (unlikely(!skb_unref(skb)))
return false;
@@ -1203,26 +1204,27 @@ bool __kfree_skb_reason(struct sk_buff *skb, enum 
skb_drop_reason reason)
if (reason == SKB_CONSUMED)
trace_consume_skb(skb, __builtin_return_address(0));
else
-   trace_kfree_skb(skb, __builtin_return_address(0), reason, NULL);
+   trace_kfree_skb(skb, __builtin_return_address(0), reason, sk);
return true;
 }
 
 /**
- * kfree_skb_reason - free an sk_buff with special reason
+ * sk_skb_reason_drop - free an sk_buff with special reason
+ * @sk: the socket to receive @skb, or NULL if not applicable
  * @skb: buffer to free
  * @reason: reason why this skb is dropped
  *
- * Drop a reference to the buffer and free it if the usage count has
- * hit zero. Meanwhile, pass the drop reason to 'kfree_skb'
- * tracepoint.
+ * Drop a reference to the buffer and free it if the usage count has hit
+ * zero. Meanwhile, pass the receiving socket and drop reason to
+ * 'kfree_skb' tracepoint.
  */
 void __fix_address
-kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
+sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason 
reason)
 {
-   if (__kfree_skb_reason(skb, reason))
+   if (__sk_skb_reason_drop(sk, skb, reason))
__kfree_skb(skb);
 }
-EXPORT_SYMBOL(kfree_skb_reason);
+EXPORT_SYMBOL(sk_skb_reason_drop);
 
 #define KFREE_SKB_BULK_SIZE16
 
@@ -1261,7 +1263,7 @@ kfree_skb_list_reason(struct sk_buff *segs, enum 
skb_drop_reason reason)
while (segs) {
struct sk_buff *next = segs->next;
 
-   if (__kfree_skb_reason(segs, reason)) {
+   if (__sk_skb_reason_drop(NULL, segs, reason)) {
skb_poison_list(segs);
kfree_skb_add_bulk(segs, &sa, reason);
}
-- 
2.30.2





[PATCH v4 net-next 1/7] net: add rx_sk to trace_kfree_skb

2024-06-11 Thread Yan Zhai
An skb does not include enough information to find out the receiving
socket/service and netns/container on packet drops. In theory
skb->dev tells the netns, but it can get cleared/reused, e.g. by the TCP
stack for OOO packet lookup. Similarly, skb->sk often identifies a local
sender, and tells nothing about the receiver.

Allow passing an extra receiving socket to the tracepoint to improve
the visibility on receiving drops.

Signed-off-by: Yan Zhai 
---
v3->v4: adjusted the TP_STRUCT field order to be consistent
v2->v3: fixed drop_monitor function prototype
---
 include/trace/events/skb.h | 11 +++
 net/core/dev.c |  2 +-
 net/core/drop_monitor.c|  9 ++---
 net/core/skbuff.c  |  2 +-
 4 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
index 07e0715628ec..3e9ea1cca6f2 100644
--- a/include/trace/events/skb.h
+++ b/include/trace/events/skb.h
@@ -24,13 +24,14 @@ DEFINE_DROP_REASON(FN, FN)
 TRACE_EVENT(kfree_skb,
 
TP_PROTO(struct sk_buff *skb, void *location,
-enum skb_drop_reason reason),
+enum skb_drop_reason reason, struct sock *rx_sk),
 
-   TP_ARGS(skb, location, reason),
+   TP_ARGS(skb, location, reason, rx_sk),
 
TP_STRUCT__entry(
__field(void *, skbaddr)
__field(void *, location)
+   __field(void *, rx_skaddr)
__field(unsigned short, protocol)
__field(enum skb_drop_reason,   reason)
),
@@ -38,12 +39,14 @@ TRACE_EVENT(kfree_skb,
TP_fast_assign(
__entry->skbaddr = skb;
__entry->location = location;
+   __entry->rx_skaddr = rx_sk;
__entry->protocol = ntohs(skb->protocol);
__entry->reason = reason;
),
 
-   TP_printk("skbaddr=%p protocol=%u location=%pS reason: %s",
- __entry->skbaddr, __entry->protocol, __entry->location,
+   TP_printk("skbaddr=%p rx_skaddr=%p protocol=%u location=%pS reason: %s",
+ __entry->skbaddr, __entry->rx_skaddr, __entry->protocol,
+ __entry->location,
  __print_symbolic(__entry->reason,
   DEFINE_DROP_REASON(FN, FNe)))
 );
diff --git a/net/core/dev.c b/net/core/dev.c
index 85fe8138f3e4..7844227ecbfd 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5233,7 +5233,7 @@ static __latent_entropy void net_tx_action(struct 
softirq_action *h)
trace_consume_skb(skb, net_tx_action);
else
trace_kfree_skb(skb, net_tx_action,
-   get_kfree_skb_cb(skb)->reason);
+   get_kfree_skb_cb(skb)->reason, NULL);
 
if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
__kfree_skb(skb);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 430ed18f8584..2e0ae3328232 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -109,7 +109,8 @@ static u32 net_dm_queue_len = 1000;
 struct net_dm_alert_ops {
void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb,
void *location,
-   enum skb_drop_reason reason);
+   enum skb_drop_reason reason,
+   struct sock *rx_sk);
void (*napi_poll_probe)(void *ignore, struct napi_struct *napi,
int work, int budget);
void (*work_item_func)(struct work_struct *work);
@@ -264,7 +265,8 @@ static void trace_drop_common(struct sk_buff *skb, void 
*location)
 
 static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb,
void *location,
-   enum skb_drop_reason reason)
+   enum skb_drop_reason reason,
+   struct sock *rx_sk)
 {
trace_drop_common(skb, location);
 }
@@ -491,7 +493,8 @@ static const struct net_dm_alert_ops 
net_dm_alert_summary_ops = {
 static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
  struct sk_buff *skb,
  void *location,
- enum skb_drop_reason reason)
+ enum skb_drop_reason reason,
+ struct sock *rx_sk)
 {
ktime_t tstamp = ktime_get_real();
struct per_cpu_dm_data *data;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 466999a7515e..2854afdd713f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1203,7 +1203,7 @@ bool __kfree_skb_reason(struct sk_buff *skb, enum 
skb_drop_reason reason)
if (reason == 

[PATCH v4 net-next 0/7] net: pass receive socket to drop tracepoint

2024-06-11 Thread Yan Zhai
We set up our production packet drop monitoring around the kfree_skb
tracepoint. While this tracepoint is extremely valuable for diagnosing
critical problems, it also has some limitation with drops on the local
receive path: this tracepoint can only inspect the dropped skb itself,
but such skb might not carry enough information to:

1. determine in which netns/container this skb gets dropped
2. determine by which socket/service this skb ought to be received

The 1st issue is because skb->dev is the only member field with valid
netns reference. But skb->dev can get cleared or reused. For example,
tcp_v4_rcv() will clear skb->dev, and in later processing it might be
reused for the OFO tree.

The 2nd issue is because there is no reference on an skb that reliably
points to a receiving socket. skb->sk usually points to the local
sending socket, and it only points to a receive socket briefly after
early demux stage, yet the socket can get stolen later. For certain drop
reason like TCP OFO_MERGE, Zerowindow, UDP at PROTO_MEM error, etc, it
is hard to infer which receiving socket is impacted. This cannot be
overcome by simply looking at the packet header, because of
complications like sk lookup programs. In the past, single purpose
tracepoints like trace_udp_fail_queue_rcv_skb, trace_sock_rcvqueue_full,
etc are added as needed to provide more visibility. This could be
handled in a more generic way.

In this change set we propose a new 'sk_skb_reason_drop' call as a drop-in
replacement for kfree_skb_reason at various local input path. It accepts
an extra receiving socket argument. Both issues above can be resolved
via this new argument.
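
As an illustration of the kind of consumer this enables, a module could
key drop counts by the receiver's netns (a sketch only; the probe body is
an assumption, not part of this series):

```
/* Sketch: a tracepoint probe using the extended four-argument signature. */
static void probe_kfree_skb(void *ignore, struct sk_buff *skb,
			    void *location, enum skb_drop_reason reason,
			    struct sock *rx_sk)
{
	if (rx_sk)
		pr_debug("drop reason %d in netns cookie %llu\n",
			 reason, sock_net(rx_sk)->net_cookie);
}

/* registered with: register_trace_kfree_skb(probe_kfree_skb, NULL); */
```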

V3->V4: adjusted the TP_STRUCT field order to align better, suggested by
Steven Rostedt.

V2->V3: fixed drop_monitor function signatures; fixed a few uninitialized sks;
Added a few missing report tags from test bots (also noticed by Dan
Carpenter and Simon Horman).

V1->V2: instead of using skb->cb, directly add the needed argument to
trace_kfree_skb tracepoint. Also renamed functions as Eric Dumazet
suggested.

V3: https://lore.kernel.org/netdev/cover.1717529533.git@cloudflare.com/
V2: 
https://lore.kernel.org/linux-kernel/cover.1717206060.git@cloudflare.com/
V1: https://lore.kernel.org/netdev/cover.1717105215.git@cloudflare.com/

Yan Zhai (7):
  net: add rx_sk to trace_kfree_skb
  net: introduce sk_skb_reason_drop function
  ping: use sk_skb_reason_drop to free rx packets
  net: raw: use sk_skb_reason_drop to free rx packets
  tcp: use sk_skb_reason_drop to free rx packets
  udp: use sk_skb_reason_drop to free rx packets
  af_packet: use sk_skb_reason_drop to free rx packets

 include/linux/skbuff.h | 10 --
 include/trace/events/skb.h | 11 +++
 net/core/dev.c |  2 +-
 net/core/drop_monitor.c|  9 ++---
 net/core/skbuff.c  | 22 --
 net/ipv4/ping.c|  2 +-
 net/ipv4/raw.c |  4 ++--
 net/ipv4/syncookies.c  |  2 +-
 net/ipv4/tcp_input.c   |  2 +-
 net/ipv4/tcp_ipv4.c|  6 +++---
 net/ipv4/udp.c | 10 +-
 net/ipv6/raw.c |  8 
 net/ipv6/syncookies.c  |  2 +-
 net/ipv6/tcp_ipv6.c|  6 +++---
 net/ipv6/udp.c | 10 +-
 net/packet/af_packet.c | 10 +-
 16 files changed, 65 insertions(+), 51 deletions(-)

-- 
2.30.2





Re: [PATCH v5 2/2] misc: fastrpc: use coherent pool for untranslated Compute Banks

2024-06-11 Thread Dmitry Baryshkov
On Fri, May 24, 2024 at 06:14:03PM +0200, Dylan Van Assche wrote:
> Use fastrpc_remote_heap_alloc to allocate from the FastRPC device
> instead of the Compute Bank when the session ID is 0. This ensures
> that the allocation is inside the coherent DMA pool which is already
> accessible to the DSP. This is necessary to support FastRPC devices
> which do not have dedicated Compute Banks such as the SLPI on the SDM845.
> The latter uses an allocated CMA region instead of FastRPC Compute Banks.
> 
> Signed-off-by: Dylan Van Assche 
> Reviewed-by: Caleb Connolly 
> ---
>  drivers/misc/fastrpc.c | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 

Reviewed-by: Dmitry Baryshkov 


-- 
With best wishes
Dmitry



Re: [PATCH v5 1/2] misc: fastrpc: support complete DMA pool access to the DSP

2024-06-11 Thread Dmitry Baryshkov
On Fri, May 24, 2024 at 06:14:02PM +0200, Dylan Van Assche wrote:
> To support FastRPC Context Banks which aren't mapped via the SMMU,
> make the whole reserved memory region available to the DSP to allow
> access to coherent buffers.
> 
> This is performed by assigning the memory to the DSP via a hypervisor
> call to set the correct permissions for the Virtual Machines on the DSP.
> This is only necessary when a memory region is provided for SLPI DSPs
> so guard this with a domain ID check.
> 
> Signed-off-by: Dylan Van Assche 
> Reviewed-by: Caleb Connolly 
> ---
>  drivers/misc/fastrpc.c | 19 +++
>  1 file changed, 19 insertions(+)
> 
> diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c
> index 4c67e2c5a82e..c06667b29055 100644
> --- a/drivers/misc/fastrpc.c
> +++ b/drivers/misc/fastrpc.c
> @@ -2255,6 +2255,8 @@ static int fastrpc_rpmsg_probe(struct rpmsg_device 
> *rpdev)
>   int i, err, domain_id = -1, vmcount;
>   const char *domain;
>   bool secure_dsp;
> + struct device_node *rmem_node;
> + struct reserved_mem *rmem;
>   unsigned int vmids[FASTRPC_MAX_VMIDS];
>  
> >   err = of_property_read_string(rdev->of_node, "label", &domain);
> @@ -2297,6 +2299,23 @@ static int fastrpc_rpmsg_probe(struct rpmsg_device 
> *rpdev)
>   }
>   }
>  
> + rmem_node = of_parse_phandle(rdev->of_node, "memory-region", 0);
> + if (domain_id == SDSP_DOMAIN_ID && rmem_node) {

I think we can drop the domain_id check here.

The rest looks good to me.

Reviewed-by: Dmitry Baryshkov 


> + u64 src_perms;
> +
> + rmem = of_reserved_mem_lookup(rmem_node);
> + if (!rmem) {
> + err = -EINVAL;
> + goto fdev_error;
> + }
> +
> + src_perms = BIT(QCOM_SCM_VMID_HLOS);
> +
> > + qcom_scm_assign_mem(rmem->base, rmem->size, &src_perms,
> + data->vmperms, data->vmcount);
> +
> + }
> +
>   secure_dsp = !(of_property_read_bool(rdev->of_node, 
> "qcom,non-secure-domain"));
>   data->secure = secure_dsp;
>  
> -- 
> 2.45.1
> 

-- 
With best wishes
Dmitry



[PATCH v4 13/13] tracing: Add last boot delta offset for stack traces

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

The addresses of a stack trace event are resolved via kallsyms. As those
can change between boots, when printing a stack trace from a buffer that
was saved by the last boot, every address needs to have "text_delta" added
to it, the delta between the addresses of the functions in the current
boot and their addresses in the last boot. The adjusted address can then
be passed to kallsyms to find the function name; otherwise the output is
just a useless list of addresses.

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/trace_output.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b9d2c64c0648..48de93598897 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1233,6 +1233,7 @@ static enum print_line_t trace_stack_print(struct 
trace_iterator *iter,
struct trace_seq *s = &iter->seq;
unsigned long *p;
unsigned long *end;
+   long delta = iter->tr->text_delta;
 
trace_assign_type(field, iter->ent);
end = (unsigned long *)((long)iter->ent + iter->ent_size);
@@ -1245,7 +1246,7 @@ static enum print_line_t trace_stack_print(struct 
trace_iterator *iter,
break;
 
trace_seq_puts(s, " => ");
-   seq_print_ip_sym(s, *p, flags);
+   seq_print_ip_sym(s, (*p) + delta, flags);
trace_seq_putc(s, '\n');
}
 
-- 
2.43.0





[PATCH v4 11/13] tracing: Handle old buffer mappings for event strings and functions

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Use the saved text_delta and data_delta of a persistent memory mapped ring
buffer that survives from a previous boot, and apply the delta in the
trace event print output so that strings and functions show up normally.

That is, for an event like trace_kmalloc() that prints the callsite via
"%pS", the address saved in the ring buffer will not match the function
from the previous boot if the kernel remaps itself between boots.

For RCU events that point to saved static strings where only the address
of the string is saved in the ring buffer, it too will be adjusted to
point to where the string is on the current boot.

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/trace.c | 42 +++---
 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dc4eee33d920..71cca10581d6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3671,8 +3671,11 @@ static void test_can_verify(void)
 void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
 va_list ap)
 {
+   long text_delta = iter->tr->text_delta;
+   long data_delta = iter->tr->data_delta;
const char *p = fmt;
const char *str;
+   bool good;
int i, j;
 
if (WARN_ON_ONCE(!fmt))
@@ -3691,7 +3694,10 @@ void trace_check_vprintf(struct trace_iterator *iter, 
const char *fmt,
 
j = 0;
 
-   /* We only care about %s and variants */
+   /*
+* We only care about %s and variants
+* as well as %p[sS] if delta is non-zero
+*/
for (i = 0; p[i]; i++) {
if (i + 1 >= iter->fmt_size) {
/*
@@ -3720,6 +3726,11 @@ void trace_check_vprintf(struct trace_iterator *iter, 
const char *fmt,
}
if (p[i+j] == 's')
break;
+
+   if (text_delta && p[i+1] == 'p' &&
+   ((p[i+2] == 's' || p[i+2] == 'S')))
+   break;
+
star = false;
}
j = 0;
@@ -3733,6 +3744,24 @@ void trace_check_vprintf(struct trace_iterator *iter, 
const char *fmt,
iter->fmt[i] = '\0';
trace_seq_vprintf(&iter->seq, iter->fmt, ap);
 
+   /* Add delta to %pS pointers */
+   if (p[i+1] == 'p') {
+   unsigned long addr;
+   char fmt[4];
+
+   fmt[0] = '%';
+   fmt[1] = 'p';
+   fmt[2] = p[i+2]; /* Either %ps or %pS */
+   fmt[3] = '\0';
+
+   addr = va_arg(ap, unsigned long);
+   addr += text_delta;
+   trace_seq_printf(&iter->seq, fmt, (void *)addr);
+
+   p += i + 3;
+   continue;
+   }
+
/*
 * If iter->seq is full, the above call no longer guarantees
 * that ap is in sync with fmt processing, and further calls
@@ -3751,6 +3780,14 @@ void trace_check_vprintf(struct trace_iterator *iter, 
const char *fmt,
/* The ap now points to the string data of the %s */
str = va_arg(ap, const char *);
 
+   good = trace_safe_str(iter, str, star, len);
+
+   /* Could be from the last boot */
+   if (data_delta && !good) {
+   str += data_delta;
+   good = trace_safe_str(iter, str, star, len);
+   }
+
/*
 * If you hit this warning, it is likely that the
 * trace event in question used %s on a string that
@@ -3760,8 +3797,7 @@ void trace_check_vprintf(struct trace_iterator *iter, 
const char *fmt,
 * instead. See samples/trace_events/trace-events-sample.h
 * for reference.
 */
-   if (WARN_ONCE(!trace_safe_str(iter, str, star, len),
- "fmt: '%s' current_buffer: '%s'",
+   if (WARN_ONCE(!good, "fmt: '%s' current_buffer: '%s'",
  fmt, seq_buf_str(&iter->seq.seq))) {
int ret;
 
-- 
2.43.0





[PATCH v4 12/13] tracing: Update function tracing output for previous boot buffer

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

For a persistent ring buffer that is saved across boots, if function
tracing was performed in the previous boot, it only saves the address of
the functions and uses "%pS" to print their names. But in the current
boot, those functions may be at different locations. The persistent
meta-data saves the text delta between the two boots, which can be used
to find where a saved function is located in the current boot.

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/trace_output.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d8b302d01083..b9d2c64c0648 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -990,8 +990,11 @@ enum print_line_t trace_nop_print(struct trace_iterator 
*iter, int flags,
 }
 
 static void print_fn_trace(struct trace_seq *s, unsigned long ip,
-  unsigned long parent_ip, int flags)
+  unsigned long parent_ip, long delta, int flags)
 {
+   ip += delta;
+   parent_ip += delta;
+
seq_print_ip_sym(s, ip, flags);
 
if ((flags & TRACE_ITER_PRINT_PARENT) && parent_ip) {
@@ -1009,7 +1012,7 @@ static enum print_line_t trace_fn_trace(struct 
trace_iterator *iter, int flags,
 
trace_assign_type(field, iter->ent);
 
-   print_fn_trace(s, field->ip, field->parent_ip, flags);
+   print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, 
flags);
trace_seq_putc(s, '\n');
 
return trace_handle_return(s);
@@ -1674,7 +1677,7 @@ trace_func_repeats_print(struct trace_iterator *iter, int 
flags,
 
trace_assign_type(field, iter->ent);
 
-   print_fn_trace(s, field->ip, field->parent_ip, flags);
+   print_fn_trace(s, field->ip, field->parent_ip, iter->tr->text_delta, 
flags);
trace_seq_printf(s, " (repeats: %u, last_ts:", field->count);
trace_print_time(s, iter,
 iter->ts - FUNC_REPEATS_GET_DELTA_TS(field));
-- 
2.43.0





[PATCH v4 10/13] tracing/ring-buffer: Add last_boot_info file to boot instance

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

If an instance is mapped to memory on boot up, create a new file called
"last_boot_info" that will hold information that can be used to properly
parse the raw data in the ring buffer.

It will export the delta of the text and data addresses relative to the
last boot. It does not expose actual addresses (unless you already knew
what the actual address was from the last boot).

The output will look like:

 # cat last_boot_info
 text delta:-268435456
 data delta:-268435456

The text and data are kept separate in case they are ever made different.
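
As a worked example (addresses invented purely for illustration): a
function address 0xffffffff91000000 recorded in the previous boot's buffer
would, with the deltas above, be looked up as

  0xffffffff91000000 + (-268435456) = 0xffffffff81000000

in the current boot's kallsyms. This is the adjustment user space tools
such as trace-cmd need to perform on trace_pipe_raw data.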

Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/ring_buffer.h |  3 +++
 kernel/trace/ring_buffer.c  | 23 ++
 kernel/trace/trace.c| 47 -
 kernel/trace/trace.h|  2 ++
 4 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index a50b0223b1d3..55de3798a9b9 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -94,6 +94,9 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned long 
size, unsigned flag
   unsigned long range_size,
   struct lock_class_key *key);
 
+bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text,
+long *data);
+
 /*
  * Because the ring buffer is generic, if other users of the ring buffer get
  * traced by ftrace, it can produce lockdep warnings. We need to keep each
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ab6b8a0ee8e1..8c1d7ea01e6f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2394,6 +2394,29 @@ struct trace_buffer *__ring_buffer_alloc_range(unsigned 
long size, unsigned flag
return alloc_buffer(size, flags, order, start, start + range_size, key);
 }
 
+/**
+ * ring_buffer_last_boot_delta - return the delta offset from last boot
+ * @buffer: The buffer to return the delta from
+ * @text: Return text delta
+ * @data: Return data delta
+ *
+ * Returns: true if the delta is non-zero
+ */
+bool ring_buffer_last_boot_delta(struct trace_buffer *buffer, long *text,
+long *data)
+{
+   if (!buffer)
+   return false;
+
+   if (!buffer->last_text_delta)
+   return false;
+
+   *text = buffer->last_text_delta;
+   *data = buffer->last_data_delta;
+
+   return true;
+}
+
 /**
  * ring_buffer_free - free a ring buffer.
  * @buffer: the buffer to free.
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dfde26aa3211..dc4eee33d920 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -6041,6 +6041,18 @@ ssize_t tracing_resize_ring_buffer(struct trace_array 
*tr,
return ret;
 }
 
+static void update_last_data(struct trace_array *tr)
+{
+   if (!tr->text_delta && !tr->data_delta)
+   return;
+
+   /* Clear old data */
+   tracing_reset_online_cpus(&tr->array_buffer);
+
+   /* Using current data now */
+   tr->text_delta = 0;
+   tr->data_delta = 0;
+}
 
 /**
  * tracing_update_buffers - used by tracing facility to expand ring buffers
@@ -6058,6 +6070,9 @@ int tracing_update_buffers(struct trace_array *tr)
int ret = 0;
 
mutex_lock(&trace_types_lock);
+
+   update_last_data(tr);
+
if (!tr->ring_buffer_expanded)
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
@@ -6113,6 +6128,8 @@ int tracing_set_tracer(struct trace_array *tr, const char 
*buf)
 
mutex_lock(&trace_types_lock);
 
+   update_last_data(tr);
+
if (!tr->ring_buffer_expanded) {
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
@@ -6860,6 +6877,21 @@ tracing_total_entries_read(struct file *filp, char 
__user *ubuf,
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
+static ssize_t
+tracing_last_boot_read(struct file *filp, char __user *ubuf, size_t cnt, 
loff_t *ppos)
+{
+   struct trace_array *tr = filp->private_data;
+   struct seq_buf seq;
+   char buf[64];
+
+   seq_buf_init(&seq, buf, 64);
+
+   seq_buf_printf(&seq, "text delta:\t%ld\n", tr->text_delta);
+   seq_buf_printf(&seq, "data delta:\t%ld\n", tr->data_delta);
+
+   return simple_read_from_buffer(ubuf, cnt, ppos, buf, 
seq_buf_used(&seq));
+}
+
 static int tracing_buffer_meta_open(struct inode *inode, struct file *filp)
 {
struct trace_array *tr = inode->i_private;
@@ -7499,6 +7531,13 @@ static const struct file_operations 
trace_time_stamp_mode_fops = {
.release= tracing_single_release_tr,
 };
 
+static const struct file_operations last_boot_fops = {
+   .open   = tracing_open_generic_tr,
+

[PATCH v4 08/13] tracing: Add option to use memmapped memory for trace boot instance

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Add an option to the trace_instance kernel command line parameter that
allows it to use the reserved memory from memmap boot parameter.

  memmap=12M$0x28450 trace_instance=boot_mapped@0x28450:12M

The above reserves 12 megs at the physical address 0x28450.
The second parameter will create a "boot_mapped" instance and use the
memory reserved as the memory for the ring buffer.

That will create an instance called "boot_mapped":

  /sys/kernel/tracing/instances/boot_mapped

Note, because the ring buffer is using a defined memory range, it will
act just like a memory mapped ring buffer. It will not have a snapshot
buffer, as it can't swap out the buffer. The snapshot files as well as any
tracers that use a snapshot will not be present in the boot_mapped
instance.

Cc: linux...@kvack.org
Signed-off-by: Steven Rostedt (Google) 
---
 .../admin-guide/kernel-parameters.txt |  9 +++
 kernel/trace/trace.c  | 75 +--
 2 files changed, 78 insertions(+), 6 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index b600df82669d..ff26b6094e79 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6754,6 +6754,15 @@
the same thing would happen if it was left off). The 
irq_handler_entry
event, and all events under the "initcall" system.
 
+   If memory has been reserved (see memmap for x86), the 
instance
+   can use that memory:
+
+   memmap=12M$0x28450 
trace_instance=boot_map@0x28450:12M
+
+   The above will create a "boot_map" instance that uses 
the physical
+   memory at 0x28450 that is 12Megs. The per CPU 
buffers of that
+   instance will be split up accordingly.
+
trace_options=[option-list]
[FTRACE] Enable or disable tracer options at boot.
The option-list is a comma delimited list of options
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 622fe670949d..dfde26aa3211 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -9504,6 +9504,31 @@ static int instance_mkdir(const char *name)
return ret;
 }
 
+static u64 map_pages(u64 start, u64 size)
+{
+   struct page **pages;
+   phys_addr_t page_start;
+   unsigned int page_count;
+   unsigned int i;
+   void *vaddr;
+
+   page_count = DIV_ROUND_UP(size, PAGE_SIZE);
+
+   page_start = start;
+   pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL);
+   if (!pages)
+   return 0;
+
+   for (i = 0; i < page_count; i++) {
+   phys_addr_t addr = page_start + i * PAGE_SIZE;
+   pages[i] = pfn_to_page(addr >> PAGE_SHIFT);
+   }
+   vaddr = vmap(pages, page_count, VM_MAP, PAGE_KERNEL);
+   kfree(pages);
+
+   return (u64)(unsigned long)vaddr;
+}
+
 /**
  * trace_array_get_by_name - Create/Lookup a trace array, given its name.
  * @name: The name of the trace array to be looked up/created.
@@ -10350,6 +10375,7 @@ __init static void enable_instances(void)
 {
struct trace_array *tr;
char *curr_str;
+   char *name;
char *str;
char *tok;
 
@@ -10358,19 +10384,56 @@ __init static void enable_instances(void)
str = boot_instance_info;
 
while ((curr_str = strsep(&str, "\t"))) {
+   unsigned long start = 0;
+   unsigned long size = 0;
+   unsigned long addr = 0;
 
tok = strsep(&curr_str, ",");
+   name = strsep(&tok, "@");
+   if (tok) {
+   start = memparse(tok, &tok);
+   if (!start) {
+   pr_warn("Tracing: Invalid boot instance address 
for %s\n",
+   name);
+   continue;
+   }
+   }
 
-   if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
-   do_allocate_snapshot(tok);
+   if (start) {
+   if (*tok != ':') {
+   pr_warn("Tracing: No size specified for 
instance %s\n", name);
+   continue;
+   }
+   tok++;
+   size = memparse(tok, &tok);
+   if (!size) {
+   pr_warn("Tracing: Invalid boot instance size 
for %s\n",
+   name);
+   continue;
+   }
+   addr = map_pages(start, size);
+   if (addr) {
+   pr_info("Tracing: mapped boot instance %s at 

[PATCH v4 09/13] ring-buffer: Save text and data locations in mapped meta data

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

When a ring buffer is mapped to a specific address, save the address of a
text function and some data. This will be used to determine the delta
between the last boot and the current boot for pointers to functions as
well as to data.
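
The idea in miniature (a sketch with illustrative names; the actual
anchors used by the patch are a ring buffer function and a static string,
as seen in the diff below):

	/* each boot records where its anchor symbols landed */
	meta->text_addr = (unsigned long)anchor_function;
	meta->data_addr = (unsigned long)anchor_string;

	/* a later boot subtracts the saved anchors from its own copies */
	buffer->last_text_delta = (long)this_text - (long)meta->text_addr;
	buffer->last_data_delta = (long)this_data - (long)meta->data_addr;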

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/ring_buffer.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 86f03c0ba4c0..ab6b8a0ee8e1 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -45,6 +45,8 @@
 static void update_pages_handler(struct work_struct *work);
 
 struct ring_buffer_meta {
+   unsigned long   text_addr;
+   unsigned long   data_addr;
unsigned long   first_buffer;
unsigned long   head_buffer;
unsigned long   commit_buffer;
@@ -541,6 +543,9 @@ struct trace_buffer {
unsigned long   range_addr_start;
unsigned long   range_addr_end;
 
+   longlast_text_delta;
+   longlast_data_delta;
+
unsigned intsubbuf_size;
unsigned intsubbuf_order;
unsigned intmax_data_size;
@@ -1819,10 +1824,15 @@ static void rb_meta_validate_events(struct 
ring_buffer_per_cpu *cpu_buffer)
}
 }
 
+/* Used to calculate data delta */
+static char rb_data_ptr[] = "";
+
 static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
 {
struct ring_buffer_meta *meta;
unsigned long delta;
+   unsigned long this_text = (unsigned long)rb_range_meta_init;
+   unsigned long this_data = (unsigned long)rb_data_ptr;
void *subbuf;
int cpu;
int i;
@@ -1839,6 +1849,10 @@ static void rb_range_meta_init(struct trace_buffer 
*buffer, int nr_pages)
meta->first_buffer += delta;
meta->head_buffer += delta;
meta->commit_buffer += delta;
+   buffer->last_text_delta = this_text - meta->text_addr;
+   buffer->last_data_delta = this_data - meta->data_addr;
+   meta->text_addr = this_text;
+   meta->data_addr = this_data;
continue;
}
 
@@ -1855,6 +1869,8 @@ static void rb_range_meta_init(struct trace_buffer 
*buffer, int nr_pages)
subbuf = rb_subbufs_from_meta(meta);
 
meta->first_buffer = (unsigned long)subbuf;
+   meta->text_addr = this_text;
+   meta->data_addr = this_data;
 
/*
 * The buffers[] array holds the order of the sub-buffers
-- 
2.43.0





[PATCH v4 07/13] ring-buffer: Validate boot range memory events

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Make sure all the events in each of the sub-buffers that were mapped in a
memory region are valid. This moves the code that walks the buffers for
time-stamp validation out of the CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS
ifdef block and is used to validate the content. Only the ring buffer
event meta data and time stamps are checked and not the data load.

This also has a second purpose. The buffer_page structure that points to
the data sub-buffers has accounting that keeps track of the number of
events that are on the sub-buffer. This updates that counter as well. That
counter is used when reading the buffer to know if the ring buffer is
empty or not.

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/ring_buffer.c | 189 +
 1 file changed, 151 insertions(+), 38 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index aecd4a7d62be..86f03c0ba4c0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1674,10 +1674,151 @@ static bool rb_meta_valid(struct ring_buffer_meta 
*meta, int cpu,
subbuf = (void *)subbuf + subbuf_size;
}
 
-   pr_info("Ring buffer meta is from previous boot!\n");
return true;
 }
 
+static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf);
+
+static int rb_read_data_buffer(struct buffer_data_page *dpage, int tail, int 
cpu,
+  unsigned long long *timestamp, u64 *delta_ptr)
+{
+   struct ring_buffer_event *event;
+   u64 ts, delta;
+   int events = 0;
+   int e;
+
+   *delta_ptr = 0;
+   *timestamp = 0;
+
+   ts = dpage->time_stamp;
+
+   for (e = 0; e < tail; e += rb_event_length(event)) {
+
+   event = (struct ring_buffer_event *)(dpage->data + e);
+
+   switch (event->type_len) {
+
+   case RINGBUF_TYPE_TIME_EXTEND:
+   delta = rb_event_time_stamp(event);
+   ts += delta;
+   break;
+
+   case RINGBUF_TYPE_TIME_STAMP:
+   delta = rb_event_time_stamp(event);
+   if (delta < ts) {
+   *delta_ptr = delta;
+   *timestamp = ts;
+   return -1;
+   }
+   ts = delta;
+   break;
+
+   case RINGBUF_TYPE_PADDING:
+   if (event->time_delta == 1)
+   break;
+   fallthrough;
+   case RINGBUF_TYPE_DATA:
+   events++;
+   ts += event->time_delta;
+   break;
+
+   default:
+   return -1;
+   }
+   }
+   *timestamp = ts;
+   return events;
+}
+
+static int rb_validate_buffer(struct buffer_data_page *dpage, int cpu)
+{
+   unsigned long long ts;
+   u64 delta;
+   int tail;
+
+   tail = local_read(&dpage->commit);
+   return rb_read_data_buffer(dpage, tail, cpu, &ts, &delta);
+}
+
+/* If the meta data has been validated, now validate the events */
+static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
+{
+   struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+   struct buffer_page *head_page;
+   unsigned long entry_bytes = 0;
+   unsigned long entries = 0;
+   int ret;
+   int i;
+
+   if (!meta || !meta->head_buffer)
+   return;
+
+   /* Do the reader page first */
+   ret = rb_validate_buffer(cpu_buffer->reader_page->page, 
cpu_buffer->cpu);
+   if (ret < 0) {
+   pr_info("Ring buffer reader page is invalid\n");
+   goto invalid;
+   }
+   entries += ret;
+   entry_bytes += local_read(&cpu_buffer->reader_page->page->commit);
+   local_set(&cpu_buffer->reader_page->entries, ret);
+
+   head_page = cpu_buffer->head_page;
+
+   /* If both the head and commit are on the reader_page then we are done. 
*/
+   if (head_page == cpu_buffer->reader_page &&
+   head_page == cpu_buffer->commit_page)
+   goto done;
+
+   /* Iterate until finding the commit page */
+   for (i = 0; i < meta->nr_subbufs + 1; i++, rb_inc_page(&head_page)) {
+
+   /* Reader page has already been done */
+   if (head_page == cpu_buffer->reader_page)
+   continue;
+
+   ret = rb_validate_buffer(head_page->page, cpu_buffer->cpu);
+   if (ret < 0) {
+   pr_info("Ring buffer meta [%d] invalid buffer page\n",
+   cpu_buffer->cpu);
+   goto invalid;
+   }
+   entries += ret;
+   entry_bytes += local_read(&head_page->page->commit);
+   local_set(&cpu_buffer->head_page->entries, ret);
+
+  

[PATCH v4 06/13] ring-buffer: Add test if range of boot buffer is valid

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Add a test against the ring buffer memory range to see if it has valid
data. The ring_buffer_meta structure is given a new field called
"first_buffer" which holds the address of the first sub-buffer. This is
used both to determine if the other fields are valid and to find the
offset between the old addresses of the sub-buffers from the previous
boot and the new addresses of the current boot.

Since the values for nr_subbufs and subbuf_size are expected to be the
same, check that the values in the meta page match the values calculated.

Take the range of the first_buffer and the total size of all the buffers
and make sure the saved head_buffer and commit_buffer fall in the range.

Iterate through all the sub-buffers to make sure that the value in each
sub-buffer's "commit" field (the field that holds the amount of data on
the sub-buffer) is within the end of the sub-buffer. Also check the index
array to make sure that all the indexes are within nr_subbufs.

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/ring_buffer.c | 143 ++---
 1 file changed, 135 insertions(+), 8 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 093c73c617cc..aecd4a7d62be 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -45,6 +45,7 @@
 static void update_pages_handler(struct work_struct *work);
 
 struct ring_buffer_meta {
+   unsigned long   first_buffer;
unsigned long   head_buffer;
unsigned long   commit_buffer;
__u32   subbuf_size;
@@ -1617,21 +1618,103 @@ static void *rb_range_buffer(struct 
ring_buffer_per_cpu *cpu_buffer, int idx)
return (void *)ptr;
 }
 
+/*
+ * See if the existing memory contains valid ring buffer data.
+ * As the previous kernel must be the same as this kernel, all
+ * the calculations (size of buffers and number of buffers)
+ * must be the same.
+ */
+static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
+ struct trace_buffer *buffer, int nr_pages)
+{
+   int subbuf_size = PAGE_SIZE;
+   struct buffer_data_page *subbuf;
+   unsigned long buffers_start;
+   unsigned long buffers_end;
+   int i;
+
+   /* The subbuffer's size and number of subbuffers must match */
+   if (meta->subbuf_size != subbuf_size ||
+   meta->nr_subbufs != nr_pages + 1) {
+   pr_info("Ring buffer boot meta [%d] mismatch of 
subbuf_size/nr_pages\n", cpu);
+   return false;
+   }
+
+   buffers_start = meta->first_buffer;
+   buffers_end = meta->first_buffer + (subbuf_size * meta->nr_subbufs);
+
+   /* Is the head and commit buffers within the range of buffers? */
+   if (meta->head_buffer < buffers_start ||
+   meta->head_buffer >= buffers_end) {
+   pr_info("Ring buffer boot meta [%d] head buffer out of 
range\n", cpu);
+   return false;
+   }
+
+   if (meta->commit_buffer < buffers_start ||
+   meta->commit_buffer >= buffers_end) {
+   pr_info("Ring buffer boot meta [%d] commit buffer out of 
range\n", cpu);
+   return false;
+   }
+
+   subbuf = rb_subbufs_from_meta(meta);
+
+   /* Do the meta buffers and the subbufs themselves have correct data? */
+   for (i = 0; i < meta->nr_subbufs; i++) {
+   if (meta->buffers[i] < 0 ||
+   meta->buffers[i] >= meta->nr_subbufs) {
+   pr_info("Ring buffer boot meta [%d] array out of 
range\n", cpu);
+   return false;
+   }
+
+   if ((unsigned)local_read(&subbuf->commit) > subbuf_size) {
+   pr_info("Ring buffer boot meta [%d] buffer invalid 
commit\n", cpu);
+   return false;
+   }
+
+   subbuf = (void *)subbuf + subbuf_size;
+   }
+
+   pr_info("Ring buffer meta is from previous boot!\n");
+   return true;
+}
+
 static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
 {
struct ring_buffer_meta *meta;
+   unsigned long delta;
void *subbuf;
int cpu;
int i;
 
for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+   void *next_meta;
+
meta = rb_range_meta(buffer, nr_pages, cpu);
 
+   if (rb_meta_valid(meta, cpu, buffer, nr_pages)) {
+   /* Make the mappings match the current address */
+   subbuf = rb_subbufs_from_meta(meta);
+   delta = (unsigned long)subbuf - meta->first_buffer;
+   meta->first_buffer += delta;
+   meta->head_buffer += delta;
+   meta->commit_buffer += delta;
+   continue;
+   }
+
+   if (cpu < nr_cpu_ids - 1)
+   next_meta = rb_range_meta(buffer, nr_pages, cpu + 1);
+   

[PATCH v4 05/13] ring-buffer: Add output of ring buffer meta page

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Add a buffer_meta per-cpu file for the trace instance that is mapped to
boot memory. This shows the current meta-data and can be used by user
space tools to record the current mappings to help reconstruct the
ring buffer after a reboot.

It does not expose any virtual addresses, just indexes into the sub-buffer
pages.
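
Hypothetical output (values invented for illustration), read from the
boot mapped instance's per_cpu directory:

 # cat instances/boot_mapped/per_cpu/cpu0/buffer_meta
 head_buffer:   5
 commit_buffer: 5
 subbuf_size:   4096
 nr_subbufs:    13
 buffer[0]:     0
 buffer[1]:     1
 [..]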

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/ring_buffer.c | 77 ++
 kernel/trace/trace.c   | 30 ++-
 kernel/trace/trace.h   |  2 +
 3 files changed, 107 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 385dc1750fc7..093c73c617cc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -32,6 +32,8 @@
 #include 
 #include 
 
+#include "trace.h"
+
 /*
  * The "absolute" timestamp in the buffer is only 59 bits.
  * If a clock has the 5 MSBs set, it needs to be saved and
@@ -1646,6 +1648,81 @@ static void rb_range_meta_init(struct trace_buffer 
*buffer, int nr_pages)
}
 }
 
+static void *rbm_start(struct seq_file *m, loff_t *pos)
+{
+   struct ring_buffer_per_cpu *cpu_buffer = m->private;
+   struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+   unsigned long val;
+
+   if (!meta)
+   return NULL;
+
+   if (*pos > meta->nr_subbufs)
+   return NULL;
+
+   val = *pos;
+   val++;
+
+   return (void *)val;
+}
+
+static void *rbm_next(struct seq_file *m, void *v, loff_t *pos)
+{
+   (*pos)++;
+
+   return rbm_start(m, pos);
+}
+
+static int rb_meta_subbuf_idx(struct ring_buffer_meta *meta, void *subbuf);
+
+static int rbm_show(struct seq_file *m, void *v)
+{
+   struct ring_buffer_per_cpu *cpu_buffer = m->private;
+   struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+   unsigned long val = (unsigned long)v;
+
+   if (val == 1) {
+   seq_printf(m, "head_buffer:   %d\n",
+  rb_meta_subbuf_idx(meta, (void *)meta->head_buffer));
+   seq_printf(m, "commit_buffer: %d\n",
+  rb_meta_subbuf_idx(meta, (void 
*)meta->commit_buffer));
+   seq_printf(m, "subbuf_size:   %d\n", meta->subbuf_size);
+   seq_printf(m, "nr_subbufs:%d\n", meta->nr_subbufs);
+   return 0;
+   }
+
+   val -= 2;
+   seq_printf(m, "buffer[%ld]:%d\n", val, meta->buffers[val]);
+
+   return 0;
+}
+
+static void rbm_stop(struct seq_file *m, void *p)
+{
+}
+
+static const struct seq_operations rb_meta_seq_ops = {
+   .start  = rbm_start,
+   .next   = rbm_next,
+   .show   = rbm_show,
+   .stop   = rbm_stop,
+};
+
+int ring_buffer_meta_seq_init(struct file *file, struct trace_buffer *buffer, 
int cpu)
+{
+   struct seq_file *m;
+   int ret;
+
+   ret = seq_open(file, &rb_meta_seq_ops);
+   if (ret)
+   return ret;
+
+   m = file->private_data;
+   m->private = buffer->buffers[cpu];
+
+   return 0;
+}
+
 static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
long nr_pages, struct list_head *pages)
 {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ff2b504fbe00..622fe670949d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5018,7 +5018,7 @@ static int show_traces_open(struct inode *inode, struct 
file *file)
return 0;
 }
 
-static int show_traces_release(struct inode *inode, struct file *file)
+static int tracing_seq_release(struct inode *inode, struct file *file)
 {
struct trace_array *tr = inode->i_private;
 
@@ -5059,7 +5059,7 @@ static const struct file_operations show_traces_fops = {
.open   = show_traces_open,
.read   = seq_read,
.llseek = seq_lseek,
-   .release= show_traces_release,
+   .release= tracing_seq_release,
 };
 
 static ssize_t
@@ -6860,6 +6860,22 @@ tracing_total_entries_read(struct file *filp, char 
__user *ubuf,
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
+static int tracing_buffer_meta_open(struct inode *inode, struct file *filp)
+{
+   struct trace_array *tr = inode->i_private;
+   int cpu = tracing_get_cpu(inode);
+   int ret;
+
+   ret = tracing_check_open_get_tr(tr);
+   if (ret)
+   return ret;
+
+   ret = ring_buffer_meta_seq_init(filp, tr->array_buffer.buffer, cpu);
+   if (ret < 0)
+   __trace_array_put(tr);
+   return ret;
+}
+
 static ssize_t
 tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
  size_t cnt, loff_t *ppos)
@@ -7436,6 +7452,13 @@ static const struct file_operations tracing_entries_fops 
= {
.release= tracing_release_generic_tr,
 };
 
+static const struct file_operations tracing_buffer_meta_fops = {
+   .open   = 

[PATCH v4 03/13] ring-buffer: Add ring_buffer_meta data

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Populate the ring_buffer_meta array. It holds the pointer to the
head_buffer (next to read), the commit_buffer (next to write), the size of
the sub-buffers, the number of sub-buffers, and an array that keeps track
of the order of the sub-buffers.

This information will be stored in the persistent memory to help on reboot
to reconstruct the ring buffer.

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/ring_buffer.c | 209 -
 1 file changed, 184 insertions(+), 25 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 53abe7916f2b..385dc1750fc7 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -43,6 +43,11 @@
 static void update_pages_handler(struct work_struct *work);
 
 struct ring_buffer_meta {
+   unsigned long   head_buffer;
+   unsigned long   commit_buffer;
+   __u32   subbuf_size;
+   __u32   nr_subbufs;
+   int buffers[];
 };
 
 /*
@@ -500,6 +505,7 @@ struct ring_buffer_per_cpu {
struct mutexmapping_lock;
unsigned long   *subbuf_ids;/* ID to subbuf VA */
struct trace_buffer_meta*meta_page;
+   struct ring_buffer_meta *ring_meta;
 
/* ring buffer pages to update, > 0 to add, < 0 to remove */
longnr_pages_to_update;
@@ -1260,6 +1266,11 @@ static void rb_head_page_activate(struct 
ring_buffer_per_cpu *cpu_buffer)
 * Set the previous list pointer to have the HEAD flag.
 */
rb_set_list_to_head(head->list.prev);
+
+   if (cpu_buffer->ring_meta) {
+   struct ring_buffer_meta *meta = cpu_buffer->ring_meta;
+   meta->head_buffer = (unsigned long)head->page;
+   }
 }
 
 static void rb_list_head_clear(struct list_head *list)
@@ -1514,51 +1525,127 @@ rb_range_align_subbuf(unsigned long addr, int 
subbuf_size, int nr_subbufs)
 }
 
 /*
- * Return a specific sub-buffer for a given @cpu defined by @idx.
+ * Return the ring_buffer_meta for a given @cpu.
  */
-static void *rb_range_buffer(struct trace_buffer *buffer, int cpu, int 
nr_pages, int idx)
+static void *rb_range_meta(struct trace_buffer *buffer, int nr_pages, int cpu)
 {
-   unsigned long ptr;
int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+   unsigned long ptr = buffer->range_addr_start;
+   struct ring_buffer_meta *meta;
int nr_subbufs;
 
-   /* Include the reader page */
-   nr_subbufs = nr_pages + 1;
+   if (!ptr)
+   return NULL;
+
+   /* When nr_pages passed in is zero, the first meta has already been 
initialized */
+   if (!nr_pages) {
+   meta = (struct ring_buffer_meta *)ptr;
+   nr_subbufs = meta->nr_subbufs;
+   } else {
+   meta = NULL;
+   /* Include the reader page */
+   nr_subbufs = nr_pages + 1;
+   }
 
/*
 * The first chunk may not be subbuffer aligned, where as
 * the rest of the chunks are.
 */
-   ptr = buffer->range_addr_start;
-   ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
if (cpu) {
-   unsigned long p;
-
-   ptr += subbuf_size * nr_subbufs;
-
-   /* Save the beginning of this CPU chunk */
-   p = ptr;
-
ptr = rb_range_align_subbuf(ptr, subbuf_size, nr_subbufs);
+   ptr += subbuf_size * nr_subbufs;
 
/* We can use multiplication to find chunks greater than 1 */
if (cpu > 1) {
unsigned long size;
+   unsigned long p;
 
+   /* Save the beginning of this CPU chunk */
+   p = ptr;
+   ptr = rb_range_align_subbuf(ptr, subbuf_size, 
nr_subbufs);
ptr += subbuf_size * nr_subbufs;
 
/* Now all chunks after this are the same size */
size = ptr - p;
ptr += size * (cpu - 2);
-
-   ptr = rb_range_align_subbuf(ptr, subbuf_size, 
nr_subbufs);
}
}
-   if (ptr + subbuf_size * nr_subbufs > buffer->range_addr_end)
+   return (void *)ptr;
+}
+
+/* Return the start of subbufs given the meta pointer */
+static void *rb_subbufs_from_meta(struct ring_buffer_meta *meta)
+{
+   int subbuf_size = meta->subbuf_size;
+   unsigned long ptr;
+
+   ptr = (unsigned long)meta;
+   ptr = rb_range_align_subbuf(ptr, subbuf_size, meta->nr_subbufs);
+
+   return (void *)ptr;
+}
+
+/*
+ * Return a specific sub-buffer for a given @cpu defined by @idx.
+ */
+static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx)
+{
+   struct ring_buffer_meta *meta;
+   unsigned long ptr;
+   int subbuf_size;
+
+   meta = 

[PATCH v4 02/13] ring-buffer: Add ring_buffer_alloc_range()

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

In preparation to allowing the trace ring buffer to be allocated in a
range of memory that is persistent across reboots, add
ring_buffer_alloc_range(). It takes a contiguous range of memory and will
split it up evenly for the per CPU ring buffers.

If there's not enough memory to handle all CPUs with the minimum size, it
will fail to allocate the ring buffer.
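
A rough sketch of the split (a simplified assumption: this ignores the
per-CPU meta-data header and the sub-buffer alignment the real code
handles):

	/* sketch: whole sub-buffers each CPU gets from the range */
	static unsigned long rb_chunk_subbufs(unsigned long range_size,
					      int nr_cpus,
					      unsigned long subbuf_size)
	{
		unsigned long per_cpu = range_size / nr_cpus;

		/* one sub-buffer is always reserved as the reader page */
		return per_cpu / subbuf_size;
	}

If the result is too small for a minimal buffer on every CPU, the
allocation fails as described above.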

Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/ring_buffer.h |  17 +++
 kernel/trace/ring_buffer.c  | 239 ++--
 2 files changed, 220 insertions(+), 36 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 96d2140b471e..a50b0223b1d3 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -89,6 +89,11 @@ void ring_buffer_discard_commit(struct trace_buffer *buffer,
 struct trace_buffer *
 __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key 
*key);
 
+struct trace_buffer *__ring_buffer_alloc_range(unsigned long size, unsigned 
flags,
+  int order, unsigned long start,
+  unsigned long range_size,
+  struct lock_class_key *key);
+
 /*
  * Because the ring buffer is generic, if other users of the ring buffer get
  * traced by ftrace, it can produce lockdep warnings. We need to keep each
@@ -100,6 +105,18 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, 
struct lock_class_key *k
__ring_buffer_alloc((size), (flags), &__key);   \
 })
 
+/*
+ * Because the ring buffer is generic, if other users of the ring buffer get
+ * traced by ftrace, it can produce lockdep warnings. We need to keep each
+ * ring buffer's lock class separate.
+ */
+#define ring_buffer_alloc_range(size, flags, order, start, range_size) \
+({ \
+   static struct lock_class_key __key; \
+   __ring_buffer_alloc_range((size), (flags), (order), (start),\
+ (range_size), &__key);\
+})
+
 typedef bool (*ring_buffer_cond_fn)(void *data);
 int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full,
 ring_buffer_cond_fn cond, void *data);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 78beaccf9c8c..53abe7916f2b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -42,6 +42,9 @@
 
 static void update_pages_handler(struct work_struct *work);
 
+struct ring_buffer_meta {
+};
+
 /*
  * The ring buffer header is special. We must manually up keep it.
  */
@@ -342,7 +345,8 @@ struct buffer_page {
local_t  entries;   /* entries on this page */
unsigned longreal_end;  /* real end of data */
unsigned order; /* order of the page */
-   u32  id;/* ID for external mapping */
+   u32  id:30; /* ID for external mapping */
+   u32  range:1;   /* Mapped via a range */
struct buffer_data_page *page;  /* Actual data page */
 };
 
@@ -373,7 +377,9 @@ static __always_inline unsigned int rb_page_commit(struct 
buffer_page *bpage)
 
 static void free_buffer_page(struct buffer_page *bpage)
 {
-   free_pages((unsigned long)bpage->page, bpage->order);
+   /* Range pages are not to be freed */
+   if (!bpage->range)
+   free_pages((unsigned long)bpage->page, bpage->order);
kfree(bpage);
 }
 
@@ -523,6 +529,9 @@ struct trace_buffer {
struct rb_irq_work  irq_work;
booltime_stamp_abs;
 
+   unsigned long   range_addr_start;
+   unsigned long   range_addr_end;
+
unsigned intsubbuf_size;
unsigned intsubbuf_order;
unsigned intmax_data_size;
@@ -1490,9 +1499,70 @@ static void rb_check_pages(struct ring_buffer_per_cpu 
*cpu_buffer)
}
 }
 
+/*
+ * Take an address, add the meta data size as well as the array of
+ * array subbuffer indexes, then align it to a subbuffer size.
+ *
+ * This is used to help find the next per cpu subbuffer within a mapped range.
+ */
+static unsigned long
+rb_range_align_subbuf(unsigned long addr, int subbuf_size, int nr_subbufs)
+{
+   addr += sizeof(struct ring_buffer_meta) +
+   sizeof(int) * nr_subbufs;
+   return ALIGN(addr, subbuf_size);
+}
+
+/*
+ * Return a specific sub-buffer for a given @cpu defined by @idx.
+ */
+static void *rb_range_buffer(struct trace_buffer *buffer, int cpu, int 
nr_pages, int idx)
+{
+   unsigned long ptr;
+   int subbuf_size = buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+   int nr_subbufs;
+
+   /* Include the reader page */
+   nr_subbufs = 

[PATCH v4 04/13] tracing: Implement creating an instance based on a given memory region

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Allow for creating a new instance by passing in an address and size to map
the ring buffer for the instance to.

This will allow features like a pstore memory mapped region to be used for
a tracing instance ring buffer that can be retrieved from one boot to the
next.

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/trace.c | 50 +++-
 kernel/trace/trace.h |  4 
 2 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 578a49ff5c32..ff2b504fbe00 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4921,6 +4921,11 @@ static int tracing_open(struct inode *inode, struct file 
*file)
 static bool
 trace_ok_for_array(struct tracer *t, struct trace_array *tr)
 {
+#ifdef CONFIG_TRACER_SNAPSHOT
+   /* arrays with mapped buffer range do not have snapshots */
+   if (tr->range_addr_start && t->use_max_tr)
+   return false;
+#endif
return (tr->flags & TRACE_ARRAY_FL_GLOBAL) || t->allow_instances;
 }
 
@@ -8664,11 +8669,13 @@ tracing_init_tracefs_percpu(struct trace_array *tr, 
long cpu)
tr, cpu, &tracing_entries_fops);
 
 #ifdef CONFIG_TRACER_SNAPSHOT
-   trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu,
-   tr, cpu, &snapshot_fops);
+   if (!tr->range_addr_start) {
+   trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu,
+ tr, cpu, &snapshot_fops);
 
-   trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu,
-   tr, cpu, &snapshot_raw_fops);
+   trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu,
+ tr, cpu, &snapshot_raw_fops);
+   }
 #endif
 }
 
@@ -9205,7 +9212,18 @@ allocate_trace_buffer(struct trace_array *tr, struct 
array_buffer *buf, int size
 
buf->tr = tr;
 
-   buf->buffer = ring_buffer_alloc(size, rb_flags);
+   if (tr->range_addr_start && tr->range_addr_size) {
+   buf->buffer = ring_buffer_alloc_range(size, rb_flags, 0,
+ tr->range_addr_start,
+ tr->range_addr_size);
+   /*
+* This is basically the same as a mapped buffer,
+* with the same restrictions.
+*/
+   tr->mapped++;
+   } else {
+   buf->buffer = ring_buffer_alloc(size, rb_flags);
+   }
if (!buf->buffer)
return -ENOMEM;
 
@@ -9242,6 +9260,10 @@ static int allocate_trace_buffers(struct trace_array 
*tr, int size)
return ret;
 
 #ifdef CONFIG_TRACER_MAX_TRACE
+   /* Fix mapped buffer trace arrays do not have snapshot buffers */
+   if (tr->range_addr_start)
+   return 0;
+
ret = allocate_trace_buffer(tr, &tr->max_buffer,
allocate_snapshot ? size : 1);
if (MEM_FAIL(ret, "Failed to allocate trace buffer\n")) {
@@ -9342,7 +9364,9 @@ static int trace_array_create_dir(struct trace_array *tr)
 }
 
 static struct trace_array *
-trace_array_create_systems(const char *name, const char *systems)
+trace_array_create_systems(const char *name, const char *systems,
+  unsigned long range_addr_start,
+  unsigned long range_addr_size)
 {
struct trace_array *tr;
int ret;
@@ -9368,6 +9392,10 @@ trace_array_create_systems(const char *name, const char 
*systems)
goto out_free_tr;
}
 
+   /* Only for boot up memory mapped ring buffers */
+   tr->range_addr_start = range_addr_start;
+   tr->range_addr_size = range_addr_size;
+
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
 
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9425,7 +9453,7 @@ trace_array_create_systems(const char *name, const char 
*systems)
 
 static struct trace_array *trace_array_create(const char *name)
 {
-   return trace_array_create_systems(name, NULL);
+   return trace_array_create_systems(name, NULL, 0, 0);
 }
 
 static int instance_mkdir(const char *name)
@@ -9479,7 +9507,7 @@ struct trace_array *trace_array_get_by_name(const char 
*name, const char *system
goto out_unlock;
}
 
-   tr = trace_array_create_systems(name, systems);
+   tr = trace_array_create_systems(name, systems, 0, 0);
 
if (IS_ERR(tr))
tr = NULL;
@@ -9672,8 +9700,10 @@ init_tracer_tracefs(struct trace_array *tr, struct 
dentry *d_tracer)
MEM_FAIL(1, "Could not allocate function filter files");
 
 #ifdef CONFIG_TRACER_SNAPSHOT
-   trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer,
- tr, &snapshot_fops);
+   if (!tr->range_addr_start) {
+   

[PATCH v4 01/13] ring-buffer: Allow mapped field to be set without mapping

2024-06-11 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

In preparation for having the ring buffer mapped to a dedicated location,
which will have the same restrictions as user space memory mapped buffers,
allow it to use the "mapped" field of the ring_buffer_per_cpu structure
without having the user space meta page mapping.

When this starts using the mapped field, it will need to handle adding a
user space mapping (and removing it) from a ring buffer that is using a
dedicated memory range.
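
In short, the intended semantics after this patch (a summary, not literal
kernel code):

	/*
	 * cpu_buffer->mapped    - reference count, non-zero while any
	 *                         mapping (user space or memory range) exists
	 * cpu_buffer->meta_page - set only for user space mappings, so code
	 *                         that needs the user meta page now tests it
	 *                         directly instead of testing ->mapped
	 */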

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/ring_buffer.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 28853966aa9a..78beaccf9c8c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5224,6 +5224,9 @@ static void rb_update_meta_page(struct 
ring_buffer_per_cpu *cpu_buffer)
 {
struct trace_buffer_meta *meta = cpu_buffer->meta_page;
 
+   if (!meta)
+   return;
+
meta->reader.read = cpu_buffer->reader_page->read;
meta->reader.id = cpu_buffer->reader_page->id;
meta->reader.lost_events = cpu_buffer->lost_events;
@@ -6167,7 +6170,7 @@ rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu)
 
mutex_lock(&cpu_buffer->mapping_lock);
 
-   if (!cpu_buffer->mapped) {
+   if (!cpu_buffer->mapped || !cpu_buffer->meta_page) {
mutex_unlock(&cpu_buffer->mapping_lock);
return ERR_PTR(-ENODEV);
}
@@ -6359,12 +6362,13 @@ int ring_buffer_map(struct trace_buffer *buffer, int 
cpu,
 */
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
rb_setup_ids_meta_page(cpu_buffer, subbuf_ids);
+
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
err = __rb_map_vma(cpu_buffer, vma);
if (!err) {
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
-   cpu_buffer->mapped = 1;
+   cpu_buffer->mapped++;
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
} else {
kfree(cpu_buffer->subbuf_ids);
@@ -6403,7 +6407,8 @@ int ring_buffer_unmap(struct trace_buffer *buffer, int 
cpu)
mutex_lock(&buffer->mutex);
raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
-   cpu_buffer->mapped = 0;
+   WARN_ON_ONCE(!cpu_buffer->mapped);
+   cpu_buffer->mapped--;
 
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-- 
2.43.0





[PATCH v4 00/13] tracing: Persistent traces across a reboot or crash

2024-06-11 Thread Steven Rostedt
This is a way to map a ring buffer instance across reboots.
The requirement is that you have a memory region that is not erased.
I tested this on a Debian VM running on qemu on a Debian server,
and even tested it on a baremetal box running Fedora. I was
surprised that it worked on the baremetal box, but it does so quite
consistently.

This series does not require the ring buffer mapping, but simply
takes a physical address that has been reserved via memmap (on x86 only).
An example of the kernel command line is:

  memmap=12M$0x28540  trace_instance=boot_mapped@0x28540:12M

The above will reserve 12M at physical address 0x28540 (done by the
existing memmap command line option); the trace_instance option has been
extended to take an address and size (@0x28540:12M). It will then vmap()
that address and allocate a ring buffer in it. If a ring buffer already
exists, it will use it and expose the contents to user space.

The memory reserved is used by the ring buffer of this instance.
It acts like a memory mapped instance, so it has some limitations: it does
not allow snapshots, nor does it allow tracers which use a snapshot buffer
(like the irqsoff and wakeup tracers).

On boot up, when setting up the ring buffer, it looks at the current
content and does a rigorous test to see if the content is valid.
It even walks the events in all the sub-buffers to make sure the
ring buffer meta data is correct. If it determines that the content
is valid, it will reconstruct the ring buffer to use the content
it has found.

If the buffer is valid, on the next boot, the boot_mapped instance
will contain the data from the previous boot. You can cat the
trace or trace_pipe file, or even run trace-cmd extract on it to
make a trace.dat file that holds the data. This is much better than
dealing with ftrace_dump_on_oops (I wish I had this a decade ago!)

There are still some limitations of this buffer. One is that it assumes
that the kernel you are booting back into is the same one that crashed,
or at least that the trace_events (like sched_switch and friends) all have
the same ids. This holds for the same kernel, as the ids are determined
at link time.
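
For example (the ID value is illustrative; it varies per build):

 # head -2 /sys/kernel/tracing/events/sched/sched_switch/format
 name: sched_switch
 ID: 316

Two boots of the same kernel image report the same ID, so raw events
recorded by the previous boot still decode correctly.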

Module events could possibly be a problem as the ids may not match.

This version of the patch series saves the address of a text function and
of a data string in the persistent memory, and these are used to calculate
the delta between text and data addresses of the new boot up. Now
function tracing and "%pS" still work across boots. Even the RCU
trace events that point to static strings work as well!

The delta is exported by a new file in the instance called "last_boot_info"
that has something like this:

 # cat last_boot_info
 text delta:-268435456
 data delta:-268435456

This can be used by trace-cmd, which reads the trace_pipe_raw data and
can now figure out how to map the print_formats and kallsyms to the raw
data in the buffers.

This can be used to debug kernel shutdown. I ran the following:

  # trace-cmd start -B boot_mapped -p function
  # reboot

[after reboot]

  # trace-cmd show -B boot_mapped | tail -20
   swapper/0-1   [000] d..1.63.479667: preempt_count_add <-delay_tsc
   swapper/0-1   [000] d..2.63.479669: preempt_count_sub <-delay_tsc
   swapper/0-1   [000] d..1.63.479671: disable_local_APIC 
<-native_stop_other_cpus
   swapper/0-1   [000] d..1.63.479673: clear_local_APIC.part.0 
<-disable_local_APIC
   swapper/0-1   [000] d..1.63.479716: mcheck_cpu_clear 
<-native_stop_other_cpus
   swapper/0-1   [000] d..1.63.479718: mce_intel_feature_clear 
<-native_stop_other_cpus
   swapper/0-1   [000] d..1.63.479720: lmce_supported 
<-mce_intel_feature_clear
   swapper/0-1   [000] d..1.63.479732: lapic_shutdown 
<-native_machine_shutdown
   swapper/0-1   [000] d..1.63.479735: disable_local_APIC 
<-native_machine_shutdown
   swapper/0-1   [000] d..1.63.479736: clear_local_APIC.part.0 
<-disable_local_APIC
   swapper/0-1   [000] d..1.63.479763: restore_boot_irq_mode 
<-native_machine_shutdown
   swapper/0-1   [000] d..1.63.479763: native_restore_boot_irq_mode 
<-native_machine_shutdown
   swapper/0-1   [000] d..1.63.479764: disconnect_bsp_APIC 
<-native_machine_shutdown
   swapper/0-1   [000] d..1.63.479777: hpet_disable 
<-native_machine_shutdown
   swapper/0-1   [000] d..1.63.479778: iommu_shutdown_noop 
<-native_machine_restart
   swapper/0-1   [000] d..1.63.479779: 
native_machine_emergency_restart <-__do_sys_reboot
   swapper/0-1   [000] d..1.63.479779: tboot_shutdown 
<-native_machine_emergency_restart
   swapper/0-1   [000] d..1.63.479790: acpi_reboot 
<-native_machine_emergency_restart
   swapper/0-1   [000] d..1.63.479791: acpi_reset <-acpi_reboot
   swapper/0-1   [000] d..1.63.479791: 

[PATCH] vDPA: add missing MODULE_DESCRIPTION() macros

2024-06-11 Thread Jeff Johnson
With ARCH=x86, make allmodconfig && make W=1 C=1 reports:
WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/vdpa/vdpa.o
WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/vdpa/ifcvf/ifcvf.o

Add the missing invocations of the MODULE_DESCRIPTION() macro.

Signed-off-by: Jeff Johnson 
---
 drivers/vdpa/ifcvf/ifcvf_main.c | 1 +
 drivers/vdpa/vdpa.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c
index 80d0a0460885..ccf64d7bbfaa 100644
--- a/drivers/vdpa/ifcvf/ifcvf_main.c
+++ b/drivers/vdpa/ifcvf/ifcvf_main.c
@@ -894,4 +894,5 @@ static struct pci_driver ifcvf_driver = {
 
 module_pci_driver(ifcvf_driver);
 
+MODULE_DESCRIPTION("Intel IFC VF NIC driver for virtio dataplane offloading");
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c
index 8d391947eb8d..1ca445e31acb 100644
--- a/drivers/vdpa/vdpa.c
+++ b/drivers/vdpa/vdpa.c
@@ -1538,4 +1538,5 @@ core_initcall(vdpa_init);
 module_exit(vdpa_exit);
 
 MODULE_AUTHOR("Jason Wang ");
+MODULE_DESCRIPTION("vDPA bus");
 MODULE_LICENSE("GPL v2");

---
base-commit: 83a7eefedc9b56fe7bfeff13b6c7356688ffa670
change-id: 20240611-md-drivers-vdpa-391206d17ec3




Re: [PATCH V2 1/2] soc: qcom: smp2p: Add remote name into smp2p irq devname

2024-06-11 Thread Bjorn Andersson
On Tue, Jun 11, 2024 at 10:53:01AM -0700, Chris Lew wrote:
> 
> 
> On 6/11/2024 9:06 AM, Bjorn Andersson wrote:
> > On Tue, Jun 11, 2024 at 06:03:50PM +0530, Sudeepgoud Patil wrote:
> > > Add smp2p irq devname which fetches remote name from respective
> > > smp2p dtsi node, which makes the wakeup source distinguishable
> > > in irq wakeup prints.
> > > 
> > > Signed-off-by: Sudeepgoud Patil 
> > > ---
> > >   drivers/soc/qcom/smp2p.c | 14 +-
> > >   1 file changed, 13 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/drivers/soc/qcom/smp2p.c b/drivers/soc/qcom/smp2p.c
> > > index a21241cbeec7..a77fee048b38 100644
> > > --- a/drivers/soc/qcom/smp2p.c
> > > +++ b/drivers/soc/qcom/smp2p.c
> > > @@ -122,6 +122,7 @@ struct smp2p_entry {
> > >* @ssr_ack_enabled: SMP2P_FEATURE_SSR_ACK feature is supported and was 
> > > enabled
> > >* @ssr_ack: current cached state of the local ack bit
> > >* @negotiation_done: whether negotiating finished
> > > + * @irq_devname: pointer to the smp2p irq devname
> > >* @local_pid:  processor id of the inbound edge
> > >* @remote_pid: processor id of the outbound edge
> > >* @ipc_regmap: regmap for the outbound ipc
> > > @@ -146,6 +147,7 @@ struct qcom_smp2p {
> > >   bool ssr_ack;
> > >   bool negotiation_done;
> > > + char *irq_devname;
> > >   unsigned local_pid;
> > >   unsigned remote_pid;
> > > @@ -614,10 +616,16 @@ static int qcom_smp2p_probe(struct platform_device 
> > > *pdev)
> > >   /* Kick the outgoing edge after allocating entries */
> > >   qcom_smp2p_kick(smp2p);
> > > + smp2p->irq_devname = kasprintf(GFP_KERNEL, "%s", 
> > > pdev->dev.of_node->name);
> > 
> > That's a lot of extra instructions for copying a string, which doesn't
> > need to be copied because of_node->name is const char and the argument
> > to devm_request_threaded_irq() is const char.
> > 
> > So, kstrdup_const() is what you're looking for.
> > 
> > You can then go devm_kstrdup_const() and avoid the kfree() (then
> > kfree_const()) below.
> > 
> > 
> > That said, looking at /proc/interrupts, I think it would make sense to
> > make this devm_kasprintf(..., "smp2p-%s", name);
> > 
> 
> Is it ok to rely on the "of_node->name"? I think device tree tends to always
> have the node name as "smp2p-%s" already, so ("smp2p-%s", name) would result
> in "smp2p-smp2p-adsp".
> 

You're right, I forgot about that.

This actually means that if we replace "smp2p" with NULL, we should get
the descriptive names we're looking for automagically (as the node name
is used to build dev_name()).

> Also Sudeepgoud, I think this will update the irqname in /proc/interrupts
> for the ipcc irqchip entry. It would also be helpful if we could
> differentiate the instances of smp2p irqchips as well. That way we can see
> what processors the 'ready' and 'fatal' interrupts apply to in
> /proc/interrupts.
> 

But this would be a change on the consumer side, right? To replace the
"q6v5" that we have hard coded for all the PAS remoteproc instances.

I'd be happy to see such change.

Regards,
Bjorn

> Can you refer to my internal patch that adds .irq_print_chip() and
> incorporate those changes here?
> 
> > Regards,
> > Bjorn
> > 
> > > + if (!smp2p->irq_devname) {
> > > + ret = -ENOMEM;
> > > + goto unwind_interfaces;
> > > + }
> > > +
> > >   ret = devm_request_threaded_irq(>dev, irq,
> > >   NULL, qcom_smp2p_intr,
> > >   IRQF_ONESHOT,
> > > - "smp2p", (void *)smp2p);
> > > + smp2p->irq_devname, (void *)smp2p);
> > >   if (ret) {
> > >   dev_err(>dev, "failed to request interrupt\n");
> > >   goto unwind_interfaces;
> > > @@ -650,6 +658,8 @@ static int qcom_smp2p_probe(struct platform_device 
> > > *pdev)
> > >   list_for_each_entry(entry, >outbound, node)
> > >   qcom_smem_state_unregister(entry->state);
> > > + kfree(smp2p->irq_devname);
> > > +
> > >   smp2p->out->valid_entries = 0;
> > >   release_mbox:
> > > @@ -677,6 +687,8 @@ static void qcom_smp2p_remove(struct platform_device 
> > > *pdev)
> > >   mbox_free_channel(smp2p->mbox_chan);
> > > + kfree(smp2p->irq_devname);
> > > +
> > >   smp2p->out->valid_entries = 0;
> > >   }
> > > -- 
> > > 
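
For reference, a minimal sketch of the allocation-free naming discussed
above; this is an illustration only, not the merged fix. It assumes
irq_devname becomes const char * so that devm_kstrdup_const() can own
the string, and it keeps the existing unwind_interfaces error path:

	smp2p->irq_devname = devm_kstrdup_const(&pdev->dev,
						pdev->dev.of_node->name,
						GFP_KERNEL);
	if (!smp2p->irq_devname) {
		ret = -ENOMEM;
		goto unwind_interfaces;
	}

	ret = devm_request_threaded_irq(&pdev->dev, irq,
					NULL, qcom_smp2p_intr,
					IRQF_ONESHOT,
					smp2p->irq_devname, (void *)smp2p);

Alternatively, passing NULL as the devname makes
devm_request_threaded_irq() fall back to dev_name(&pdev->dev), which is
built from the DT node name, so no string handling is needed at all.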



Re: [PATCH 5/6] remoteproc: da8xx: Use devm action to release reserved memory

2024-06-11 Thread Mathieu Poirier
Hi Andrew,

On Mon, Jun 10, 2024 at 10:17:20AM -0500, Andrew Davis wrote:
> This helps prevent mistakes like freeing out of order in cleanup functions
> and forgetting to free on error paths.
> 
> Signed-off-by: Andrew Davis 
> ---
>  drivers/remoteproc/da8xx_remoteproc.c | 29 +--
>  1 file changed, 14 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/remoteproc/da8xx_remoteproc.c 
> b/drivers/remoteproc/da8xx_remoteproc.c
> index c8b7576937733..1ce91516fc6e5 100644
> --- a/drivers/remoteproc/da8xx_remoteproc.c
> +++ b/drivers/remoteproc/da8xx_remoteproc.c
> @@ -233,6 +233,13 @@ static int da8xx_rproc_get_internal_memories(struct 
> platform_device *pdev,
>   return 0;
>  }
>  
> +static void da8xx_rproc_mem_release(void *data)
> +{
> + struct device *dev = data;
> +
> + of_reserved_mem_device_release(dev);
> +}
> +
>  static int da8xx_rproc_probe(struct platform_device *pdev)
>  {
>   struct device *dev = >dev;
> @@ -293,14 +300,13 @@ static int da8xx_rproc_probe(struct platform_device 
> *pdev)
>   ret);
>   return ret;
>   }
> + devm_add_action_or_reset(>dev, da8xx_rproc_mem_release, 
> >dev);
>   }
>  
>   rproc = devm_rproc_alloc(dev, "dsp", _rproc_ops, da8xx_fw_name,
>sizeof(*drproc));
> - if (!rproc) {
> - ret = -ENOMEM;
> - goto free_mem;
> - }
> + if (!rproc)
> + return -ENOMEM;
>  
>   /* error recovery is not supported at present */
>   rproc->recovery_disabled = true;
> @@ -313,7 +319,7 @@ static int da8xx_rproc_probe(struct platform_device *pdev)
>  
>   ret = da8xx_rproc_get_internal_memories(pdev, drproc);
>   if (ret)
> - goto free_mem;
> + return ret;
>  
>   platform_set_drvdata(pdev, rproc);
>  
> @@ -323,7 +329,7 @@ static int da8xx_rproc_probe(struct platform_device *pdev)
>   rproc);
>   if (ret) {
>   dev_err(dev, "devm_request_threaded_irq error: %d\n", ret);
> - goto free_mem;
> + return ret;
>   }
>  
>   /*
> @@ -333,7 +339,7 @@ static int da8xx_rproc_probe(struct platform_device *pdev)
>*/
>   ret = reset_control_assert(dsp_reset);
>   if (ret)
> - goto free_mem;
> + return ret;
>  
>   drproc->chipsig = chipsig;
>   drproc->bootreg = bootreg;
> @@ -344,15 +350,10 @@ static int da8xx_rproc_probe(struct platform_device 
> *pdev)
>   ret = rproc_add(rproc);
>   if (ret) {
>   dev_err(dev, "rproc_add failed: %d\n", ret);
> - goto free_mem;
> + return ret;
>   }
>  
>   return 0;
> -
> -free_mem:
> - if (dev->of_node)
> - of_reserved_mem_device_release(dev);
> - return ret;
>  }
>  
>  static void da8xx_rproc_remove(struct platform_device *pdev)
> @@ -369,8 +370,6 @@ static void da8xx_rproc_remove(struct platform_device 
> *pdev)
>   disable_irq(drproc->irq);
>  
>   rproc_del(rproc);
> - if (dev->of_node)
> - of_reserved_mem_device_release(dev);


This patch gives me the following compilation warning:

  CC  kernel/module/main.o
  CC  drivers/remoteproc/da8xx_remoteproc.o
  AR  drivers/base/firmware_loader/built-in.a
  AR  drivers/base/built-in.a
remoteproc/kernel/drivers/remoteproc/da8xx_remoteproc.c: In function 
‘da8xx_rproc_remove’:
remoteproc/kernel/drivers/remoteproc/da8xx_remoteproc.c:363:24: warning: unused 
variable ‘dev’ [-Wunused-variable]
  363 | struct device *dev = >dev;
  |^~~
  AR  drivers/remoteproc/built-in.a

which is then fixed in the following patch with the introduction of
devm_rproc_add().  I suggest doing the opposite, i.e introduce devm_rproc_add()
and then get rid of da8xx_rproc_remove() by introducing
da8xx_rproc_mem_release().

No need to resend the omap set, I have them.

Thanks,
Mathieu

>  }
>  
>  static const struct of_device_id davinci_rproc_of_match[] __maybe_unused = {
> -- 
> 2.39.2
> 
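
As a side note on the devm conversion itself: devm_add_action_or_reset()
can fail, so its return value should be checked. A rough sketch of the
probe-side pattern, combined with the devm_rproc_add() conversion
referenced above (an illustration only, not the final series):

	ret = devm_add_action_or_reset(&pdev->dev, da8xx_rproc_mem_release,
				       &pdev->dev);
	if (ret)
		return ret;

	/* ... remaining probe steps unchanged ... */

	ret = devm_rproc_add(dev, rproc);
	if (ret) {
		dev_err(dev, "rproc_add failed: %d\n", ret);
		return ret;
	}

	return 0;

With both pieces in place, da8xx_rproc_remove() (and its unused 'dev'
variable) can go away entirely.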



Re: [PATCH V2 1/2] soc: qcom: smp2p: Add remote name into smp2p irq devname

2024-06-11 Thread Chris Lew




On 6/11/2024 9:06 AM, Bjorn Andersson wrote:

On Tue, Jun 11, 2024 at 06:03:50PM +0530, Sudeepgoud Patil wrote:

Add an smp2p irq devname which fetches the remote name from the
respective smp2p dtsi node; this makes the wakeup source
distinguishable in irq wakeup prints.

Signed-off-by: Sudeepgoud Patil 
---
  drivers/soc/qcom/smp2p.c | 14 +-
  1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/soc/qcom/smp2p.c b/drivers/soc/qcom/smp2p.c
index a21241cbeec7..a77fee048b38 100644
--- a/drivers/soc/qcom/smp2p.c
+++ b/drivers/soc/qcom/smp2p.c
@@ -122,6 +122,7 @@ struct smp2p_entry {
   * @ssr_ack_enabled: SMP2P_FEATURE_SSR_ACK feature is supported and was 
enabled
   * @ssr_ack: current cached state of the local ack bit
   * @negotiation_done: whether negotiating finished
+ * @irq_devname: pointer to the smp2p irq devname
   * @local_pid:processor id of the inbound edge
   * @remote_pid:   processor id of the outbound edge
   * @ipc_regmap:   regmap for the outbound ipc
@@ -146,6 +147,7 @@ struct qcom_smp2p {
bool ssr_ack;
bool negotiation_done;
  
+	char *irq_devname;

unsigned local_pid;
unsigned remote_pid;
  
@@ -614,10 +616,16 @@ static int qcom_smp2p_probe(struct platform_device *pdev)

/* Kick the outgoing edge after allocating entries */
qcom_smp2p_kick(smp2p);
  
+	smp2p->irq_devname = kasprintf(GFP_KERNEL, "%s", pdev->dev.of_node->name);


That's a lot of extra instructions for copying a string, which doesn't
need to be copied because of_node->name is const char and the argument
to devm_request_threaded_irq() is const char.

So, kstrdup_const() is what you're looking for.

You can then go devm_kstrdup_const() and avoid the kfree() (then
kfree_const()) below.


That said, looking at /proc/interrupts, I think it would make sense to
make this devm_kasprintf(..., "smp2p-%s", name);



Is it ok to rely on the "of_node->name"? I think device tree tends to 
always have the node name as "smp2p-%s" already, so ("smp2p-%s", name) 
would result in "smp2p-smp2p-adsp".


Also Sudeepgoud, I think this will update the irqname in 
/proc/interrupts for the ipcc irqchip entry. It would also be helpful if 
we could differentiate the instances of smp2p irqchips as well. That way 
we can see what processors the 'ready' and 'fatal' interrupts apply to 
in /proc/interrupts.


Can you refer to my internal patch that adds .irq_print_chip() and 
incorporate those changes here?



Regards,
Bjorn


+   if (!smp2p->irq_devname) {
+   ret = -ENOMEM;
+   goto unwind_interfaces;
+   }
+
ret = devm_request_threaded_irq(>dev, irq,
NULL, qcom_smp2p_intr,
IRQF_ONESHOT,
-   "smp2p", (void *)smp2p);
+   smp2p->irq_devname, (void *)smp2p);
if (ret) {
dev_err(>dev, "failed to request interrupt\n");
goto unwind_interfaces;
@@ -650,6 +658,8 @@ static int qcom_smp2p_probe(struct platform_device *pdev)
list_for_each_entry(entry, >outbound, node)
qcom_smem_state_unregister(entry->state);
  
+	kfree(smp2p->irq_devname);

+
smp2p->out->valid_entries = 0;
  
  release_mbox:

@@ -677,6 +687,8 @@ static void qcom_smp2p_remove(struct platform_device *pdev)
  
  	mbox_free_channel(smp2p->mbox_chan);
  
+	kfree(smp2p->irq_devname);

+
smp2p->out->valid_entries = 0;
  }
  
--






Re: [lvc-project] [PATCH] remoteproc: imx_rproc: Adjust phandle parsing issue while remapping optional addresses in imx_rproc_addr_init()

2024-06-11 Thread Mathieu Poirier
On Mon, Jun 10, 2024 at 08:36:19PM +0300, Fedor Pchelkin wrote:
> On Mon, 10. Jun 10:47, Mathieu Poirier wrote:
> > On Thu, Jun 06, 2024 at 10:52:04AM +0300, Aleksandr Mishin wrote:
> > > In imx_rproc_addr_init() "nph = of_count_phandle_with_args()" just counts
> > > number of phandles. But phandles may be empty. So of_parse_phandle() in
> > > the parsing loop (0 < a < nph) may return NULL which is later 
> > > dereferenced.
> > > Adjust this issue by adding NULL-return check.
> > > 
> > > Found by Linux Verification Center (linuxtesting.org) with SVACE.
> > > 
> > > Fixes: a0ff4aa6f010 ("remoteproc: imx_rproc: add a NXP/Freescale 
> > > imx_rproc driver")
> > > Signed-off-by: Aleksandr Mishin 
> > > ---
> > >  drivers/remoteproc/imx_rproc.c | 2 ++
> > >  1 file changed, 2 insertions(+)
> > > 
> > > diff --git a/drivers/remoteproc/imx_rproc.c 
> > > b/drivers/remoteproc/imx_rproc.c
> > > index 5a3fb902acc9..39eacd90af14 100644
> > > --- a/drivers/remoteproc/imx_rproc.c
> > > +++ b/drivers/remoteproc/imx_rproc.c
> > > @@ -726,6 +726,8 @@ static int imx_rproc_addr_init(struct imx_rproc *priv,
> > >   struct resource res;
> > >  
> > >   node = of_parse_phandle(np, "memory-region", a);
> > > + if (!node)
> > 
> > You're missing an "of_node_put()" before continuing.
> > 
> 
> The node is NULL in this case so of_node_put() is not needed..?

Oh yeah, doing an of_node_put() with a NULL value is a really good idea...

I will pick up this patch.

> 
> Btw, there is a "rsc-table" node->name check at the end of the loop
> body. It was added recently with commit 5e4c1243071d ("remoteproc:
> imx_rproc: support remote cores booted before Linux Kernel"). Seems to me
> it forgot that of_node_put() is called way before that.
> 

I agree.

> Also commit 61afafe8b938 ("remoteproc: imx_rproc: Fix refcount leak in
> imx_rproc_addr_init") was dealing with the last of_node_put() call here
> but it's still not in the right place I'd say.
>

You mean because of node->name being used after the last of_node_put() or is
there something else?

Aleksandr - Can you send another patch for the above?

Thanks,
Mathieu

> > > + continue;
> > >   /* Not map vdevbuffer, vdevring region */
> > >   if (!strncmp(node->name, "vdev", strlen("vdev"))) {
> > >   of_node_put(node);
> > > -- 
> > > 2.30.2
> > > 
> > > 
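
To make the of_node_put() ordering discussed above concrete, a rough
sketch of the loop body follows; the mapping code is elided and the
rsc-table handling is paraphrased from the driver, so treat this as an
illustration only:

	node = of_parse_phandle(np, "memory-region", a);
	if (!node)
		continue;

	/* Not map vdevbuffer, vdevring region */
	if (!strncmp(node->name, "vdev", strlen("vdev"))) {
		of_node_put(node);
		continue;
	}

	err = of_address_to_resource(node, 0, &res);
	if (err) {
		of_node_put(node);
		return err;
	}

	/* ... remap and bookkeeping elided ... */

	/* Any use of node->name, including the "rsc-table" check, must
	 * happen before the reference is dropped.
	 */
	if (!strcmp(node->name, "rsc-table"))
		priv->rsc_table = priv->mem[b].cpu_addr; /* paraphrased */

	of_node_put(node);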



Re: [PATCH V2 1/2] soc: qcom: smp2p: Add remote name into smp2p irq devname

2024-06-11 Thread Bjorn Andersson
On Tue, Jun 11, 2024 at 06:03:50PM +0530, Sudeepgoud Patil wrote:
> Add an smp2p irq devname which fetches the remote name from the
> respective smp2p dtsi node; this makes the wakeup source
> distinguishable in irq wakeup prints.
> 
> Signed-off-by: Sudeepgoud Patil 
> ---
>  drivers/soc/qcom/smp2p.c | 14 +-
>  1 file changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/soc/qcom/smp2p.c b/drivers/soc/qcom/smp2p.c
> index a21241cbeec7..a77fee048b38 100644
> --- a/drivers/soc/qcom/smp2p.c
> +++ b/drivers/soc/qcom/smp2p.c
> @@ -122,6 +122,7 @@ struct smp2p_entry {
>   * @ssr_ack_enabled: SMP2P_FEATURE_SSR_ACK feature is supported and was 
> enabled
>   * @ssr_ack: current cached state of the local ack bit
>   * @negotiation_done: whether negotiating finished
> + * @irq_devname: pointer to the smp2p irq devname
>   * @local_pid:   processor id of the inbound edge
>   * @remote_pid:  processor id of the outbound edge
>   * @ipc_regmap:  regmap for the outbound ipc
> @@ -146,6 +147,7 @@ struct qcom_smp2p {
>   bool ssr_ack;
>   bool negotiation_done;
>  
> + char *irq_devname;
>   unsigned local_pid;
>   unsigned remote_pid;
>  
> @@ -614,10 +616,16 @@ static int qcom_smp2p_probe(struct platform_device 
> *pdev)
>   /* Kick the outgoing edge after allocating entries */
>   qcom_smp2p_kick(smp2p);
>  
> + smp2p->irq_devname = kasprintf(GFP_KERNEL, "%s", 
> pdev->dev.of_node->name);

That's a lot of extra instructions for copying a string, which doesn't
need to be copied because of_node->name is const char and the argument
to devm_request_threaded_irq() is const char.

So, kstrdup_const() is what you're looking for.

You can then go devm_kstrdup_const() and avoid the kfree() (then
kfree_const()) below.


That said, looking at /proc/interrupts, I think it would make sense to
make this devm_kasprintf(..., "smp2p-%s", name);

Regards,
Bjorn

> + if (!smp2p->irq_devname) {
> + ret = -ENOMEM;
> + goto unwind_interfaces;
> + }
> +
>   ret = devm_request_threaded_irq(>dev, irq,
>   NULL, qcom_smp2p_intr,
>   IRQF_ONESHOT,
> - "smp2p", (void *)smp2p);
> + smp2p->irq_devname, (void *)smp2p);
>   if (ret) {
>   dev_err(>dev, "failed to request interrupt\n");
>   goto unwind_interfaces;
> @@ -650,6 +658,8 @@ static int qcom_smp2p_probe(struct platform_device *pdev)
>   list_for_each_entry(entry, >outbound, node)
>   qcom_smem_state_unregister(entry->state);
>  
> + kfree(smp2p->irq_devname);
> +
>   smp2p->out->valid_entries = 0;
>  
>  release_mbox:
> @@ -677,6 +687,8 @@ static void qcom_smp2p_remove(struct platform_device 
> *pdev)
>  
>   mbox_free_channel(smp2p->mbox_chan);
>  
> + kfree(smp2p->irq_devname);
> +
>   smp2p->out->valid_entries = 0;
>  }
>  
> -- 
> 



Re: [PATCHv8 bpf-next 0/9] uprobe: uretprobe speed up

2024-06-11 Thread Google
On Tue, 11 Jun 2024 13:21:49 +0200
Jiri Olsa  wrote:

> hi,
> as part of the effort on speeding up the uprobes [0] coming with
> return uprobe optimization by using syscall instead of the trap
> on the uretprobe trampoline.
> 
> The speed up depends on instruction type that uprobe is installed
> and depends on specific HW type, please check patch 1 for details.
> 
> Patches 1-8 are based on bpf-next/master, but patch 2 and 3 are
> apply-able on linux-trace.git tree probes/for-next branch.
> Patch 9 is based on man-pages master.
> 
> v8 changes:
> - rebased (another new syscall got merged)
> - added acks
> 
> Also available at:
>   https://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git
>   uretprobe_syscall

Applied patch [1/9] - [8/9] on probes/for-next in 
 git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git

Thank you!

> 
> thanks,
> jirka
> 
> 
> Notes to check list items in Documentation/process/adding-syscalls.rst:
> 
> - System Call Alternatives
>   New syscall seems like the best way in here, because we need
>   just to quickly enter kernel with no extra arguments processing,
>   which we'd need to do if we decided to use another syscall.
> 
> - Designing the API: Planning for Extension
>   The uretprobe syscall is very specific and most likely won't be
>   extended in the future.
> 
>   At the moment it does not take any arguments, and even if it does
>   in the future, it's allowed to be called only from the trampoline
>   prepared by the kernel, so no users will be broken.
> 
> - Designing the API: Other Considerations
>   N/A because uretprobe syscall does not return reference to kernel
>   object.
> 
> - Proposing the API
>   Wiring up of the uretprobe system call is in separate change,
>   selftests and man page changes are part of the patchset.
> 
> - Generic System Call Implementation
>   There's no CONFIG option for the new functionality because it
>   keeps the same behaviour from the user POV.
> 
> - x86 System Call Implementation
>   It's 64-bit syscall only.
> 
> - Compatibility System Calls (Generic)
>   N/A uretprobe syscall has no arguments and is not supported
>   for compat processes.
> 
> - Compatibility System Calls (x86)
>   N/A uretprobe syscall is not supported for compat processes.
> 
> - System Calls Returning Elsewhere
>   N/A.
> 
> - Other Details
>   N/A.
> 
> - Testing
>   Adding new bpf selftests and ran ltp on top of this change.
> 
> - Man Page
>   Attached.
> 
> - Do not call System Calls in the Kernel
>   N/A.
> 
> 
> [0] https://lore.kernel.org/bpf/ZeCXHKJ--iYYbmLj@krava/
> ---
> Jiri Olsa (8):
>   x86/shstk: Make return uprobe work with shadow stack
>   uprobe: Wire up uretprobe system call
>   uprobe: Add uretprobe syscall to speed up return probe
>   selftests/x86: Add return uprobe shadow stack test
>   selftests/bpf: Add uretprobe syscall test for regs integrity
>   selftests/bpf: Add uretprobe syscall test for regs changes
>   selftests/bpf: Add uretprobe syscall call from user space test
>   selftests/bpf: Add uretprobe shadow stack test
> 
>  arch/x86/entry/syscalls/syscall_64.tbl  |   1 +
>  arch/x86/include/asm/shstk.h|   4 +
>  arch/x86/kernel/shstk.c |  16 
>  arch/x86/kernel/uprobes.c   | 124 
> -
>  include/linux/syscalls.h|   2 +
>  include/linux/uprobes.h |   3 +
>  include/uapi/asm-generic/unistd.h   |   5 +-
>  kernel/events/uprobes.c |  24 --
>  kernel/sys_ni.c |   2 +
>  tools/include/linux/compiler.h  |   4 +
>  tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c   | 123 
> -
>  tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c | 385 
> +++
>  tools/testing/selftests/bpf/progs/uprobe_syscall.c  |  15 
>  tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c |  17 
>  tools/testing/selftests/x86/test_shadow_stack.c | 145 
> ++
>  15 files changed, 860 insertions(+), 10 deletions(-)
>  create mode 100644 tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
>  create mode 100644 tools/testing/selftests/bpf/progs/uprobe_syscall.c
>  create mode 100644 
> tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c
> 
> Jiri Olsa (1):
>   man2: Add uretprobe syscall page
> 
>  man/man2/uretprobe.2 | 56 
> 
>  1 file changed, 56 insertions(+)
>  create mode 100644 man/man2/uretprobe.2


-- 
Masami Hiramatsu (Google) 



Re: [PATCHv8 9/9] man2: Add uretprobe syscall page

2024-06-11 Thread Alejandro Colomar
Hi,

On Tue, Jun 11, 2024 at 11:30:22PM GMT, Masami Hiramatsu wrote:
> On Tue, 11 Jun 2024 13:21:58 +0200
> Jiri Olsa  wrote:
> 
> > Adding man page for new uretprobe syscall.
> > 
> > Acked-by: Andrii Nakryiko 
> > Reviewed-by: Alejandro Colomar 
> > Signed-off-by: Jiri Olsa 
> 
> This looks good to me.
> 
> Reviewed-by: Masami Hiramatsu (Google) 
> 
> And this needs to be picked by linux-man@ project.

Yup; please ping me when the rest is merged and I should pick it.

Have a lovely day!
Alex

> 
> Thank you,
> 
> > ---
> >  man/man2/uretprobe.2 | 56 
> >  1 file changed, 56 insertions(+)
> >  create mode 100644 man/man2/uretprobe.2
> > 
> > diff --git a/man/man2/uretprobe.2 b/man/man2/uretprobe.2
> > new file mode 100644
> > index ..cf1c2b0d852e
> > --- /dev/null
> > +++ b/man/man2/uretprobe.2
> > @@ -0,0 +1,56 @@
> > +.\" Copyright (C) 2024, Jiri Olsa 
> > +.\"
> > +.\" SPDX-License-Identifier: Linux-man-pages-copyleft
> > +.\"
> > +.TH uretprobe 2 (date) "Linux man-pages (unreleased)"
> > +.SH NAME
> > +uretprobe \- execute pending return uprobes
> > +.SH SYNOPSIS
> > +.nf
> > +.B int uretprobe(void)
> > +.fi
> > +.SH DESCRIPTION
> > +The
> > +.BR uretprobe ()
> > +system call is an alternative to breakpoint instructions for triggering 
> > return
> > +uprobe consumers.
> > +.P
> > +Calls to
> > +.BR uretprobe ()
> > +system call are only made from the user-space trampoline provided by the 
> > kernel.
> > +Calls from any other place result in a
> > +.BR SIGILL .
> > +.SH RETURN VALUE
> > +The
> > +.BR uretprobe ()
> > +system call return value is architecture-specific.
> > +.SH ERRORS
> > +.TP
> > +.B SIGILL
> > +The
> > +.BR uretprobe ()
> > +system call was called by a user-space program.
> > +.SH VERSIONS
> > +Details of the
> > +.BR uretprobe ()
> > +system call behavior vary across systems.
> > +.SH STANDARDS
> > +None.
> > +.SH HISTORY
> > +TBD
> > +.SH NOTES
> > +The
> > +.BR uretprobe ()
> > +system call was initially introduced for the x86_64 architecture
> > +where it was shown to be faster than breakpoint traps.
> > +It might be extended to other architectures.
> > +.P
> > +The
> > +.BR uretprobe ()
> > +system call exists only to allow the invocation of return uprobe consumers.
> > +It should
> > +.B never
> > +be called directly.
> > +Details of the arguments (if any) passed to
> > +.BR uretprobe ()
> > +and the return value are architecture-specific.
> > -- 
> > 2.45.1
> > 
> 
> 
> -- 
> Masami Hiramatsu (Google) 
> 

-- 





Re: [PATCHv8 9/9] man2: Add uretprobe syscall page

2024-06-11 Thread Google
On Tue, 11 Jun 2024 13:21:58 +0200
Jiri Olsa  wrote:

> Adding man page for new uretprobe syscall.
> 
> Acked-by: Andrii Nakryiko 
> Reviewed-by: Alejandro Colomar 
> Signed-off-by: Jiri Olsa 

This looks good to me.

Reviewed-by: Masami Hiramatsu (Google) 

And this needs to be picked by linux-man@ project.

Thank you,

> ---
>  man/man2/uretprobe.2 | 56 
>  1 file changed, 56 insertions(+)
>  create mode 100644 man/man2/uretprobe.2
> 
> diff --git a/man/man2/uretprobe.2 b/man/man2/uretprobe.2
> new file mode 100644
> index ..cf1c2b0d852e
> --- /dev/null
> +++ b/man/man2/uretprobe.2
> @@ -0,0 +1,56 @@
> +.\" Copyright (C) 2024, Jiri Olsa 
> +.\"
> +.\" SPDX-License-Identifier: Linux-man-pages-copyleft
> +.\"
> +.TH uretprobe 2 (date) "Linux man-pages (unreleased)"
> +.SH NAME
> +uretprobe \- execute pending return uprobes
> +.SH SYNOPSIS
> +.nf
> +.B int uretprobe(void)
> +.fi
> +.SH DESCRIPTION
> +The
> +.BR uretprobe ()
> +system call is an alternative to breakpoint instructions for triggering 
> return
> +uprobe consumers.
> +.P
> +Calls to
> +.BR uretprobe ()
> +system call are only made from the user-space trampoline provided by the 
> kernel.
> +Calls from any other place result in a
> +.BR SIGILL .
> +.SH RETURN VALUE
> +The
> +.BR uretprobe ()
> +system call return value is architecture-specific.
> +.SH ERRORS
> +.TP
> +.B SIGILL
> +The
> +.BR uretprobe ()
> +system call was called by a user-space program.
> +.SH VERSIONS
> +Details of the
> +.BR uretprobe ()
> +system call behavior vary across systems.
> +.SH STANDARDS
> +None.
> +.SH HISTORY
> +TBD
> +.SH NOTES
> +The
> +.BR uretprobe ()
> +system call was initially introduced for the x86_64 architecture
> +where it was shown to be faster than breakpoint traps.
> +It might be extended to other architectures.
> +.P
> +The
> +.BR uretprobe ()
> +system call exists only to allow the invocation of return uprobe consumers.
> +It should
> +.B never
> +be called directly.
> +Details of the arguments (if any) passed to
> +.BR uretprobe ()
> +and the return value are architecture-specific.
> -- 
> 2.45.1
> 


-- 
Masami Hiramatsu (Google) 



Re: [PATCHv7 bpf-next 0/9] uprobe: uretprobe speed up

2024-06-11 Thread Google
On Tue, 11 Jun 2024 09:30:52 +0100
Andrii Nakryiko  wrote:

> 
> > I think it would be better to include those patches together in
> > linux-tree. Can you review and ack to the last patch ? ([9/9])
> 
> Sure. Jiri, please add my ack for the entire series in the next revision:
> 
> Acked-by: Andrii Nakryiko 

Thanks! let me pick the next version.


-- 
Masami Hiramatsu (Google) 



Re: [PATCH v3 3/3] tracing/kprobe: Remove cleanup code unrelated to selftest

2024-06-11 Thread Steven Rostedt
On Tue, 11 Jun 2024 22:30:56 +0900
"Masami Hiramatsu (Google)"  wrote:

> From: Masami Hiramatsu (Google) 
> 
> This code that cleans up all kprobe events is not related to the selftest
> itself, and it can fail for reasons unrelated to this test.
> If the test is successful, the generated events are cleaned up.
> And if not, we cannot guarantee that the kprobe events will work
> correctly. So, either way, there is no need to clean them up.
> 
> Signed-off-by: Masami Hiramatsu (Google) 

Reviewed-by: Steven Rostedt (Google) 

-- Steve

> ---
>  kernel/trace/trace_kprobe.c |4 
>  1 file changed, 4 deletions(-)
> 
> diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
> index 8c5816c04bd2..7fd0f8576e4c 100644
> --- a/kernel/trace/trace_kprobe.c
> +++ b/kernel/trace/trace_kprobe.c
> @@ -2114,10 +2114,6 @@ static __init int kprobe_trace_self_tests_init(void)
>  
>  
>  end:
> - ret = dyn_events_release_all(_kprobe_ops);
> - if (WARN_ONCE(ret, "error on cleaning up probes."))
> - warn++;
> -
>   /*
>* Wait for the optimizer work to finish. Otherwise it might fiddle
>* with probes in already freed __init text.




Re: [PATCH v3 2/3] tracing/kprobe: Integrate test warnings into WARN_ONCE

2024-06-11 Thread Steven Rostedt
On Tue, 11 Jun 2024 22:30:46 +0900
"Masami Hiramatsu (Google)"  wrote:

> From: Masami Hiramatsu (Google) 
> 
> Clean up the redundant WARN_ON_ONCE(cond) + pr_warn(msg) into
> WARN_ONCE(cond, msg). Also add some WARN_ONCE() calls for the hitcount
> checks. These WARN_ONCE() errors make it easy to handle errors from ktest.
> 
> Suggested-by: Steven Rostedt 
> Signed-off-by: Masami Hiramatsu (Google) 
> ---
>  Changes in v3:
>   - integrate WARN_ON_ONCE() and pr_warn() instead of removing
> WARN_ONCE().

Reviewed-by: Steven Rostedt (Google) 

-- Steve



Re: [PATCH v6 3/3] sched/rt: Rename realtime_{prio, task}() to rt_or_dl_{prio, task}()

2024-06-11 Thread Steven Rostedt
On Tue, 11 Jun 2024 11:03:25 +0200
Daniel Bristot de Oliveira  wrote:

> On 6/10/24 21:20, Qais Yousef wrote:
> > -   if (realtime_prio(p->prio)) /* includes deadline */
> > +   if (rt_or_dl_prio(p->prio))  
> 
> that is it... no thinking, no recall, no comment, no confusion...

How about "not_normal_prio(p->prio)" ?

/me runs!

-- Steve



Re: [PATCH 05/14] tracefs: replace call_rcu by kfree_rcu for simple kmem_cache_free callback

2024-06-11 Thread Steven Rostedt
On Tue, 11 Jun 2024 10:42:28 +0200
Vlastimil Babka  wrote:

> AFAICS that documented way is for a different situation? I assume you mean
> this part:
> 
> * Specify any additional patch prerequisites for cherry picking::
> 
> Cc:  # 3.3.x: a1f84a3: sched: Check for idle
> 
> But that would assume we actively want to backport this cleanup patch in the
> first place. But as I understand Steven's intention, we want just to make
> sure that if in the future this patch is backported (i.e. as a dependency of
> something else) it won't be forgotten to also backport c9929f0e344a
> ("mm/slob: remove CONFIG_SLOB"). How to express that without actively
> marking this patch for backport at the same time?

Exactly! This isn't to be tagged as stable. It's just a way to say "if you
need this patch for any reason, you also need patch X".

I think "Depends-on" is the way to go, as it is *not* a stable thing, and
what is in stable rules is only about stable patches.

-- Steve



Re: [PATCH 05/14] tracefs: replace call_rcu by kfree_rcu for simple kmem_cache_free callback

2024-06-11 Thread Steven Rostedt
On Tue, 11 Jun 2024 08:23:11 +0200
Greg KH  wrote:

> > Depends-on: c9929f0e344a ("mm/slob: remove CONFIG_SLOB")  
> 
> Ick, no, use the documented way of handling this as described in the
> stable kernel rules file.

You mentioned this before, I guess you mean this:

> To send additional instructions to the stable team, use a shell-style inline
> comment to pass arbitrary or predefined notes:
> 
> * Specify any additional patch prerequisites for cherry picking::
> 
> Cc:  # 3.3.x: a1f84a3: sched: Check for idle
> Cc:  # 3.3.x: 1b9508f: sched: Rate-limit newidle
> Cc:  # 3.3.x: fd21073: sched: Fix affinity logic
> Cc:  # 3.3.x
> Signed-off-by: Ingo Molnar 
> 
>   The tag sequence has the meaning of::
> 
> git cherry-pick a1f84a3
> git cherry-pick 1b9508f
> git cherry-pick fd21073
> git cherry-pick 
> 
>   Note that for a patch series, you do not have to list as prerequisites the
>   patches present in the series itself. For example, if you have the following
>   patch series::
> 
> patch1
> patch2
> 
>   where patch2 depends on patch1, you do not have to list patch1 as
>   prerequisite of patch2 if you have already marked patch1 for stable
>   inclusion.

What's with the "3.3.x"? Isn't that obsolete? And honestly, I find the
above much more "ick" than "Depends-on:". That's because I like to read
human readable tags and not machine processing tags. I'm a human, not a machine.

-- Steve



[PATCH v3 3/3] tracing/kprobe: Remove cleanup code unrelated to selftest

2024-06-11 Thread Masami Hiramatsu (Google)
From: Masami Hiramatsu (Google) 

This code that cleans up all kprobe events is not related to the selftest
itself, and it can fail for reasons unrelated to this test.
If the test is successful, the generated events are cleaned up.
And if not, we cannot guarantee that the kprobe events will work
correctly. So, either way, there is no need to clean them up.

Signed-off-by: Masami Hiramatsu (Google) 
---
 kernel/trace/trace_kprobe.c |4 
 1 file changed, 4 deletions(-)

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 8c5816c04bd2..7fd0f8576e4c 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -2114,10 +2114,6 @@ static __init int kprobe_trace_self_tests_init(void)
 
 
 end:
-   ret = dyn_events_release_all(_kprobe_ops);
-   if (WARN_ONCE(ret, "error on cleaning up probes."))
-   warn++;
-
/*
 * Wait for the optimizer work to finish. Otherwise it might fiddle
 * with probes in already freed __init text.




[PATCH v3 2/3] tracing/kprobe: Integrate test warnings into WARN_ONCE

2024-06-11 Thread Masami Hiramatsu (Google)
From: Masami Hiramatsu (Google) 

Clean up the redundant WARN_ON_ONCE(cond) + pr_warn(msg) into
WARN_ONCE(cond, msg). Also add some WARN_ONCE() calls for the hitcount
checks. These WARN_ONCE() errors make it easy to handle errors from ktest.

Suggested-by: Steven Rostedt 
Signed-off-by: Masami Hiramatsu (Google) 
---
 Changes in v3:
  - integrate WARN_ON_ONCE() and pr_warn() instead of removing
WARN_ONCE().
---
 kernel/trace/trace_kprobe.c |   54 +++
 1 file changed, 19 insertions(+), 35 deletions(-)

diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 16383247bdbf..8c5816c04bd2 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -2023,19 +2023,16 @@ static __init int kprobe_trace_self_tests_init(void)
pr_info("Testing kprobe tracing: ");
 
ret = create_or_delete_trace_kprobe("p:testprobe 
kprobe_trace_selftest_target $stack $stack0 +0($stack)");
-   if (WARN_ON_ONCE(ret)) {
-   pr_warn("error on probing function entry.\n");
+   if (WARN_ONCE(ret, "error on probing function entry.")) {
warn++;
} else {
/* Enable trace point */
tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
-   if (WARN_ON_ONCE(tk == NULL)) {
-   pr_warn("error on getting new probe.\n");
+   if (WARN_ONCE(tk == NULL, "error on probing function entry.")) {
warn++;
} else {
file = find_trace_probe_file(tk, top_trace_array());
-   if (WARN_ON_ONCE(file == NULL)) {
-   pr_warn("error on getting probe file.\n");
+   if (WARN_ONCE(file == NULL, "error on getting probe 
file.")) {
warn++;
} else
enable_trace_kprobe(
@@ -2044,19 +2041,16 @@ static __init int kprobe_trace_self_tests_init(void)
}
 
ret = create_or_delete_trace_kprobe("r:testprobe2 
kprobe_trace_selftest_target $retval");
-   if (WARN_ON_ONCE(ret)) {
-   pr_warn("error on probing function return.\n");
+   if (WARN_ONCE(ret, "error on probing function return.")) {
warn++;
} else {
/* Enable trace point */
tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
-   if (WARN_ON_ONCE(tk == NULL)) {
-   pr_warn("error on getting 2nd new probe.\n");
+   if (WARN_ONCE(tk == NULL, "error on getting 2nd new probe.")) {
warn++;
} else {
file = find_trace_probe_file(tk, top_trace_array());
-   if (WARN_ON_ONCE(file == NULL)) {
-   pr_warn("error on getting probe file.\n");
+   if (WARN_ONCE(file == NULL, "error on getting probe 
file.")) {
warn++;
} else
enable_trace_kprobe(
@@ -2079,18 +2073,15 @@ static __init int kprobe_trace_self_tests_init(void)
 
/* Disable trace points before removing it */
tk = find_trace_kprobe("testprobe", KPROBE_EVENT_SYSTEM);
-   if (WARN_ON_ONCE(tk == NULL)) {
-   pr_warn("error on getting test probe.\n");
+   if (WARN_ONCE(tk == NULL, "error on getting test probe.")) {
warn++;
} else {
-   if (trace_kprobe_nhit(tk) != 1) {
-   pr_warn("incorrect number of testprobe hits\n");
+   if (WARN_ONCE(trace_kprobe_nhit(tk) != 1,
+"incorrect number of testprobe hits."))
warn++;
-   }
 
file = find_trace_probe_file(tk, top_trace_array());
-   if (WARN_ON_ONCE(file == NULL)) {
-   pr_warn("error on getting probe file.\n");
+   if (WARN_ONCE(file == NULL, "error on getting probe file.")) {
warn++;
} else
disable_trace_kprobe(
@@ -2098,18 +2089,15 @@ static __init int kprobe_trace_self_tests_init(void)
}
 
tk = find_trace_kprobe("testprobe2", KPROBE_EVENT_SYSTEM);
-   if (WARN_ON_ONCE(tk == NULL)) {
-   pr_warn("error on getting 2nd test probe.\n");
+   if (WARN_ONCE(tk == NULL, "error on getting 2nd test probe.")) {
warn++;
} else {
-   if (trace_kprobe_nhit(tk) != 1) {
-   pr_warn("incorrect number of testprobe2 hits\n");
+   if (WARN_ONCE(trace_kprobe_nhit(tk) != 1,
+"incorrect number of testprobe2 hits."))
warn++;
-   }
 
file = find_trace_probe_file(tk, top_trace_array());
-   if 

[PATCH v3 1/3] tracing: Build event generation tests only as modules

2024-06-11 Thread Masami Hiramatsu (Google)
From: Masami Hiramatsu (Google) 

The kprobes and synth event generation test modules add events and lock
(get a reference on) those event files in the module init function,
and unlock and delete them in the module exit function. This is because
they are designed to be used as modules.

If we build those modules in, those events are left locked in the
kernel and can never be removed. This causes the kprobe event self-test
to fail, as below.

[   97.349708] [ cut here ]
[   97.353453] WARNING: CPU: 3 PID: 1 at kernel/trace/trace_kprobe.c:2133 
kprobe_trace_self_tests_init+0x3f1/0x480
[   97.357106] Modules linked in:
[   97.358488] CPU: 3 PID: 1 Comm: swapper/0 Not tainted 
6.9.0-g699646734ab5-dirty #14
[   97.361556] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.15.0-1 04/01/2014
[   97.363880] RIP: 0010:kprobe_trace_self_tests_init+0x3f1/0x480
[   97.365538] Code: a8 24 08 82 e9 ae fd ff ff 90 0f 0b 90 48 c7 c7 e5 aa 0b 
82 e9 ee fc ff ff 90 0f 0b 90 48 c7 c7 2d 61 06 82 e9 8e fd ff ff 90 <0f> 0b 90 
48 c7 c7 33 0b 0c 82 89 c6 e8 6e 03 1f ff 41 ff c7 e9 90
[   97.370429] RSP: :c9013b50 EFLAGS: 00010286
[   97.371852] RAX: fff0 RBX: 888005919c00 RCX: 
[   97.373829] RDX: 888003f4 RSI: 8236a598 RDI: 888003f40a68
[   97.375715] RBP:  R08: 0001 R09: 
[   97.377675] R10: 811c9ae5 R11: 8120c4e0 R12: 
[   97.379591] R13: 0001 R14: 0015 R15: 
[   97.381536] FS:  () GS:88807dcc() 
knlGS:
[   97.383813] CS:  0010 DS:  ES:  CR0: 80050033
[   97.385449] CR2:  CR3: 02244000 CR4: 06b0
[   97.387347] DR0:  DR1:  DR2: 
[   97.389277] DR3:  DR6: fffe0ff0 DR7: 0400
[   97.391196] Call Trace:
[   97.391967]  
[   97.392647]  ? __warn+0xcc/0x180
[   97.393640]  ? kprobe_trace_self_tests_init+0x3f1/0x480
[   97.395181]  ? report_bug+0xbd/0x150
[   97.396234]  ? handle_bug+0x3e/0x60
[   97.397311]  ? exc_invalid_op+0x1a/0x50
[   97.398434]  ? asm_exc_invalid_op+0x1a/0x20
[   97.399652]  ? trace_kprobe_is_busy+0x20/0x20
[   97.400904]  ? tracing_reset_all_online_cpus+0x15/0x90
[   97.402304]  ? kprobe_trace_self_tests_init+0x3f1/0x480
[   97.403773]  ? init_kprobe_trace+0x50/0x50
[   97.404972]  do_one_initcall+0x112/0x240
[   97.406113]  do_initcall_level+0x95/0xb0
[   97.407286]  ? kernel_init+0x1a/0x1a0
[   97.408401]  do_initcalls+0x3f/0x70
[   97.409452]  kernel_init_freeable+0x16f/0x1e0
[   97.410662]  ? rest_init+0x1f0/0x1f0
[   97.411738]  kernel_init+0x1a/0x1a0
[   97.412788]  ret_from_fork+0x39/0x50
[   97.413817]  ? rest_init+0x1f0/0x1f0
[   97.414844]  ret_from_fork_asm+0x11/0x20
[   97.416285]  
[   97.417134] irq event stamp: 13437323
[   97.418376] hardirqs last  enabled at (13437337): [] 
console_unlock+0x11c/0x150
[   97.421285] hardirqs last disabled at (13437370): [] 
console_unlock+0x101/0x150
[   97.423838] softirqs last  enabled at (13437366): [] 
handle_softirqs+0x23f/0x2a0
[   97.426450] softirqs last disabled at (13437393): [] 
__irq_exit_rcu+0x66/0xd0
[   97.428850] ---[ end trace  ]---

Also, since we cannot clean up the dynamic_event file, ftracetest
fails too.

To avoid these issues, build these tests only as modules.

Fixes: 9fe41efaca08 ("tracing: Add synth event generation test module")
Fixes: 64836248dda2 ("tracing: Add kprobe event command generation test module")
Signed-off-by: Masami Hiramatsu (Google) 
Reviewed-by: Steven Rostedt (Google) 
---
 kernel/trace/Kconfig |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 166ad5444eea..721c3b221048 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1136,7 +1136,7 @@ config PREEMPTIRQ_DELAY_TEST
 
 config SYNTH_EVENT_GEN_TEST
tristate "Test module for in-kernel synthetic event generation"
-   depends on SYNTH_EVENTS
+   depends on SYNTH_EVENTS && m
help
   This option creates a test module to check the base
   functionality of in-kernel synthetic event definition and
@@ -1149,7 +1149,7 @@ config SYNTH_EVENT_GEN_TEST
 
 config KPROBE_EVENT_GEN_TEST
tristate "Test module for in-kernel kprobe event generation"
-   depends on KPROBE_EVENTS
+   depends on KPROBE_EVENTS && m
help
   This option creates a test module to check the base
   functionality of in-kernel kprobe event definition.




[PATCH v3 0/3] tracing: Fix some selftest issues

2024-06-11 Thread Masami Hiramatsu (Google)
Hi,

Here is v3 of a series of some fixes/cleanups for the test modules and
boot time selftest of kprobe events. The previous version is here;

https://lore.kernel.org/all/171805478534.52471.6269290579314514778.stgit@devnote2/

In this version, I updated the 2nd patch to integrate WARN_ON_ONCE() and
pr_warn() instead of removing WARN_ONCE(), because these warning messages
are needed by ktest to handle errors.

Thank you,

---

Masami Hiramatsu (Google) (3):
  tracing: Build event generation tests only as modules
  tracing/kprobe: Integrate test warnings into WARN_ONCE
  tracing/kprobe: Remove cleanup code unrelated to selftest


 kernel/trace/Kconfig|4 ++-
 kernel/trace/trace_kprobe.c |   54 ++-
 2 files changed, 19 insertions(+), 39 deletions(-)

--
Masami Hiramatsu (Google) 



Re: [PATCH v14 14/14] selftests/sgx: Add scripts for EPC cgroup testing

2024-06-11 Thread Haitao Huang

On Mon, 10 Jun 2024 17:39:53 -0500, Huang, Kai  wrote:




--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -1045,7 +1045,7 @@ static int __init sgx_init(void)
   if (!sgx_page_cache_init())
   return -ENOMEM;
 -if (!sgx_page_reclaimer_init()) {
+if (!sgx_page_reclaimer_init() || !sgx_cgroup_init()) {
   ret = -ENOMEM;
   goto err_page_cache;
   }


Does it make more sense to move the sgx_cgroup_init() to the  
sgx_drv_init()?  The SGX cgroup only works for the driver side anyway.  
In this case, if something went wrong in sgx_cgroup_init(), the  
sgx_vepc_init() could still have a chance to work.




vepc reclamation is not done by cgroup/ksgxd, but try_charge() won't work
if the user expects the cgroup to limit vepc allocation. Would it be more
consistent to just disable vepc, i.e., on systems with MISC, sgx/vepc
always goes with cgroup enabled?


And IIUC we need to reset the "capacity" to 0 if sgx_cgroup_init()  
fails, no matter it is called inside sgx_drv_init() or sgx_init(),  
otherwise the "epc" would appear in the cgroup hierarchy as a misc  
cgroup resource.


Another option is to defer setting the capacity to the point where we  
have made sure sgx_drv_init() and sgx_cgroup_init() cannot fail.




Yes, agreed, we need to do this.
Btw, I plan to review the rest late this week or next week
because this week I have some other stuff that needs to be finished first.




Sure. Thanks
Haitao
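
For illustration, one way the capacity reset discussed above could look
in sgx_init(); the unwind label is hypothetical and this assumes the
misc-cgroup plumbing added earlier in this series:

	if (!sgx_cgroup_init()) {
		/* Hide the "epc" misc resource again on failure. */
		misc_cg_set_capacity(MISC_CG_RES_SGX_EPC, 0);
		ret = -ENOMEM;
		goto err_unwind;	/* hypothetical label */
	}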



Re: [PATCH v5 2/2] misc: fastrpc: use coherent pool for untranslated Compute Banks

2024-06-11 Thread Ekansh Gupta



On 5/24/2024 9:44 PM, Dylan Van Assche wrote:
> Use fastrpc_remote_heap_alloc to allocate from the FastRPC device
> instead of the Compute Bank when the session ID is 0. This ensures
> that the allocation is inside the coherent DMA pool which is already
> accessible to the DSP. This is necessary to support FastRPC devices
> which do not have dedicated Compute Banks such as the SLPI on the SDM845.
> The latter uses an allocated CMA region instead of FastRPC Compute Banks.
>
> Signed-off-by: Dylan Van Assche 
> Reviewed-by: Caleb Connolly 
> ---
Reviewed-by: Ekansh Gupta 
>  drivers/misc/fastrpc.c | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c
> index c06667b29055..f53d20e2e07e 100644
> --- a/drivers/misc/fastrpc.c
> +++ b/drivers/misc/fastrpc.c
> @@ -953,7 +953,10 @@ static int fastrpc_get_args(u32 kernel, struct 
> fastrpc_invoke_ctx *ctx)
>  
>   ctx->msg_sz = pkt_size;
>  
> - err = fastrpc_buf_alloc(ctx->fl, dev, pkt_size, >buf);
> + if (ctx->fl->sctx->sid)
> + err = fastrpc_buf_alloc(ctx->fl, dev, pkt_size, >buf);
> + else
> + err = fastrpc_remote_heap_alloc(ctx->fl, dev, pkt_size, 
> >buf);
>   if (err)
>   return err;
>  




Re: [PATCH v5 1/2] misc: fastrpc: support complete DMA pool access to the DSP

2024-06-11 Thread Ekansh Gupta



On 5/24/2024 9:44 PM, Dylan Van Assche wrote:
> To support FastRPC Context Banks which aren't mapped via the SMMU,
> make the whole reserved memory region available to the DSP to allow
> access to coherent buffers.
>
> This is performed by assigning the memory to the DSP via a hypervisor
> call to set the correct permissions for the Virtual Machines on the DSP.
> This is only necessary when a memory region is provided for SLPI DSPs
> so guard this with a domain ID check.
>
> Signed-off-by: Dylan Van Assche 
> Reviewed-by: Caleb Connolly 
> ---
Reviewed-by: Ekansh Gupta 
>  drivers/misc/fastrpc.c | 19 +++
>  1 file changed, 19 insertions(+)
>
> diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c
> index 4c67e2c5a82e..c06667b29055 100644
> --- a/drivers/misc/fastrpc.c
> +++ b/drivers/misc/fastrpc.c
> @@ -2255,6 +2255,8 @@ static int fastrpc_rpmsg_probe(struct rpmsg_device 
> *rpdev)
>   int i, err, domain_id = -1, vmcount;
>   const char *domain;
>   bool secure_dsp;
> + struct device_node *rmem_node;
> + struct reserved_mem *rmem;
>   unsigned int vmids[FASTRPC_MAX_VMIDS];
>  
>   err = of_property_read_string(rdev->of_node, "label", );
> @@ -2297,6 +2299,23 @@ static int fastrpc_rpmsg_probe(struct rpmsg_device 
> *rpdev)
>   }
>   }
>  
> + rmem_node = of_parse_phandle(rdev->of_node, "memory-region", 0);
> + if (domain_id == SDSP_DOMAIN_ID && rmem_node) {
> + u64 src_perms;
> +
> + rmem = of_reserved_mem_lookup(rmem_node);
> + if (!rmem) {
> + err = -EINVAL;
> + goto fdev_error;
> + }
> +
> + src_perms = BIT(QCOM_SCM_VMID_HLOS);
> +
> + qcom_scm_assign_mem(rmem->base, rmem->size, _perms,
> + data->vmperms, data->vmcount);
> +
> + }
> +
>   secure_dsp = !(of_property_read_bool(rdev->of_node, 
> "qcom,non-secure-domain"));
>   data->secure = secure_dsp;
>  




[PATCH V2 2/2] soc: qcom: smp2p: Introduce tracepoint support

2024-06-11 Thread Sudeepgoud Patil
This commit introduces tracepoint support for smp2p,
enabling logging of communication between local and remote processors.
The tracepoints include information about the remote processor ID,
remote subsystem name, negotiation details, supported features,
bit change notifications, and SSR activity.
These tracepoints are valuable for debugging issues between subsystems.

Signed-off-by: Sudeepgoud Patil 
---
 drivers/soc/qcom/Makefile  |   1 +
 drivers/soc/qcom/smp2p.c   |  12 
 drivers/soc/qcom/trace-smp2p.h | 116 +
 3 files changed, 129 insertions(+)
 create mode 100644 drivers/soc/qcom/trace-smp2p.h

diff --git a/drivers/soc/qcom/Makefile b/drivers/soc/qcom/Makefile
index ca0bece0dfff..30c1bf645501 100644
--- a/drivers/soc/qcom/Makefile
+++ b/drivers/soc/qcom/Makefile
@@ -23,6 +23,7 @@ qcom_rpmh-y   += rpmh.o
 obj-$(CONFIG_QCOM_SMD_RPM) += rpm-proc.o smd-rpm.o
 obj-$(CONFIG_QCOM_SMEM) += smem.o
 obj-$(CONFIG_QCOM_SMEM_STATE) += smem_state.o
+CFLAGS_smp2p.o := -I$(src)
 obj-$(CONFIG_QCOM_SMP2P)   += smp2p.o
 obj-$(CONFIG_QCOM_SMSM)+= smsm.o
 obj-$(CONFIG_QCOM_SOCINFO) += socinfo.o
diff --git a/drivers/soc/qcom/smp2p.c b/drivers/soc/qcom/smp2p.c
index a77fee048b38..6eab8ff55691 100644
--- a/drivers/soc/qcom/smp2p.c
+++ b/drivers/soc/qcom/smp2p.c
@@ -20,6 +20,9 @@
 #include 
 #include 
 
+#define CREATE_TRACE_POINTS
+#include "trace-smp2p.h"
+
 /*
  * The Shared Memory Point to Point (SMP2P) protocol facilitates communication
  * of a single 32-bit value between two processors.  Each value has a single
@@ -193,6 +196,7 @@ static void qcom_smp2p_do_ssr_ack(struct qcom_smp2p *smp2p)
struct smp2p_smem_item *out = smp2p->out;
u32 val;
 
+   trace_smp2p_ssr_ack(smp2p->remote_pid, smp2p->irq_devname);
smp2p->ssr_ack = !smp2p->ssr_ack;
 
val = out->flags & ~BIT(SMP2P_FLAGS_RESTART_ACK_BIT);
@@ -215,6 +219,8 @@ static void qcom_smp2p_negotiate(struct qcom_smp2p *smp2p)
smp2p->ssr_ack_enabled = true;
 
smp2p->negotiation_done = true;
+   trace_smp2p_negotiate(smp2p->remote_pid, smp2p->irq_devname,
+   out->features);
}
 }
 
@@ -253,6 +259,9 @@ static void qcom_smp2p_notify_in(struct qcom_smp2p *smp2p)
status = val ^ entry->last_value;
entry->last_value = val;
 
+   trace_smp2p_notify_in(smp2p->remote_pid, smp2p->irq_devname,
+   entry->name, status, val);
+
/* No changes of this entry? */
if (!status)
continue;
@@ -408,6 +417,9 @@ static int smp2p_update_bits(void *data, u32 mask, u32 
value)
writel(val, entry->value);
spin_unlock_irqrestore(>lock, flags);
 
+   trace_smp2p_update_bits(entry->smp2p->remote_pid, 
entry->smp2p->irq_devname,
+   entry->name, orig, val);
+
if (val != orig)
qcom_smp2p_kick(entry->smp2p);
 
diff --git a/drivers/soc/qcom/trace-smp2p.h b/drivers/soc/qcom/trace-smp2p.h
new file mode 100644
index ..833782460b57
--- /dev/null
+++ b/drivers/soc/qcom/trace-smp2p.h
@@ -0,0 +1,116 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM qcom_smp2p
+
+#if !defined(__QCOM_SMP2P_TRACE_H__) || defined(TRACE_HEADER_MULTI_READ)
+#define __QCOM_SMP2P_TRACE_H__
+
+#include 
+
+#define SMP2P_FEATURE_SSR_ACK 0x1
+
+TRACE_EVENT(smp2p_ssr_ack,
+   TP_PROTO(unsigned int remote_pid, char *irq_devname),
+   TP_ARGS(remote_pid, irq_devname),
+   TP_STRUCT__entry(
+   __field(u32, remote_pid)
+   __string(irq_devname, irq_devname)
+   ),
+   TP_fast_assign(
+   __entry->remote_pid = remote_pid;
+   __assign_str(irq_devname, irq_devname);
+   ),
+   TP_printk("%d: %s: SSR detected, doing SSR Handshake",
+   __entry->remote_pid,
+   __get_str(irq_devname)
+   )
+);
+
+TRACE_EVENT(smp2p_negotiate,
+   TP_PROTO(unsigned int remote_pid, char *irq_devname, unsigned int 
features),
+   TP_ARGS(remote_pid, irq_devname, features),
+   TP_STRUCT__entry(
+   __field(u32, remote_pid)
+   __string(irq_devname, irq_devname)
+   __field(u32, out_features)
+   ),
+   TP_fast_assign(
+   __entry->remote_pid = remote_pid;
+   __assign_str(irq_devname, irq_devname);
+   __entry->out_features = features;
+   ),
+   TP_printk("%d: %s: state=open out_features=%s",
+   __entry->remote_pid,
+   __get_str(irq_devname),
+   __print_flags(__entry->out_features, "|",
+   {SMP2P_FEATURE_SSR_ACK, "SMP2P_FEATURE_SSR_ACK"})
+   )
+);
+
+TRACE_EVENT(smp2p_notify_in,
+   

[PATCH V2 1/2] soc: qcom: smp2p: Add remote name into smp2p irq devname

2024-06-11 Thread Sudeepgoud Patil
Add an smp2p irq devname which fetches the remote name from the
respective smp2p dtsi node; this makes the wakeup source
distinguishable in irq wakeup prints.

Signed-off-by: Sudeepgoud Patil 
---
 drivers/soc/qcom/smp2p.c | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/soc/qcom/smp2p.c b/drivers/soc/qcom/smp2p.c
index a21241cbeec7..a77fee048b38 100644
--- a/drivers/soc/qcom/smp2p.c
+++ b/drivers/soc/qcom/smp2p.c
@@ -122,6 +122,7 @@ struct smp2p_entry {
  * @ssr_ack_enabled: SMP2P_FEATURE_SSR_ACK feature is supported and was enabled
  * @ssr_ack: current cached state of the local ack bit
  * @negotiation_done: whether negotiating finished
 + * @irq_devname: pointer to the smp2p irq devname
  * @local_pid: processor id of the inbound edge
  * @remote_pid:processor id of the outbound edge
  * @ipc_regmap:regmap for the outbound ipc
@@ -146,6 +147,7 @@ struct qcom_smp2p {
bool ssr_ack;
bool negotiation_done;
 
+   char *irq_devname;
unsigned local_pid;
unsigned remote_pid;
 
@@ -614,10 +616,16 @@ static int qcom_smp2p_probe(struct platform_device *pdev)
/* Kick the outgoing edge after allocating entries */
qcom_smp2p_kick(smp2p);
 
+   smp2p->irq_devname = kasprintf(GFP_KERNEL, "%s", 
pdev->dev.of_node->name);
+   if (!smp2p->irq_devname) {
+   ret = -ENOMEM;
+   goto unwind_interfaces;
+   }
+
ret = devm_request_threaded_irq(>dev, irq,
NULL, qcom_smp2p_intr,
IRQF_ONESHOT,
-   "smp2p", (void *)smp2p);
+   smp2p->irq_devname, (void *)smp2p);
if (ret) {
dev_err(>dev, "failed to request interrupt\n");
goto unwind_interfaces;
@@ -650,6 +658,8 @@ static int qcom_smp2p_probe(struct platform_device *pdev)
list_for_each_entry(entry, >outbound, node)
qcom_smem_state_unregister(entry->state);
 
+   kfree(smp2p->irq_devname);
+
smp2p->out->valid_entries = 0;
 
 release_mbox:
@@ -677,6 +687,8 @@ static void qcom_smp2p_remove(struct platform_device *pdev)
 
mbox_free_channel(smp2p->mbox_chan);
 
+   kfree(smp2p->irq_devname);
+
smp2p->out->valid_entries = 0;
 }
 
-- 




[PATCH V2 0/2] Add tracepoint support and remote name mapping to smp2p

2024-06-11 Thread Sudeepgoud Patil
This commit introduces tracepoint support to the smp2p module, enabling
logging of communication events between local and remote processors. The
tracepoints capture essential details such as remote processor ID,
subsystem names, negotiation specifics, supported features, bit changes,
and subsystem restart (SSR) activity.
These tracepoints enhance debugging capabilities for inter-subsystem issues.

To address feedback from v1 asking to map the remote PID (processor ID)
to the remote name in tracepoints, V2 adds a new patch (1/2) that
includes the remote name in the smp2p irq devname, fetched from the
respective smp2p dtsi node. This also makes the wakeup source
distinguishable in irq wakeup prints.

Changes in v2:
- Added support to include the remote name in the smp2p IRQ devname, allowing 
for remote PID-name mapping
- Mapped the remote PID (Process ID) along with the remote name in tracepoints, 
as suggested by Chris
- Modified to capture all `out->features` instead of just the `ssr_ack`, 
following Chris's recommendation
- Expanded the commit description to provide additional context
- Link to v1: 
https://lore.kernel.org/all/20240429075528.1723133-1-quic_sudee...@quicinc.com

Sudeepgoud Patil (2):
  soc: qcom: smp2p: Add remote name into smp2p irq devname
  soc: qcom: smp2p: Introduce tracepoint support

 drivers/soc/qcom/Makefile  |   1 +
 drivers/soc/qcom/smp2p.c   |  26 +++-
 drivers/soc/qcom/trace-smp2p.h | 116 +
 3 files changed, 142 insertions(+), 1 deletion(-)
 create mode 100644 drivers/soc/qcom/trace-smp2p.h

-- 




[PATCHv8 9/9] man2: Add uretprobe syscall page

2024-06-11 Thread Jiri Olsa
Adding man page for new uretprobe syscall.

Acked-by: Andrii Nakryiko 
Reviewed-by: Alejandro Colomar 
Signed-off-by: Jiri Olsa 
---
 man/man2/uretprobe.2 | 56 
 1 file changed, 56 insertions(+)
 create mode 100644 man/man2/uretprobe.2

diff --git a/man/man2/uretprobe.2 b/man/man2/uretprobe.2
new file mode 100644
index ..cf1c2b0d852e
--- /dev/null
+++ b/man/man2/uretprobe.2
@@ -0,0 +1,56 @@
+.\" Copyright (C) 2024, Jiri Olsa 
+.\"
+.\" SPDX-License-Identifier: Linux-man-pages-copyleft
+.\"
+.TH uretprobe 2 (date) "Linux man-pages (unreleased)"
+.SH NAME
+uretprobe \- execute pending return uprobes
+.SH SYNOPSIS
+.nf
+.B int uretprobe(void)
+.fi
+.SH DESCRIPTION
+The
+.BR uretprobe ()
+system call is an alternative to breakpoint instructions for triggering return
+uprobe consumers.
+.P
+Calls to
+.BR uretprobe ()
+system call are only made from the user-space trampoline provided by the 
kernel.
+Calls from any other place result in a
+.BR SIGILL .
+.SH RETURN VALUE
+The
+.BR uretprobe ()
+system call return value is architecture-specific.
+.SH ERRORS
+.TP
+.B SIGILL
+The
+.BR uretprobe ()
+system call was called by a user-space program.
+.SH VERSIONS
+Details of the
+.BR uretprobe ()
+system call behavior vary across systems.
+.SH STANDARDS
+None.
+.SH HISTORY
+TBD
+.SH NOTES
+The
+.BR uretprobe ()
+system call was initially introduced for the x86_64 architecture
+where it was shown to be faster than breakpoint traps.
+It might be extended to other architectures.
+.P
+The
+.BR uretprobe ()
+system call exists only to allow the invocation of return uprobe consumers.
+It should
+.B never
+be called directly.
+Details of the arguments (if any) passed to
+.BR uretprobe ()
+and the return value are architecture-specific.
-- 
2.45.1




[PATCHv8 bpf-next 8/9] selftests/bpf: Add uretprobe shadow stack test

2024-06-11 Thread Jiri Olsa
Adding uretprobe shadow stack test that runs all existing
uretprobe tests with shadow stack enabled if it's available.

Acked-by: Andrii Nakryiko 
Reviewed-by: Masami Hiramatsu (Google) 
Signed-off-by: Jiri Olsa 
---
 .../selftests/bpf/prog_tests/uprobe_syscall.c | 60 +++
 1 file changed, 60 insertions(+)

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c 
b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
index 11ccd693ef73..c8517c8f5313 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
@@ -9,6 +9,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include "uprobe_syscall.skel.h"
 #include "uprobe_syscall_executed.skel.h"
 
@@ -297,6 +300,56 @@ static void test_uretprobe_syscall_call(void)
close(go[1]);
close(go[0]);
 }
+
+/*
+ * Borrowed from tools/testing/selftests/x86/test_shadow_stack.c.
+ *
+ * For use in inline enablement of shadow stack.
+ *
+ * The program can't return from the point where shadow stack gets enabled
+ * because there will be no address on the shadow stack. So it can't use
+ * syscall() for enablement, since it is a function.
+ *
+ * Based on code from nolibc.h. Keep a copy here because this can't pull
+ * in all of nolibc.h.
+ */
+#define ARCH_PRCTL(arg1, arg2) \
+({ \
+   long _ret;  \
+   register long _num  asm("eax") = __NR_arch_prctl;   \
+   register long _arg1 asm("rdi") = (long)(arg1);  \
+   register long _arg2 asm("rsi") = (long)(arg2);  \
+   \
+   asm volatile (  \
+   "syscall\n" \
+   : "=a"(_ret)\
+   : "r"(_arg1), "r"(_arg2),   \
+ "0"(_num) \
+   : "rcx", "r11", "memory", "cc"  \
+   );  \
+   _ret;   \
+})
+
+#ifndef ARCH_SHSTK_ENABLE
+#define ARCH_SHSTK_ENABLE  0x5001
+#define ARCH_SHSTK_DISABLE 0x5002
+#define ARCH_SHSTK_SHSTK   (1ULL <<  0)
+#endif
+
+static void test_uretprobe_shadow_stack(void)
+{
+   if (ARCH_PRCTL(ARCH_SHSTK_ENABLE, ARCH_SHSTK_SHSTK)) {
+   test__skip();
+   return;
+   }
+
+   /* Run all of the uretprobe tests. */
+   test_uretprobe_regs_equal();
+   test_uretprobe_regs_change();
+   test_uretprobe_syscall_call();
+
+   ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK);
+}
 #else
 static void test_uretprobe_regs_equal(void)
 {
@@ -312,6 +365,11 @@ static void test_uretprobe_syscall_call(void)
 {
test__skip();
 }
+
+static void test_uretprobe_shadow_stack(void)
+{
+   test__skip();
+}
 #endif
 
 void test_uprobe_syscall(void)
@@ -322,4 +380,6 @@ void test_uprobe_syscall(void)
test_uretprobe_regs_change();
if (test__start_subtest("uretprobe_syscall_call"))
test_uretprobe_syscall_call();
+   if (test__start_subtest("uretprobe_shadow_stack"))
+   test_uretprobe_shadow_stack();
 }
-- 
2.45.1




[PATCHv8 bpf-next 7/9] selftests/bpf: Add uretprobe syscall call from user space test

2024-06-11 Thread Jiri Olsa
Adding a test to verify that when called from outside of the
trampoline provided by the kernel, the uretprobe syscall will cause the
calling process to receive a SIGILL signal and the attached bpf
program is not executed.

Acked-by: Andrii Nakryiko 
Reviewed-by: Masami Hiramatsu (Google) 
Signed-off-by: Jiri Olsa 
---
 .../selftests/bpf/prog_tests/uprobe_syscall.c | 95 +++
 .../bpf/progs/uprobe_syscall_executed.c   | 17 
 2 files changed, 112 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c

diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c 
b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
index 1a50cd35205d..11ccd693ef73 100644
--- a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
@@ -7,7 +7,10 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include "uprobe_syscall.skel.h"
+#include "uprobe_syscall_executed.skel.h"
 
 __naked unsigned long uretprobe_regs_trigger(void)
 {
@@ -209,6 +212,91 @@ static void test_uretprobe_regs_change(void)
}
 }
 
+#ifndef __NR_uretprobe
+#define __NR_uretprobe 463
+#endif
+
+__naked unsigned long uretprobe_syscall_call_1(void)
+{
+   /*
+* Pretend we are uretprobe trampoline to trigger the return
+* probe invocation in order to verify we get SIGILL.
+*/
+   asm volatile (
+   "pushq %rax\n"
+   "pushq %rcx\n"
+   "pushq %r11\n"
+   "movq $" __stringify(__NR_uretprobe) ", %rax\n"
+   "syscall\n"
+   "popq %r11\n"
+   "popq %rcx\n"
+   "retq\n"
+   );
+}
+
+__naked unsigned long uretprobe_syscall_call(void)
+{
+   asm volatile (
+   "call uretprobe_syscall_call_1\n"
+   "retq\n"
+   );
+}
+
+static void test_uretprobe_syscall_call(void)
+{
+   LIBBPF_OPTS(bpf_uprobe_multi_opts, opts,
+   .retprobe = true,
+   );
+   struct uprobe_syscall_executed *skel;
+   int pid, status, err, go[2], c;
+
+   if (ASSERT_OK(pipe(go), "pipe"))
+   return;
+
+   skel = uprobe_syscall_executed__open_and_load();
+   if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load"))
+   goto cleanup;
+
+   pid = fork();
+   if (!ASSERT_GE(pid, 0, "fork"))
+   goto cleanup;
+
+   /* child */
+   if (pid == 0) {
+   close(go[1]);
+
+   /* wait for parent's kick */
+   err = read(go[0], &c, 1);
+   if (err != 1)
+   exit(-1);
+
+   uretprobe_syscall_call();
+   _exit(0);
+   }
+
+   skel->links.test = bpf_program__attach_uprobe_multi(skel->progs.test, pid,
+   "/proc/self/exe",
+   "uretprobe_syscall_call", &opts);
+   if (!ASSERT_OK_PTR(skel->links.test, "bpf_program__attach_uprobe_multi"))
+   goto cleanup;
+
+   /* kick the child */
+   write(go[1], &c, 1);
+   err = waitpid(pid, &status, 0);
+   ASSERT_EQ(err, pid, "waitpid");
+
+   /* verify the child got killed with SIGILL */
+   ASSERT_EQ(WIFSIGNALED(status), 1, "WIFSIGNALED");
+   ASSERT_EQ(WTERMSIG(status), SIGILL, "WTERMSIG");
+
+   /* verify the uretprobe program wasn't called */
+   ASSERT_EQ(skel->bss->executed, 0, "executed");
+
+cleanup:
+   uprobe_syscall_executed__destroy(skel);
+   close(go[1]);
+   close(go[0]);
+}
 #else
 static void test_uretprobe_regs_equal(void)
 {
@@ -219,6 +307,11 @@ static void test_uretprobe_regs_change(void)
 {
test__skip();
 }
+
+static void test_uretprobe_syscall_call(void)
+{
+   test__skip();
+}
 #endif
 
 void test_uprobe_syscall(void)
@@ -227,4 +320,6 @@ void test_uprobe_syscall(void)
test_uretprobe_regs_equal();
if (test__start_subtest("uretprobe_regs_change"))
test_uretprobe_regs_change();
+   if (test__start_subtest("uretprobe_syscall_call"))
+   test_uretprobe_syscall_call();
 }
diff --git a/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c 
b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c
new file mode 100644
index ..0d7f1a7db2e2
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include 
+#include 
+
+struct pt_regs regs;
+
+char _license[] SEC("license") = "GPL";
+
+int executed = 0;
+
+SEC("uretprobe.multi")
+int test(struct pt_regs *regs)
+{
+   executed = 1;
+   return 0;
+}
-- 
2.45.1




[PATCHv8 bpf-next 6/9] selftests/bpf: Add uretprobe syscall test for regs changes

2024-06-11 Thread Jiri Olsa
Adding a test that creates a uprobe consumer on a uretprobe which changes some
of the registers. Making sure the changed registers are propagated to
user space when the uretprobe syscall trampoline is used on x86_64.

To be able to do this, adding support to bpf_testmod to create uprobe via
new attribute file:
  /sys/kernel/bpf_testmod_uprobe

This file expects a file offset and creates the related uprobe on the current
process exe file, removing the existing uprobe if the offset is 0. There can
be only a single uprobe at any time.

The uprobe has a specific consumer that changes the registers used in the
uretprobe syscall trampoline, which are later checked in the test.
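As a usage sketch (not part of the patch), a user-space helper could arm and disarm the testmod uprobe through the new attribute like this; the offset value here is illustrative, the real test derives it from /proc/self/maps:

/* Sketch: arm/disarm the bpf_testmod uprobe via the sysfs attribute.
 * Writing a non-zero file offset registers the uprobe on the current
 * process exe; writing 0 removes it. The offset below is illustrative.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int write_testmod_uprobe(const char *val)
{
	int fd = open("/sys/kernel/bpf_testmod_uprobe", O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, val, strlen(val));
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	if (write_testmod_uprobe("0x1234"))	/* arm at an illustrative offset */
		return 1;
	/* ... call the probed function here ... */
	return write_testmod_uprobe("0");	/* disarm */
}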

Acked-by: Andrii Nakryiko 
Reviewed-by: Masami Hiramatsu (Google) 
Signed-off-by: Jiri Olsa 
---
 .../selftests/bpf/bpf_testmod/bpf_testmod.c   | 123 +-
 .../selftests/bpf/prog_tests/uprobe_syscall.c |  67 ++
 2 files changed, 189 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c 
b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
index 0a09732cde4b..6cbbecb1e23c 100644
--- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "bpf_testmod.h"
 #include "bpf_testmod_kfunc.h"
 
@@ -358,6 +359,119 @@ static struct bin_attribute bin_attr_bpf_testmod_file 
__ro_after_init = {
.write = bpf_testmod_test_write,
 };
 
+/* bpf_testmod_uprobe sysfs attribute is so far enabled for x86_64 only,
+ * please see test_uretprobe_regs_change test
+ */
+#ifdef __x86_64__
+
+static int
+uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func,
+  struct pt_regs *regs)
+
+{
+   regs->ax  = 0x12345678deadbeef;
+   regs->cx  = 0x87654321feebdaed;
+   regs->r11 = (u64) -1;
+   return true;
+}
+
+struct testmod_uprobe {
+   struct path path;
+   loff_t offset;
+   struct uprobe_consumer consumer;
+};
+
+static DEFINE_MUTEX(testmod_uprobe_mutex);
+
+static struct testmod_uprobe uprobe = {
+   .consumer.ret_handler = uprobe_ret_handler,
+};
+
+static int testmod_register_uprobe(loff_t offset)
+{
+   int err = -EBUSY;
+
+   if (uprobe.offset)
+   return -EBUSY;
+
+   mutex_lock(&testmod_uprobe_mutex);
+
+   if (uprobe.offset)
+   goto out;
+
+   err = kern_path("/proc/self/exe", LOOKUP_FOLLOW, &uprobe.path);
+   if (err)
+   goto out;
+
+   err = uprobe_register_refctr(d_real_inode(uprobe.path.dentry),
+offset, 0, &uprobe.consumer);
+   if (err)
+   path_put(&uprobe.path);
+   else
+   uprobe.offset = offset;
+
+out:
+   mutex_unlock(&testmod_uprobe_mutex);
+   return err;
+}
+
+static void testmod_unregister_uprobe(void)
+{
+   mutex_lock(&testmod_uprobe_mutex);
+
+   if (uprobe.offset) {
+   uprobe_unregister(d_real_inode(uprobe.path.dentry),
+ uprobe.offset, &uprobe.consumer);
+   uprobe.offset = 0;
+   }
+
+   mutex_unlock(&testmod_uprobe_mutex);
+}
+
+static ssize_t
+bpf_testmod_uprobe_write(struct file *file, struct kobject *kobj,
+struct bin_attribute *bin_attr,
+char *buf, loff_t off, size_t len)
+{
+   unsigned long offset = 0;
+   int err = 0;
+
+   if (kstrtoul(buf, 0, &offset))
+   return -EINVAL;
+
+   if (offset)
+   err = testmod_register_uprobe(offset);
+   else
+   testmod_unregister_uprobe();
+
+   return err ?: strlen(buf);
+}
+
+static struct bin_attribute bin_attr_bpf_testmod_uprobe_file __ro_after_init = {
+   .attr = { .name = "bpf_testmod_uprobe", .mode = 0666, },
+   .write = bpf_testmod_uprobe_write,
+};
+
+static int register_bpf_testmod_uprobe(void)
+{
+   return sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_uprobe_file);
+}
+
+static void unregister_bpf_testmod_uprobe(void)
+{
+   testmod_unregister_uprobe();
+   sysfs_remove_bin_file(kernel_kobj, &bin_attr_bpf_testmod_uprobe_file);
+}
+
+#else
+static int register_bpf_testmod_uprobe(void)
+{
+   return 0;
+}
+
+static void unregister_bpf_testmod_uprobe(void) { }
+#endif
+
 BTF_KFUNCS_START(bpf_testmod_common_kfunc_ids)
 BTF_ID_FLAGS(func, bpf_iter_testmod_seq_new, KF_ITER_NEW)
 BTF_ID_FLAGS(func, bpf_iter_testmod_seq_next, KF_ITER_NEXT | KF_RET_NULL)
@@ -912,7 +1026,13 @@ static int bpf_testmod_init(void)
return -EINVAL;
sock = NULL;
	mutex_init(&sock_lock);
-   return sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file);
+   ret = sysfs_create_bin_file(kernel_kobj, &bin_attr_bpf_testmod_file);
+   if (ret < 0)
+   return ret;
+   ret = register_bpf_testmod_uprobe();
+   if (ret < 0)
+   return ret;
+   return 0;
 }
 
 static void bpf_testmod_exit(void)
@@ -927,6 +1047,7 @@ static void bpf_testmod_exit(void)
 
   

[PATCHv8 bpf-next 5/9] selftests/bpf: Add uretprobe syscall test for regs integrity

2024-06-11 Thread Jiri Olsa
Add uretprobe syscall test that compares register values before
and after the uretprobe is hit. It also compares the register
values seen from the attached bpf program.

Acked-by: Andrii Nakryiko 
Reviewed-by: Masami Hiramatsu (Google) 
Signed-off-by: Jiri Olsa 
---
 tools/include/linux/compiler.h|   4 +
 .../selftests/bpf/prog_tests/uprobe_syscall.c | 163 ++
 .../selftests/bpf/progs/uprobe_syscall.c  |  15 ++
 3 files changed, 182 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
 create mode 100644 tools/testing/selftests/bpf/progs/uprobe_syscall.c

diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h
index 8a63a9913495..6f7f22ac9da5 100644
--- a/tools/include/linux/compiler.h
+++ b/tools/include/linux/compiler.h
@@ -62,6 +62,10 @@
 #define __nocf_check __attribute__((nocf_check))
 #endif
 
+#ifndef __naked
+#define __naked __attribute__((__naked__))
+#endif
+
 /* Are two types/vars the same type (ignoring qualifiers)? */
 #ifndef __same_type
 # define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b))
diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c 
b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
new file mode 100644
index ..311ac19d8992
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include 
+
+#ifdef __x86_64__
+
+#include 
+#include 
+#include 
+#include "uprobe_syscall.skel.h"
+
+__naked unsigned long uretprobe_regs_trigger(void)
+{
+   asm volatile (
+   "movq $0xdeadbeef, %rax\n"
+   "ret\n"
+   );
+}
+
+__naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after)
+{
+   asm volatile (
+   "movq %r15,   0(%rdi)\n"
+   "movq %r14,   8(%rdi)\n"
+   "movq %r13,  16(%rdi)\n"
+   "movq %r12,  24(%rdi)\n"
+   "movq %rbp,  32(%rdi)\n"
+   "movq %rbx,  40(%rdi)\n"
+   "movq %r11,  48(%rdi)\n"
+   "movq %r10,  56(%rdi)\n"
+   "movq  %r9,  64(%rdi)\n"
+   "movq  %r8,  72(%rdi)\n"
+   "movq %rax,  80(%rdi)\n"
+   "movq %rcx,  88(%rdi)\n"
+   "movq %rdx,  96(%rdi)\n"
+   "movq %rsi, 104(%rdi)\n"
+   "movq %rdi, 112(%rdi)\n"
+   "movq   $0, 120(%rdi)\n" /* orig_rax */
+   "movq   $0, 128(%rdi)\n" /* rip  */
+   "movq   $0, 136(%rdi)\n" /* cs   */
+   "pushf\n"
+   "pop %rax\n"
+   "movq %rax, 144(%rdi)\n" /* eflags   */
+   "movq %rsp, 152(%rdi)\n" /* rsp  */
+   "movq   $0, 160(%rdi)\n" /* ss   */
+
+   /* save 2nd argument */
+   "pushq %rsi\n"
+   "call uretprobe_regs_trigger\n"
+
+   /* save  return value and load 2nd argument pointer to rax */
+   "pushq %rax\n"
+   "movq 8(%rsp), %rax\n"
+
+   "movq %r15,   0(%rax)\n"
+   "movq %r14,   8(%rax)\n"
+   "movq %r13,  16(%rax)\n"
+   "movq %r12,  24(%rax)\n"
+   "movq %rbp,  32(%rax)\n"
+   "movq %rbx,  40(%rax)\n"
+   "movq %r11,  48(%rax)\n"
+   "movq %r10,  56(%rax)\n"
+   "movq  %r9,  64(%rax)\n"
+   "movq  %r8,  72(%rax)\n"
+   "movq %rcx,  88(%rax)\n"
+   "movq %rdx,  96(%rax)\n"
+   "movq %rsi, 104(%rax)\n"
+   "movq %rdi, 112(%rax)\n"
+   "movq   $0, 120(%rax)\n" /* orig_rax */
+   "movq   $0, 128(%rax)\n" /* rip  */
+   "movq   $0, 136(%rax)\n" /* cs   */
+
+   /* restore return value and 2nd argument */
+   "pop %rax\n"
+   "pop %rsi\n"
+
+   "movq %rax,  80(%rsi)\n"
+
+   "pushf\n"
+   "pop %rax\n"
+
+   "movq %rax, 144(%rsi)\n" /* eflags   */
+   "movq %rsp, 152(%rsi)\n" /* rsp  */
+   "movq   $0, 160(%rsi)\n" /* ss   */
+   "ret\n"
+);
+}
+
+static void test_uretprobe_regs_equal(void)
+{
+   struct uprobe_syscall *skel = NULL;
+   struct pt_regs before = {}, after = {};
+   unsigned long *pb = (unsigned long *) &before;
+   unsigned long *pa = (unsigned long *) &after;
+   unsigned long *pp;
+   unsigned int i, cnt;
+   int err;
+
+   skel = uprobe_syscall__open_and_load();
+   if (!ASSERT_OK_PTR(skel, "uprobe_syscall__open_and_load"))
+   goto cleanup;
+
+   err = uprobe_syscall__attach(skel);
+   if (!ASSERT_OK(err, "uprobe_syscall__attach"))
+   goto cleanup;
+
+   uretprobe_regs(&before, &after);
+
+   pp = (unsigned long *) &skel->bss->regs;
+   cnt = sizeof(before)/sizeof(*pb);
+
+   for (i = 0; i < cnt; i++) {

[PATCHv8 bpf-next 4/9] selftests/x86: Add return uprobe shadow stack test

2024-06-11 Thread Jiri Olsa
Adding a return uprobe test for shadow stack and making sure it's
working properly. Borrowed some of the code from bpf selftests.

Acked-by: Andrii Nakryiko 
Signed-off-by: Jiri Olsa 
---
 .../testing/selftests/x86/test_shadow_stack.c | 145 ++
 1 file changed, 145 insertions(+)

diff --git a/tools/testing/selftests/x86/test_shadow_stack.c 
b/tools/testing/selftests/x86/test_shadow_stack.c
index ee909a7927f9..21af54d5f4ea 100644
--- a/tools/testing/selftests/x86/test_shadow_stack.c
+++ b/tools/testing/selftests/x86/test_shadow_stack.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Define the ABI defines if needed, so people can run the tests
@@ -734,6 +735,144 @@ int test_32bit(void)
return !segv_triggered;
 }
 
+static int parse_uint_from_file(const char *file, const char *fmt)
+{
+   int err, ret;
+   FILE *f;
+
+   f = fopen(file, "re");
+   if (!f) {
+   err = -errno;
+   printf("failed to open '%s': %d\n", file, err);
+   return err;
+   }
+   err = fscanf(f, fmt, &ret);
+   if (err != 1) {
+   err = err == EOF ? -EIO : -errno;
+   printf("failed to parse '%s': %d\n", file, err);
+   fclose(f);
+   return err;
+   }
+   fclose(f);
+   return ret;
+}
+
+static int determine_uprobe_perf_type(void)
+{
+   const char *file = "/sys/bus/event_source/devices/uprobe/type";
+
+   return parse_uint_from_file(file, "%d\n");
+}
+
+static int determine_uprobe_retprobe_bit(void)
+{
+   const char *file = "/sys/bus/event_source/devices/uprobe/format/retprobe";
+
+   return parse_uint_from_file(file, "config:%d\n");
+}
+
+static ssize_t get_uprobe_offset(const void *addr)
+{
+   size_t start, end, base;
+   char buf[256];
+   bool found = false;
+   FILE *f;
+
+   f = fopen("/proc/self/maps", "r");
+   if (!f)
+   return -errno;
+
+   while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &base) == 4) {
+   if (buf[2] == 'x' && (uintptr_t)addr >= start && (uintptr_t)addr < end) {
+   found = true;
+   break;
+   }
+   }
+
+   fclose(f);
+
+   if (!found)
+   return -ESRCH;
+
+   return (uintptr_t)addr - start + base;
+}
+
+static __attribute__((noinline)) void uretprobe_trigger(void)
+{
+   asm volatile ("");
+}
+
+/*
+ * This test sets up a return uprobe, which is sensitive to shadow stack
+ * (it crashes without the extra fix). After executing the uretprobe we fail
+ * the test if we receive SIGSEGV; no crash means we're good.
+ *
+ * Helper functions above borrowed from bpf selftests.
+ */
+static int test_uretprobe(void)
+{
+   const size_t attr_sz = sizeof(struct perf_event_attr);
+   const char *file = "/proc/self/exe";
+   int bit, fd = 0, type, err = 1;
+   struct perf_event_attr attr;
+   struct sigaction sa = {};
+   ssize_t offset;
+
+   type = determine_uprobe_perf_type();
+   if (type < 0) {
+   if (type == -ENOENT)
+   printf("[SKIP]\tUretprobe test, uprobes are not 
available\n");
+   return 0;
+   }
+
+   offset = get_uprobe_offset(uretprobe_trigger);
+   if (offset < 0)
+   return 1;
+
+   bit = determine_uprobe_retprobe_bit();
+   if (bit < 0)
+   return 1;
+
+   sa.sa_sigaction = segv_gp_handler;
+   sa.sa_flags = SA_SIGINFO;
+   if (sigaction(SIGSEGV, &sa, NULL))
+   return 1;
+
+   /* Setup return uprobe through perf event interface. */
+   memset(&attr, 0, attr_sz);
+   attr.size = attr_sz;
+   attr.type = type;
+   attr.config = 1 << bit;
+   attr.config1 = (__u64) (unsigned long) file;
+   attr.config2 = offset;
+
+   fd = syscall(__NR_perf_event_open, &attr, 0 /* pid */, -1 /* cpu */,
+-1 /* group_fd */, PERF_FLAG_FD_CLOEXEC);
+   if (fd < 0)
+   goto out;
+
+   if (sigsetjmp(jmp_buffer, 1))
+   goto out;
+
+   ARCH_PRCTL(ARCH_SHSTK_ENABLE, ARCH_SHSTK_SHSTK);
+
+   /*
+* This either segfaults and goes through sigsetjmp above
+* or succeeds and we're good.
+*/
+   uretprobe_trigger();
+
+   printf("[OK]\tUretprobe test\n");
+   err = 0;
+
+out:
+   ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK);
+   signal(SIGSEGV, SIG_DFL);
+   if (fd)
+   close(fd);
+   return err;
+}
+
 void segv_handler_ptrace(int signum, siginfo_t *si, void *uc)
 {
/* The SSP adjustment caused a segfault. */
@@ -926,6 +1065,12 @@ int main(int argc, char *argv[])
goto out;
}
 
+   if (test_uretprobe()) {
+   ret = 1;
+   printf("[FAIL]\turetprobe test\n");
+   goto out;
+   }
+
return ret;
 
 out:
-- 
2.45.1




[PATCHv8 bpf-next 3/9] uprobe: Add uretprobe syscall to speed up return probe

2024-06-11 Thread Jiri Olsa
Adding uretprobe syscall instead of trap to speed up return probe.

At the moment the uretprobe setup/path is:

  - install entry uprobe

  - when the uprobe is hit, it overwrites probed function's return address
on stack with address of the trampoline that contains breakpoint
instruction

  - the breakpoint trap code handles the uretprobe consumers execution and
jumps back to original return address

This patch replaces the above trampoline's breakpoint instruction with the
new uretprobe syscall. This syscall does exactly the same job as the trap,
with some extra work:

  - syscall trampoline must save original value for rax/r11/rcx registers
on stack - rax is set to syscall number and r11/rcx are changed and
used by syscall instruction

  - the syscall code reads the original values of those registers and
    restores those values in the task's pt_regs area

  - only calls from the trampoline exposed in '[uprobes]' are allowed,
    the process will receive a SIGILL signal otherwise

Even with some extra work, using the uretprobes syscall shows speed
improvement (compared to using standard breakpoint):

  On Intel (11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz)

  current:
uretprobe-nop  :1.498 ± 0.000M/s
uretprobe-push :1.448 ± 0.001M/s
uretprobe-ret  :0.816 ± 0.001M/s

  with the fix:
uretprobe-nop  :1.969 ± 0.002M/s  < 31% speed up
uretprobe-push :1.910 ± 0.000M/s  < 31% speed up
uretprobe-ret  :0.934 ± 0.000M/s  < 14% speed up

  On Amd (AMD Ryzen 7 5700U)

  current:
uretprobe-nop  :0.778 ± 0.001M/s
uretprobe-push :0.744 ± 0.001M/s
uretprobe-ret  :0.540 ± 0.001M/s

  with the fix:
uretprobe-nop  :0.860 ± 0.001M/s  < 10% speed up
uretprobe-push :0.818 ± 0.001M/s  < 10% speed up
uretprobe-ret  :0.578 ± 0.000M/s  <  7% speed up

The performance test spawns a thread that runs a loop which triggers
the uprobe with an attached bpf program that increments the counter
that gets printed in the results above.

The uprobe (and uretprobe) kind is determined by which instruction
is being patched with the breakpoint instruction. That's also important
for uretprobes, because a uprobe is installed for each uretprobe.

The performance test is part of bpf selftests:
  tools/testing/selftests/bpf/run_bench_uprobes.sh

Note that at the moment the uretprobe syscall is supported only for native
64-bit processes; a compat process still uses the standard breakpoint.

Note that when shadow stack is enabled the uretprobe syscall returns
via iret, which is slower than return via sysret, but won't cause the
shadow stack violation.
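For reference, the trampoline sequence described above can be sketched as follows; this mirrors the fake trampoline used by the selftest in patch 7/9, while the real one lives in the kernel-provided '[uprobes]' mapping:

/* Sketch of the syscall-based trampoline sequence: save the registers
 * that the syscall instruction clobbers (rax/rcx/r11), enter the kernel,
 * then restore them. Mirrors uretprobe_syscall_call_1() from patch 7/9.
 */
#ifndef __naked
#define __naked __attribute__((__naked__))
#endif

__naked void uretprobe_trampoline_sketch(void)
{
	asm volatile (
		"pushq %rax\n"		/* rax carries the syscall number */
		"pushq %rcx\n"		/* syscall clobbers rcx ...       */
		"pushq %r11\n"		/* ... and r11                    */
		"movq $463, %rax\n"	/* __NR_uretprobe on x86_64       */
		"syscall\n"
		"popq %r11\n"
		"popq %rcx\n"
		"retq\n"		/* back to the original return address */
	);
}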

Suggested-by: Andrii Nakryiko 
Reviewed-by: Oleg Nesterov 
Reviewed-by: Masami Hiramatsu (Google) 
Acked-by: Andrii Nakryiko 
Signed-off-by: Oleg Nesterov 
Signed-off-by: Jiri Olsa 
---
 arch/x86/include/asm/shstk.h |   2 +
 arch/x86/kernel/shstk.c  |   5 ++
 arch/x86/kernel/uprobes.c| 117 +++
 include/linux/uprobes.h  |   3 +
 kernel/events/uprobes.c  |  24 ---
 5 files changed, 144 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h
index 896909f306e3..4cb77e004615 100644
--- a/arch/x86/include/asm/shstk.h
+++ b/arch/x86/include/asm/shstk.h
@@ -22,6 +22,7 @@ void shstk_free(struct task_struct *p);
 int setup_signal_shadow_stack(struct ksignal *ksig);
 int restore_signal_shadow_stack(void);
 int shstk_update_last_frame(unsigned long val);
+bool shstk_is_enabled(void);
 #else
 static inline long shstk_prctl(struct task_struct *task, int option,
   unsigned long arg2) { return -EINVAL; }
@@ -33,6 +34,7 @@ static inline void shstk_free(struct task_struct *p) {}
 static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; }
 static inline int restore_signal_shadow_stack(void) { return 0; }
 static inline int shstk_update_last_frame(unsigned long val) { return 0; }
+static inline bool shstk_is_enabled(void) { return false; }
 #endif /* CONFIG_X86_USER_SHADOW_STACK */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 9797d4cdb78a..059685612362 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -588,3 +588,8 @@ int shstk_update_last_frame(unsigned long val)
ssp = get_user_shstk_addr();
return write_user_shstk_64((u64 __user *)ssp, (u64)val);
 }
+
+bool shstk_is_enabled(void)
+{
+   return features_enabled(ARCH_SHSTK_SHSTK);
+}
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 6402fb3089d2..5a952c5ea66b 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -308,6 +309,122 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, 
struct insn *insn, bool
 }
 
 #ifdef CONFIG_X86_64
+
+asm (
+   ".pushsection .rodata\n"
+   ".global uretprobe_trampoline_entry\n"
+ 

[PATCHv8 bpf-next 2/9] uprobe: Wire up uretprobe system call

2024-06-11 Thread Jiri Olsa
Wiring up the uretprobe system call, which comes in the following changes.
We need to do the wiring first, because the uretprobe implementation
needs the syscall number.

Note that at the moment the uretprobe syscall is supported only for
native 64-bit processes.

Reviewed-by: Oleg Nesterov 
Reviewed-by: Masami Hiramatsu (Google) 
Acked-by: Andrii Nakryiko 
Signed-off-by: Jiri Olsa 
---
 arch/x86/entry/syscalls/syscall_64.tbl | 1 +
 include/linux/syscalls.h   | 2 ++
 include/uapi/asm-generic/unistd.h  | 5 -
 kernel/sys_ni.c| 2 ++
 4 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
b/arch/x86/entry/syscalls/syscall_64.tbl
index a396f6e6ab5b..6452c2ec469a 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -384,6 +384,7 @@
 460	common	lsm_set_self_attr	sys_lsm_set_self_attr
 461	common	lsm_list_modules	sys_lsm_list_modules
 462	common	mseal	sys_mseal
+463	64	uretprobe	sys_uretprobe
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 9104952d323d..494f5e0f61f7 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -973,6 +973,8 @@ asmlinkage long sys_lsm_list_modules(u64 *ids, u32 *size, 
u32 flags);
 /* x86 */
 asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
 
+asmlinkage long sys_uretprobe(void);
+
 /* pciconfig: alpha, arm, arm64, ia64, sparc */
 asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
unsigned long off, unsigned long len,
diff --git a/include/uapi/asm-generic/unistd.h 
b/include/uapi/asm-generic/unistd.h
index d983c48a3b6a..2378f88d5ad4 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -845,8 +845,11 @@ __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules)
 #define __NR_mseal 462
 __SYSCALL(__NR_mseal, sys_mseal)
 
+#define __NR_uretprobe 463
+__SYSCALL(__NR_uretprobe, sys_uretprobe)
+
 #undef __NR_syscalls
-#define __NR_syscalls 463
+#define __NR_syscalls 464
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index d7eee421d4bc..5ce9fa0dc195 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -392,3 +392,5 @@ COND_SYSCALL(setuid16);
 
 /* restartable sequence */
 COND_SYSCALL(rseq);
+
+COND_SYSCALL(uretprobe);
-- 
2.45.1




[PATCHv8 bpf-next 1/9] x86/shstk: Make return uprobe work with shadow stack

2024-06-11 Thread Jiri Olsa
Currently an application with shadow stack enabled will crash
if it sets up a return uprobe. The reason is that the uretprobe kernel
code changes the user space task's stack, but does not update the
shadow stack accordingly.

Adding new functions to update values on the shadow stack and using
them in the uprobe code to keep the shadow stack in sync with uretprobe
changes to the user stack.

Acked-by: Andrii Nakryiko 
Acked-by: Rick Edgecombe 
Reviewed-by: Oleg Nesterov 
Fixes: 488af8ea7131 ("x86/shstk: Wire in shadow stack interface")
Signed-off-by: Jiri Olsa 
---
 arch/x86/include/asm/shstk.h |  2 ++
 arch/x86/kernel/shstk.c  | 11 +++
 arch/x86/kernel/uprobes.c|  7 ++-
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/shstk.h b/arch/x86/include/asm/shstk.h
index 42fee8959df7..896909f306e3 100644
--- a/arch/x86/include/asm/shstk.h
+++ b/arch/x86/include/asm/shstk.h
@@ -21,6 +21,7 @@ unsigned long shstk_alloc_thread_stack(struct task_struct *p, 
unsigned long clon
 void shstk_free(struct task_struct *p);
 int setup_signal_shadow_stack(struct ksignal *ksig);
 int restore_signal_shadow_stack(void);
+int shstk_update_last_frame(unsigned long val);
 #else
 static inline long shstk_prctl(struct task_struct *task, int option,
   unsigned long arg2) { return -EINVAL; }
@@ -31,6 +32,7 @@ static inline unsigned long shstk_alloc_thread_stack(struct 
task_struct *p,
 static inline void shstk_free(struct task_struct *p) {}
 static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; }
 static inline int restore_signal_shadow_stack(void) { return 0; }
+static inline int shstk_update_last_frame(unsigned long val) { return 0; }
 #endif /* CONFIG_X86_USER_SHADOW_STACK */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 6f1e9883f074..9797d4cdb78a 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -577,3 +577,14 @@ long shstk_prctl(struct task_struct *task, int option, 
unsigned long arg2)
return wrss_control(true);
return -EINVAL;
 }
+
+int shstk_update_last_frame(unsigned long val)
+{
+   unsigned long ssp;
+
+   if (!features_enabled(ARCH_SHSTK_SHSTK))
+   return 0;
+
+   ssp = get_user_shstk_addr();
+   return write_user_shstk_64((u64 __user *)ssp, (u64)val);
+}
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 6c07f6daaa22..6402fb3089d2 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -1076,8 +1076,13 @@ arch_uretprobe_hijack_return_addr(unsigned long 
trampoline_vaddr, struct pt_regs
return orig_ret_vaddr;
 
	nleft = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
-   if (likely(!nleft))
+   if (likely(!nleft)) {
+   if (shstk_update_last_frame(trampoline_vaddr)) {
+   force_sig(SIGSEGV);
+   return -1;
+   }
return orig_ret_vaddr;
+   }
 
if (nleft != rasize) {
pr_err("return address clobbered: pid=%d, %%sp=%#lx, 
%%ip=%#lx\n",
-- 
2.45.1




[PATCHv8 bpf-next 0/9] uprobe: uretprobe speed up

2024-06-11 Thread Jiri Olsa
hi,
as part of the effort on speeding up uprobes [0], this series comes with
a return uprobe optimization that uses a syscall instead of the trap
on the uretprobe trampoline.

The speed up depends on the instruction type the uprobe is installed on
and on the specific HW type; please check patch 1 for details.

Patches 1-8 are based on bpf-next/master, but patch 2 and 3 are
apply-able on linux-trace.git tree probes/for-next branch.
Patch 9 is based on man-pages master.

v8 changes:
- rebased (another new syscall got merged)
- added acks

Also available at:
  https://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git
  uretprobe_syscall

thanks,
jirka


Notes to check list items in Documentation/process/adding-syscalls.rst:

- System Call Alternatives
  New syscall seems like the best way in here, because we need
  just to quickly enter kernel with no extra arguments processing,
  which we'd need to do if we decided to use another syscall.

- Designing the API: Planning for Extension
  The uretprobe syscall is very specific and most likely won't be
  extended in the future.

  At the moment it does not take any arguments and even if it does
  in future, it's allowed to be called only from trampoline prepared
  by kernel, so there'll be no broken user.

- Designing the API: Other Considerations
  N/A because uretprobe syscall does not return reference to kernel
  object.

- Proposing the API
  Wiring up of the uretprobe system call is in separate change,
  selftests and man page changes are part of the patchset.

- Generic System Call Implementation
  There's no CONFIG option for the new functionality because it
  keeps the same behaviour from the user POV.

- x86 System Call Implementation
  It's 64-bit syscall only.

- Compatibility System Calls (Generic)
  N/A uretprobe syscall has no arguments and is not supported
  for compat processes.

- Compatibility System Calls (x86)
  N/A uretprobe syscall is not supported for compat processes.

- System Calls Returning Elsewhere
  N/A.

- Other Details
  N/A.

- Testing
  Adding new bpf selftests and ran ltp on top of this change.

- Man Page
  Attached.

- Do not call System Calls in the Kernel
  N/A.


[0] https://lore.kernel.org/bpf/ZeCXHKJ--iYYbmLj@krava/
---
Jiri Olsa (8):
  x86/shstk: Make return uprobe work with shadow stack
  uprobe: Wire up uretprobe system call
  uprobe: Add uretprobe syscall to speed up return probe
  selftests/x86: Add return uprobe shadow stack test
  selftests/bpf: Add uretprobe syscall test for regs integrity
  selftests/bpf: Add uretprobe syscall test for regs changes
  selftests/bpf: Add uretprobe syscall call from user space test
  selftests/bpf: Add uretprobe shadow stack test

 arch/x86/entry/syscalls/syscall_64.tbl  |   1 +
 arch/x86/include/asm/shstk.h|   4 +
 arch/x86/kernel/shstk.c |  16 
 arch/x86/kernel/uprobes.c   | 124 
-
 include/linux/syscalls.h|   2 +
 include/linux/uprobes.h |   3 +
 include/uapi/asm-generic/unistd.h   |   5 +-
 kernel/events/uprobes.c |  24 --
 kernel/sys_ni.c |   2 +
 tools/include/linux/compiler.h  |   4 +
 tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c   | 123 
-
 tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c | 385 
+++
 tools/testing/selftests/bpf/progs/uprobe_syscall.c  |  15 
 tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c |  17 
 tools/testing/selftests/x86/test_shadow_stack.c | 145 
++
 15 files changed, 860 insertions(+), 10 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/uprobe_syscall.c
 create mode 100644 tools/testing/selftests/bpf/progs/uprobe_syscall.c
 create mode 100644 tools/testing/selftests/bpf/progs/uprobe_syscall_executed.c

Jiri Olsa (1):
  man2: Add uretprobe syscall page

 man/man2/uretprobe.2 | 56 

 1 file changed, 56 insertions(+)
 create mode 100644 man/man2/uretprobe.2



Re: [RFC PATCH] ftrace: Skip __fentry__ location of overridden weak functions

2024-06-11 Thread Zheng Yejian

On 2024/6/11 17:21, Peter Zijlstra wrote:

On Tue, Jun 11, 2024 at 09:56:51AM +0800, Zheng Yejian wrote:

On 2024/6/7 23:02, Peter Zijlstra wrote:



Oh gawd, sodding weak functions again.

I would suggest changing scripts/kallsyms.c to emit readily identifiable
symbol names for all the weak junk, eg:

__weak_junk_N



Sorry for the late reply, I just had a long noon holiday :>

scripts/kallsyms.c is compiled and used to handle symbols in vmlinux.o
or vmlinux.a, see kallsyms_step() in scripts/link-vmlinux.sh; those
overridden weak symbols have been removed from the symbol table of
vmlinux.o or vmlinux.a. But we can find those symbols in the original
xx/xx.o files, for example, the weak free_initmem() in init/main.c is
overridden, so its symbol is not in vmlinux but is still in init/main.o.

How about traversing all the original xx/xx.o files and finding all weak junk symbols?


You don't need to. ELF symbol tables have an entry size for FUNC type
objects; this means that you can readily find holes in the text and fill
them with a symbol.

Specifically, you can check the mcount locations against the symbol
table and for every one that falls in a hole, generate a new junk
symbol.

Also see 4adb23686795 where objtool adds these holes to the
ignore/unreachable code check.


The lack of size for kallsyms is in a large part what is causing the
problems.


Thanks for your suggestions, I'll try it soon.

--

Thanks,
ZYJ



Re: [RFC PATCH] ftrace: Skip __fentry__ location of overridden weak functions

2024-06-11 Thread Peter Zijlstra
On Tue, Jun 11, 2024 at 09:56:51AM +0800, Zheng Yejian wrote:
> On 2024/6/7 23:02, Peter Zijlstra wrote:

> > Oh gawd, sodding weak functions again.
> > 
> > I would suggest changing scripts/kallsyms.c to emit readily identifiable
> > symbol names for all the weak junk, eg:
> > 
> >__weak_junk_N
> > 
> 
> Sorry for the late reply, I just had a long noon holiday :>
> 
> scripts/kallsyms.c is compiled and used to handle symbols in vmlinux.o
> or vmlinux.a, see kallsyms_step() in scripts/link-vmlinux.sh; those
> overridden weak symbols have been removed from the symbol table of
> vmlinux.o or vmlinux.a. But we can find those symbols in the original
> xx/xx.o files, for example, the weak free_initmem() in init/main.c is
> overridden, so its symbol is not in vmlinux but is still in init/main.o.
> 
> How about traversing all the original xx/xx.o files and finding all weak junk symbols?

You don't need to. ELF symbol tables have an entry size for FUNC type
objects; this means that you can readily find holes in the text and fill
them with a symbol.

Specifically, you can check the mcount locations against the symbol
table and for every one that falls in a hole, generate a new junk
symbol.

Also see 4adb23686795 where objtool adds these holes to the
ignore/unreachable code check.


The lack of size for kallsyms is in a large part what is causing the
problems.
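A minimal sketch of the check described above, assuming the FUNC symbols (address, size) from the ELF symtab and the mcount locations have already been extracted and sorted in ascending address order; the ELF-reading plumbing is omitted:

/* Sketch: report mcount locations that fall into holes not covered by
 * any FUNC symbol's [addr, addr + size) range, emitting a junk symbol
 * name for each. Both input arrays are assumed sorted by address.
 */
#include <stdint.h>
#include <stdio.h>

struct func_sym {
	uint64_t addr;
	uint64_t size;
};

static int find_hole_mcounts(const struct func_sym *syms, size_t nsyms,
			     const uint64_t *mcount, size_t nloc)
{
	size_t i, j = 0;
	int junk = 0;

	for (i = 0; i < nloc; i++) {
		/* skip symbols that end at or before this location */
		while (j < nsyms && syms[j].addr + syms[j].size <= mcount[i])
			j++;
		/* not inside any FUNC symbol: the location is in a hole */
		if (j == nsyms || mcount[i] < syms[j].addr)
			printf("__weak_junk_%d at 0x%llx\n", junk++,
			       (unsigned long long)mcount[i]);
	}
	return junk;
}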



Re: [PATCH 05/14] tracefs: replace call_rcu by kfree_rcu for simple kmem_cache_free callback

2024-06-11 Thread Thorsten Leemhuis
On 11.06.24 10:42, Vlastimil Babka wrote:
> On 6/11/24 8:23 AM, Greg KH wrote:
>> On Mon, Jun 10, 2024 at 11:40:54PM +0200, Vlastimil Babka wrote:
>>> On 6/10/24 10:36 PM, Steven Rostedt wrote:
 On Mon, 10 Jun 2024 08:46:42 -0700
 "Paul E. McKenney"  wrote:

>>> index 7c29f4afc23d..338c52168e61 100644
>>> --- a/fs/tracefs/inode.c
>>> +++ b/fs/tracefs/inode.c
>>> @@ -53,14 +53,6 @@ static struct inode *tracefs_alloc_inode(struct super_block *sb)
>>> 	return &ti->vfs_inode;
>>>  }
>>>  
>>> -static void tracefs_free_inode_rcu(struct rcu_head *rcu)
>>> -{
>>> -   struct tracefs_inode *ti;
>>> -
>>> -   ti = container_of(rcu, struct tracefs_inode, rcu);
>>> -   kmem_cache_free(tracefs_inode_cachep, ti);  
>>
>> Does this work?
>>
>> tracefs needs to be freed via the tracefs_inode_cachep. Does
>> kfree_rcu() handle specific frees for objects that were not allocated
>> via kmalloc()?  
>
> A recent change to kfree() allows it to correctly handle memory allocated
> via kmem_cache_alloc().  News to me as of a few weeks ago.  ;-)

 If that's the case then:

 Acked-by: Steven Rostedt (Google) 

 Do we have a way to add a "Depends-on" tag so that anyone backporting this
 will know that it requires the change to whatever allowed that to happen?
>>>
>>> Looks like people use that tag, although no grep hits in Documentation, so
>>> Cc'ing workflows@ and Thorsten.
>>>
>>> In this case it would be
>>>
>>> Depends-on: c9929f0e344a ("mm/slob: remove CONFIG_SLOB")
>>
>> Ick, no, use the documented way of handling this as described in the
>> stable kernel rules file.
> 
> AFAICS that documented way is for a different situation? I assume you mean
> this part:
> 
> * Specify any additional patch prerequisites for cherry picking::
> 
> Cc:  # 3.3.x: a1f84a3: sched: Check for idle
> 
> But that would assume we actively want to backport this cleanup patch in the
> first place. But as I understand Steven's intention, we want just to make
> sure that if in the future this patch is backported (i.e. as a dependency of
> something else) it won't be forgotten to also backport c9929f0e344a
> ("mm/slob: remove CONFIG_SLOB"). How to express that without actively
> marking this patch for backport at the same time?

Hah, waiting a bit spared me the time to write a similar reply. :-D
Writing one now anyway to broaden the scope:

I recently noticed we have the same problem when it comes to the
"delayed backporting" aspect, e.g. this part:

"""
* Delay pick up of patches::

Cc:  # after -rc3
"""

I'll bring this up in a maintainers summit proposal I'm currently
preparing. But I have no idea how to solve this in an elegant way.
"Cc:  # after -rc3" could work,
but well, as indicated, that's kinda ugly.

Ciao, Thorsten
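For context, the conversion under discussion follows the pattern sketched below (kernel-side fragments, per the quoted hunk); it relies on kfree_rcu()/kfree() being able to free kmem_cache_alloc()'d objects, which is what the SLOB removal made universal:

/* Before: an open-coded RCU callback just to reach kmem_cache_free() */
static void tracefs_free_inode_rcu(struct rcu_head *rcu)
{
	struct tracefs_inode *ti;

	ti = container_of(rcu, struct tracefs_inode, rcu);
	kmem_cache_free(tracefs_inode_cachep, ti);
}
	/* ... */
	call_rcu(&ti->rcu, tracefs_free_inode_rcu);

/* After: kfree_rcu() resolves the originating cache from the object */
	kfree_rcu(ti, rcu);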



Re: [PATCH v6 3/3] sched/rt: Rename realtime_{prio, task}() to rt_or_dl_{prio, task}()

2024-06-11 Thread Daniel Bristot de Oliveira
On 6/10/24 21:20, Qais Yousef wrote:
> - if (realtime_prio(p->prio)) /* includes deadline */
> + if (rt_or_dl_prio(p->prio))

that is it... no thinking, no recall, no comment, no confusion...

-- Daniel.




Re: [PATCH 05/14] tracefs: replace call_rcu by kfree_rcu for simple kmem_cache_free callback

2024-06-11 Thread Vlastimil Babka
On 6/11/24 8:23 AM, Greg KH wrote:
> On Mon, Jun 10, 2024 at 11:40:54PM +0200, Vlastimil Babka wrote:
>> On 6/10/24 10:36 PM, Steven Rostedt wrote:
>> > On Mon, 10 Jun 2024 08:46:42 -0700
>> > "Paul E. McKenney"  wrote:
>> > 
>> >> > > index 7c29f4afc23d..338c52168e61 100644
>> >> > > --- a/fs/tracefs/inode.c
>> >> > > +++ b/fs/tracefs/inode.c
>> >> > > @@ -53,14 +53,6 @@ static struct inode *tracefs_alloc_inode(struct super_block *sb)
>> >> > >   return &ti->vfs_inode;
>> >> > >  }
>> >> > >  
>> >> > > -static void tracefs_free_inode_rcu(struct rcu_head *rcu)
>> >> > > -{
>> >> > > - struct tracefs_inode *ti;
>> >> > > -
>> >> > > - ti = container_of(rcu, struct tracefs_inode, rcu);
>> >> > > - kmem_cache_free(tracefs_inode_cachep, ti);  
>> >> > 
>> >> > Does this work?
>> >> > 
>> >> > tracefs needs to be freed via the tracefs_inode_cachep. Does
>> >> > kfree_rcu() handle specific frees for objects that were not allocated
>> >> > via kmalloc()?  
>> >> 
>> >> A recent change to kfree() allows it to correctly handle memory allocated
>> >> via kmem_cache_alloc().  News to me as of a few weeks ago.  ;-)
>> > 
>> > If that's the case then:
>> > 
>> > Acked-by: Steven Rostedt (Google) 
>> > 
>> > Do we have a way to add a "Depends-on" tag so that anyone backporting this
>> > will know that it requires the change to whatever allowed that to happen?
>> 
>> Looks like people use that tag, although no grep hits in Documentation, so
>> Cc'ing workflows@ and Thorsten.
>> 
>> In this case it would be
>> 
>> Depends-on: c9929f0e344a ("mm/slob: remove CONFIG_SLOB")
> 
> Ick, no, use the documented way of handling this as described in the
> stable kernel rules file.

AFAICS that documented way is for a different situation? I assume you mean
this part:

* Specify any additional patch prerequisites for cherry picking::

Cc:  # 3.3.x: a1f84a3: sched: Check for idle

But that would assume we actively want to backport this cleanup patch in the
first place. But as I understand Steven's intention, we want just to make
sure that if in the future this patch is backported (i.e. as a dependency of
something else) it won't be forgotten to also backport c9929f0e344a
("mm/slob: remove CONFIG_SLOB"). How to express that without actively
marking this patch for backport at the same time?

> thanks,
> 
> greg k-h




Re: [PATCHv7 bpf-next 0/9] uprobe: uretprobe speed up

2024-06-11 Thread Andrii Nakryiko
On Mon, Jun 10, 2024 at 10:46 PM Masami Hiramatsu  wrote:
>
> On Wed, 5 Jun 2024 09:42:45 -0700
> Andrii Nakryiko  wrote:
>
> > On Fri, May 31, 2024 at 10:52 AM Andrii Nakryiko
> >  wrote:
> > >
> > > On Thu, May 23, 2024 at 5:11 AM Jiri Olsa  wrote:
> > > >
> > > > hi,
> > > > as part of the effort on speeding up the uprobes [0] coming with
> > > > return uprobe optimization by using syscall instead of the trap
> > > > on the uretprobe trampoline.
> > > >
> > > > The speed up depends on instruction type that uprobe is installed
> > > > and depends on specific HW type, please check patch 1 for details.
> > > >
> > > > Patches 1-8 are based on bpf-next/master, but patch 2 and 3 are
> > > > apply-able on linux-trace.git tree probes/for-next branch.
> > > > Patch 9 is based on man-pages master.
> > > >
> > > > v7 changes:
> > > > - fixes in man page [Alejandro Colomar]
> > > > - fixed patch #1 fixes tag [Oleg]
> > > >
> > > > Also available at:
> > > >   https://git.kernel.org/pub/scm/linux/kernel/git/jolsa/perf.git
> > > >   uretprobe_syscall
> > > >
> > > > thanks,
> > > > jirka
> > > >
> > > >
> > > > Notes to check list items in Documentation/process/adding-syscalls.rst:
> > > >
> > > > - System Call Alternatives
> > > >   New syscall seems like the best way in here, because we need
> > > >   just to quickly enter kernel with no extra arguments processing,
> > > >   which we'd need to do if we decided to use another syscall.
> > > >
> > > > - Designing the API: Planning for Extension
> > > >   The uretprobe syscall is very specific and most likely won't be
> > > >   extended in the future.
> > > >
> > > >   At the moment it does not take any arguments and even if it does
> > > >   in future, it's allowed to be called only from trampoline prepared
> > > >   by kernel, so there'll be no broken user.
> > > >
> > > > - Designing the API: Other Considerations
> > > >   N/A because uretprobe syscall does not return reference to kernel
> > > >   object.
> > > >
> > > > - Proposing the API
> > > >   Wiring up of the uretprobe system call is in separate change,
> > > >   selftests and man page changes are part of the patchset.
> > > >
> > > > - Generic System Call Implementation
> > > >   There's no CONFIG option for the new functionality because it
> > > >   keeps the same behaviour from the user POV.
> > > >
> > > > - x86 System Call Implementation
> > > >   It's 64-bit syscall only.
> > > >
> > > > - Compatibility System Calls (Generic)
> > > >   N/A uretprobe syscall has no arguments and is not supported
> > > >   for compat processes.
> > > >
> > > > - Compatibility System Calls (x86)
> > > >   N/A uretprobe syscall is not supported for compat processes.
> > > >
> > > > - System Calls Returning Elsewhere
> > > >   N/A.
> > > >
> > > > - Other Details
> > > >   N/A.
> > > >
> > > > - Testing
> > > >   Adding new bpf selftests and ran ltp on top of this change.
> > > >
> > > > - Man Page
> > > >   Attached.
> > > >
> > > > - Do not call System Calls in the Kernel
> > > >   N/A.
> > > >
> > > >
> > > > [0] https://lore.kernel.org/bpf/ZeCXHKJ--iYYbmLj@krava/
> > > > ---
> > > > Jiri Olsa (8):
> > > >   x86/shstk: Make return uprobe work with shadow stack
> > > >   uprobe: Wire up uretprobe system call
> > > >   uprobe: Add uretprobe syscall to speed up return probe
> > > >   selftests/x86: Add return uprobe shadow stack test
> > > >   selftests/bpf: Add uretprobe syscall test for regs integrity
> > > >   selftests/bpf: Add uretprobe syscall test for regs changes
> > > >   selftests/bpf: Add uretprobe syscall call from user space test
> > > >   selftests/bpf: Add uretprobe shadow stack test
> > > >
> > >
> > > Masami, Steven,
> > >
> > > It seems like the series is ready to go in. Are you planning to take
> > > the first 4 patches through your linux-trace tree?
> >
> > Another ping. It's been two weeks since Jiri posted the last revision
> > that got no more feedback to be addressed and everyone seems to be
> > happy with it.
>
> Sorry about late reply. I agree that this is OK to go, since no other
> comments. Let me pick this up to probes/for-next branch.
>
> >
> > This is an important speed up improvement for uprobe infrastructure in
> > general and for BPF ecosystem in particular. "Uprobes are slow" is one
> > of the top complaints from production BPF users, and sys_uretprobe
> > approach is significantly improving the situation for return uprobes
> > (aka uretprobes), potentially enabling new use cases that previously
> > could have been too expensive to trace in practice and reducing the
> > overhead of the existing ones.
> >
> > I'd appreciate the engagement from linux-trace maintainers on this
> > patch set. Given it's important for BPF and that a big part of the
> > patch set is BPF-based selftests, we'd also be happy to route all this
> > through the bpf-next tree (which would actually make logistics for us
> > much easier, but that's not the main concern). But regardless 

Re: [PATCHv7 bpf-next 2/9] uprobe: Wire up uretprobe system call

2024-06-11 Thread Jiri Olsa
On Tue, Jun 11, 2024 at 07:05:21AM +0900, Masami Hiramatsu wrote:
> On Thu, 23 May 2024 14:11:42 +0200
> Jiri Olsa  wrote:
> 
> > Wiring up uretprobe system call, which comes in following changes.
> > We need to do the wiring before, because the uretprobe implementation
> > needs the syscall number.
> > 
> > Note at the moment uretprobe syscall is supported only for native
> > 64-bit process.
> > 
> 
> BTW, this does not cleanly applied to probes/for-next, based on
> 6.10-rc1. Which version did you use?

ah new syscall just got merged, I'll rebase and send new version

jirka

> 
> Thank you,
> 
> > Reviewed-by: Oleg Nesterov 
> > Reviewed-by: Masami Hiramatsu (Google) 
> > Acked-by: Andrii Nakryiko 
> > Signed-off-by: Jiri Olsa 
> > ---
> >  arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> >  include/linux/syscalls.h   | 2 ++
> >  include/uapi/asm-generic/unistd.h  | 5 -
> >  kernel/sys_ni.c| 2 ++
> >  4 files changed, 9 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
> > b/arch/x86/entry/syscalls/syscall_64.tbl
> > index cc78226ffc35..47dfea0a827c 100644
> > --- a/arch/x86/entry/syscalls/syscall_64.tbl
> > +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> > @@ -383,6 +383,7 @@
> >  459	common	lsm_get_self_attr	sys_lsm_get_self_attr
> >  460	common	lsm_set_self_attr	sys_lsm_set_self_attr
> >  461	common	lsm_list_modules	sys_lsm_list_modules
> > +462	64	uretprobe	sys_uretprobe
> >  
> >  #
> >  # Due to a historical design error, certain syscalls are numbered 
> > differently
> > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> > index e619ac10cd23..5318e0e76799 100644
> > --- a/include/linux/syscalls.h
> > +++ b/include/linux/syscalls.h
> > @@ -972,6 +972,8 @@ asmlinkage long sys_lsm_list_modules(u64 *ids, u32 
> > *size, u32 flags);
> >  /* x86 */
> >  asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
> >  
> > +asmlinkage long sys_uretprobe(void);
> > +
> >  /* pciconfig: alpha, arm, arm64, ia64, sparc */
> >  asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
> > unsigned long off, unsigned long len,
> > diff --git a/include/uapi/asm-generic/unistd.h 
> > b/include/uapi/asm-generic/unistd.h
> > index 75f00965ab15..8a747cd1d735 100644
> > --- a/include/uapi/asm-generic/unistd.h
> > +++ b/include/uapi/asm-generic/unistd.h
> > @@ -842,8 +842,11 @@ __SYSCALL(__NR_lsm_set_self_attr, 
> > sys_lsm_set_self_attr)
> >  #define __NR_lsm_list_modules 461
> >  __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules)
> >  
> > +#define __NR_uretprobe 462
> > +__SYSCALL(__NR_uretprobe, sys_uretprobe)
> > +
> >  #undef __NR_syscalls
> > -#define __NR_syscalls 462
> > +#define __NR_syscalls 463
> >  
> >  /*
> >   * 32 bit systems traditionally used different
> > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> > index faad00cce269..be6195e0d078 100644
> > --- a/kernel/sys_ni.c
> > +++ b/kernel/sys_ni.c
> > @@ -391,3 +391,5 @@ COND_SYSCALL(setuid16);
> >  
> >  /* restartable sequence */
> >  COND_SYSCALL(rseq);
> > +
> > +COND_SYSCALL(uretprobe);
> > -- 
> > 2.45.1
> > 
> 
> 
> -- 
> Masami Hiramatsu (Google) 



[PATCH v7 2/5] remoteproc: Add TEE support

2024-06-11 Thread Arnaud Pouliquen
Add a remoteproc TEE (Trusted Execution Environment) driver
that will be probed by the TEE bus. If the associated Trusted
application is supported on the secure side, this driver offers a client
interface to load a firmware image in the secure part.
This firmware can be authenticated by the secure trusted application.

Signed-off-by: Arnaud Pouliquen 
---
update from V6
- Fix missing "{" in tee_rproc_find_loaded_rsc_table inline definition.

update from V5
- make tee_rproc_get_loaded_rsc_table() local and replace this API by
  tee_rproc_find_loaded_rsc_table()
- map and unmap the resource table in tee_rproc_parse_fw to make a cached copy
- use the new rproc_pa_to_va() API to map the resource table memory declared in carveout
- remove tee_rproc_release_loaded_rsc_table as it is no longer used.
---
 drivers/remoteproc/Kconfig  |  10 +
 drivers/remoteproc/Makefile |   1 +
 drivers/remoteproc/tee_remoteproc.c | 451 
 include/linux/remoteproc.h  |   4 +
 include/linux/tee_remoteproc.h  | 100 ++
 5 files changed, 566 insertions(+)
 create mode 100644 drivers/remoteproc/tee_remoteproc.c
 create mode 100644 include/linux/tee_remoteproc.h

diff --git a/drivers/remoteproc/Kconfig b/drivers/remoteproc/Kconfig
index 48845dc8fa85..6c1c07202276 100644
--- a/drivers/remoteproc/Kconfig
+++ b/drivers/remoteproc/Kconfig
@@ -365,6 +365,16 @@ config XLNX_R5_REMOTEPROC
 
  It's safe to say N if not interested in using RPU r5f cores.
 
+
+config TEE_REMOTEPROC
+   tristate "Remoteproc support by a TEE application"
+   depends on OPTEE
+   help
+ Support a remote processor with a TEE application. The Trusted
+ Execution Context is responsible for loading the trusted firmware
+ image and managing the remote processor's lifecycle.
+ This can be either built-in or a loadable module.
+
 endif # REMOTEPROC
 
 endmenu
diff --git a/drivers/remoteproc/Makefile b/drivers/remoteproc/Makefile
index 91314a9b43ce..fa8daebce277 100644
--- a/drivers/remoteproc/Makefile
+++ b/drivers/remoteproc/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_RCAR_REMOTEPROC) += rcar_rproc.o
 obj-$(CONFIG_ST_REMOTEPROC)+= st_remoteproc.o
 obj-$(CONFIG_ST_SLIM_REMOTEPROC)   += st_slim_rproc.o
 obj-$(CONFIG_STM32_RPROC)  += stm32_rproc.o
+obj-$(CONFIG_TEE_REMOTEPROC)   += tee_remoteproc.o
 obj-$(CONFIG_TI_K3_DSP_REMOTEPROC) += ti_k3_dsp_remoteproc.o
 obj-$(CONFIG_TI_K3_R5_REMOTEPROC)  += ti_k3_r5_remoteproc.o
 obj-$(CONFIG_XLNX_R5_REMOTEPROC)   += xlnx_r5_remoteproc.o
diff --git a/drivers/remoteproc/tee_remoteproc.c 
b/drivers/remoteproc/tee_remoteproc.c
new file mode 100644
index ..9455fd9d0d2d
--- /dev/null
+++ b/drivers/remoteproc/tee_remoteproc.c
@@ -0,0 +1,451 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) STMicroelectronics 2024 - All Rights Reserved
+ * Author: Arnaud Pouliquen 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "remoteproc_internal.h"
+
+#define MAX_TEE_PARAM_ARRY_MEMBER  4
+
+/*
+ * Authentication of the firmware and load in the remote processor memory
+ *
+ * [in]  params[0].value.a:unique 32bit identifier of the remote processor
+ * [in] params[1].memref:  buffer containing the firmware image
+ */
+#define TA_RPROC_FW_CMD_LOAD_FW1
+
+/*
+ * Start the remote processor
+ *
+ * [in]  params[0].value.a:unique 32bit identifier of the remote processor
+ */
+#define TA_RPROC_FW_CMD_START_FW   2
+
+/*
+ * Stop the remote processor
+ *
+ * [in]  params[0].value.a:unique 32bit identifier of the remote processor
+ */
+#define TA_RPROC_FW_CMD_STOP_FW3
+
+/*
+ * Return the address of the resource table, or 0 if not found
+ * No check is done to verify that the address returned is accessible by
+ * the non secure context. If the resource table is loaded in a protected
+ * memory the access by the non secure context will lead to a data abort.
+ *
+ * [in]  params[0].value.a:unique 32bit identifier of the remote processor
+ * [out]  params[1].value.a:   32bit LSB resource table memory address
+ * [out]  params[1].value.b:   32bit MSB resource table memory address
+ * [out]  params[2].value.a:   32bit LSB resource table memory size
+ * [out]  params[2].value.b:   32bit MSB resource table memory size
+ */
+#define TA_RPROC_FW_CMD_GET_RSC_TABLE  4
+
+/*
+ * Return the address of the core dump
+ *
+ * [in]  params[0].value.a:unique 32bit identifier of the remote processor
+ * [out] params[1].memref: address of the core dump image if it exists,
+ * else returns NULL
+ */
+#define TA_RPROC_FW_CMD_GET_COREDUMP   5
+
+struct tee_rproc_context {
+   struct list_head sessions;
+   struct tee_context *tee_ctx;
+   struct device *dev;
+};
+
+static struct tee_rproc_context *tee_rproc_ctx;
+
+static void 
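As a sketch of how a client invocation of the commands documented above might look, using the kernel TEE client API with the parameter layout from the comments; session setup and the arg.ret status check are assumed to be handled by the caller:

/* Sketch: start the remote processor via TA_RPROC_FW_CMD_START_FW.
 * Assumes a tee_context and session obtained at TEE bus probe time;
 * a real caller also checks arg.ret for the TA status.
 */
static int tee_rproc_start_sketch(struct tee_context *tee_ctx,
				  u32 session_id, u32 rproc_id)
{
	struct tee_param param[MAX_TEE_PARAM_ARRY_MEMBER] = { };
	struct tee_ioctl_invoke_arg arg = {
		.func = TA_RPROC_FW_CMD_START_FW,
		.session = session_id,
		.num_params = 1,
	};

	/* params[0].value.a: unique 32bit identifier of the remote processor */
	param[0].attr = TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INPUT;
	param[0].u.value.a = rproc_id;

	return tee_client_invoke_func(tee_ctx, &arg, param);
}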

[PATCH v7 1/5] remoteproc: core: Introduce rproc_pa_to_va helper

2024-06-11 Thread Arnaud Pouliquen
When a resource table is loaded by an external entity such as U-Boot or
OP-TEE, we do not necessarily get the device address (da) but the physical
address (pa).
This helper performs a similar translation to rproc_da_to_va(),
but based on a physical address.

Signed-off-by: Arnaud Pouliquen 
---
 drivers/remoteproc/remoteproc_core.c | 74 +++-
 include/linux/remoteproc.h   |  3 ++
 2 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/drivers/remoteproc/remoteproc_core.c 
b/drivers/remoteproc/remoteproc_core.c
index f276956f2c5c..3fdec0336fd6 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -230,6 +230,77 @@ void *rproc_da_to_va(struct rproc *rproc, u64 da, size_t 
len, bool *is_iomem)
 }
 EXPORT_SYMBOL(rproc_da_to_va);
 
+/**
+ * rproc_pa_to_va() - lookup the kernel virtual address for a physical address of a remoteproc memory
+ *
+ * @rproc: handle of a remote processor
+ * @pa: remoteproc physical address
+ * @len: length of the memory region @pa is pointing to
+ * @is_iomem: optional pointer filled in to indicate if @pa is iomapped memory
+ *
+ * Some remote processors will ask us to allocate them physically contiguous
+ * memory regions (which we call "carveouts"), and map them to specific
+ * device addresses (which are hardcoded in the firmware). They may also have
+ * dedicated memory regions internal to the processors, and use them either
+ * exclusively or alongside carveouts.
+ *
+ * They may then ask us to copy objects into specific addresses (e.g.
+ * code/data sections) or expose us certain symbols in other device address
+ * (e.g. their trace buffer).
+ *
+ * This function is a helper function with which we can go over the allocated
+ * carveouts and translate specific physical addresses to kernel virtual addresses
+ * so we can access the referenced memory. This function also allows to perform
+ * translations on the internal remoteproc memory regions through a platform
+ * implementation specific pa_to_va ops, if present.
+ *
+ * Note: phys_to_virt(iommu_iova_to_phys(rproc->domain, da)) will work too,
+ * but only on kernel direct mapped RAM memory. Instead, we're just using
+ * here the output of the DMA API for the carveouts, which should be more
+ * correct.
+ *
+ * Return: a valid kernel address on success or NULL on failure
+ */
+void *rproc_pa_to_va(struct rproc *rproc, phys_addr_t pa, size_t len, bool *is_iomem)
+{
+   struct rproc_mem_entry *carveout;
+   void *ptr = NULL;
+
+   if (rproc->ops->pa_to_va) {
+   ptr = rproc->ops->pa_to_va(rproc, pa, len);
+   if (ptr)
+   goto out;
+   }
+
+   list_for_each_entry(carveout, &rproc->carveouts, node) {
+   int offset = pa - carveout->dma;
+
+   /*  Verify that carveout is allocated */
+   if (!carveout->va)
+   continue;
+
+   /* try next carveout if pa is too small */
+   if (offset < 0)
+   continue;
+
+   /* try next carveout if pa is too large */
+   if (offset + len > carveout->len)
+   continue;
+
+   ptr = carveout->va + offset;
+
+   if (is_iomem)
+   *is_iomem = carveout->is_iomem;
+
+   break;
+   }
+
+out:
+   return ptr;
+}
+EXPORT_SYMBOL(rproc_pa_to_va);
+
 /**
  * rproc_find_carveout_by_name() - lookup the carveout region by a name
  * @rproc: handle of a remote processor
@@ -724,8 +795,7 @@ static int rproc_alloc_carveout(struct rproc *rproc,
 * firmware was compiled with.
 *
 * In this case, we must use the IOMMU API directly and map
-* the memory to the device address as expected by the remote
-* processor.
+* the memory to the device address as expected by the remote processor.
 *
 * Obviously such remote processor devices should not be configured
 * to use the iommu-based DMA API: we expect 'dma' to contain the
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index b4795698d8c2..28aa62a3b505 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -367,6 +367,7 @@ enum rsc_handling_status {
  * @detach:detach from a device, leaving it powered up
  * @kick:  kick a virtqueue (virtqueue id given as a parameter)
  * @da_to_va:  optional platform hook to perform address translations
+ * @pa_to_va:  optional platform hook to perform address translations
  * @parse_fw:  parse firmware to extract information (e.g. resource table)
  * @handle_rsc:optional platform hook to handle vendor resources. 
Should return
  * RSC_HANDLED if resource was handled, RSC_IGNORED if not handled
@@ -391,6 +392,7 @@ struct rproc_ops {
int (*detach)(struct rproc *rproc);
void (*kick)(struct rproc *rproc, int vqid);
void * (*da_to_va)(struct rproc *rproc, u64 
