[PATCH 18/20] ftrace: Add multiple fgraph storage selftest

2024-05-24 Thread Steven Rostedt
From: "Masami Hiramatsu (Google)" 

Add a selftest for multiple function graph tracers with storage on the same
function. In this case, the shadow stack entry will be shared among those
fgraph_ops with different data storage, so this ensures that fgraph does not
mix up their storage data.
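
(For illustration only, not part of the patch: a minimal sketch of what the
test exercises, using the fgraph_fixture wrapper introduced below. Two
fixtures with different storage sizes hook the same function, so they share
one shadow stack entry but must each read back only the data they reserved;
the function name test_storage_on_same_function() is made up for the sketch.)

	static struct fgraph_fixture fixture_byte __initdata = {
		.gops = {
			.entryfunc	= store_entry,
			.retfunc	= store_return,
		},
		.store_size		= 1,
		.store_type_name	= "byte",
	};

	static struct fgraph_fixture fixture_long_long __initdata = {
		.gops = {
			.entryfunc	= store_entry,
			.retfunc	= store_return,
		},
		.store_size		= 8,
		.store_type_name	= "long long",
	};

	static int __init test_storage_on_same_function(void)
	{
		/* Both tracers filter on the same test function. */
		init_fgraph_fixture(&fixture_byte);
		init_fgraph_fixture(&fixture_long_long);

		register_ftrace_graph(&fixture_byte.gops);
		register_ftrace_graph(&fixture_long_long.gops);

		/* ... run the traced function, unregister both, and then
		 * check each fixture's error_str for a mix-up ... */
		return 0;
	}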

Link: https://lore.kernel.org/linux-trace-kernel/171509111465.162236.3795819216426570800.stgit@devnote2

Signed-off-by: Masami Hiramatsu (Google) 
Suggested-by: Steven Rostedt (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/trace_selftest.c | 171 +-
 1 file changed, 126 insertions(+), 45 deletions(-)

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index fcdc744c245e..369efc569238 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -762,28 +762,32 @@ trace_selftest_startup_function(struct tracer *trace, 
struct trace_array *tr)
 #define SHORT_NUMBER 12345
 #define WORD_NUMBER 1234567890
 #define LONG_NUMBER 1234567890123456789LL
-
-static int fgraph_store_size __initdata;
-static const char *fgraph_store_type_name __initdata;
-static char *fgraph_error_str __initdata;
-static char fgraph_error_str_buf[128] __initdata;
+#define ERRSTR_BUFLEN 128
+
+struct fgraph_fixture {
+   struct fgraph_ops gops;
+   int store_size;
+   const char *store_type_name;
+   char error_str_buf[ERRSTR_BUFLEN];
+   char *error_str;
+};
 
 static __init int store_entry(struct ftrace_graph_ent *trace,
  struct fgraph_ops *gops)
 {
-   const char *type = fgraph_store_type_name;
-   int size = fgraph_store_size;
+   struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops);
+   const char *type = fixture->store_type_name;
+   int size = fixture->store_size;
void *p;
 
p = fgraph_reserve_data(gops->idx, size);
if (!p) {
-   snprintf(fgraph_error_str_buf, sizeof(fgraph_error_str_buf),
+   snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
 "Failed to reserve %s\n", type);
-   fgraph_error_str = fgraph_error_str_buf;
return 0;
}
 
-   switch (fgraph_store_size) {
+   switch (size) {
case 1:
*(char *)p = BYTE_NUMBER;
break;
@@ -804,7 +808,8 @@ static __init int store_entry(struct ftrace_graph_ent 
*trace,
 static __init void store_return(struct ftrace_graph_ret *trace,
struct fgraph_ops *gops)
 {
-   const char *type = fgraph_store_type_name;
+   struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops);
+   const char *type = fixture->store_type_name;
long long expect = 0;
long long found = -1;
int size;
@@ -812,20 +817,18 @@ static __init void store_return(struct ftrace_graph_ret 
*trace,
 
p = fgraph_retrieve_data(gops->idx, &size);
if (!p) {
-   snprintf(fgraph_error_str_buf, sizeof(fgraph_error_str_buf),
+   snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
 "Failed to retrieve %s\n", type);
-   fgraph_error_str = fgraph_error_str_buf;
return;
}
-   if (fgraph_store_size > size) {
-   snprintf(fgraph_error_str_buf, sizeof(fgraph_error_str_buf),
+   if (fixture->store_size > size) {
+   snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
 "Retrieved size %d is smaller than expected %d\n",
-size, (int)fgraph_store_size);
-   fgraph_error_str = fgraph_error_str_buf;
+size, (int)fixture->store_size);
return;
}
 
-   switch (fgraph_store_size) {
+   switch (fixture->store_size) {
case 1:
expect = BYTE_NUMBER;
found = *(char *)p;
@@ -845,45 +848,44 @@ static __init void store_return(struct ftrace_graph_ret 
*trace,
}
 
if (found != expect) {
-   snprintf(fgraph_error_str_buf, sizeof(fgraph_error_str_buf),
+   snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
 "%s returned not %lld but %lld\n", type, expect, 
found);
-   fgraph_error_str = fgraph_error_str_buf;
return;
}
-   fgraph_error_str = NULL;
+   fixture->error_str = NULL;
 }
 
-static struct fgraph_ops store_bytes __initdata = {
-   .entryfunc  = store_entry,
-   .retfunc= store_return,
-};
-
-static int __init test_graph_storage_type(const char *name, int size)
+static int __init init_fgraph_fixture(struct fgraph_fixture *fixture)
 {
char *func_name;
int len;
-   int ret;
 
-   fgraph_store_type_name = name;
-   fgraph_store_size = size;
+   snprintf(fixture->error_str_buf, ERRSTR_BUFLEN,
+"Failed to 

[PATCH 20/20] function_graph: Use bitmask to loop on fgraph entry

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Instead of looping through all the elements of fgraph_array[] to see if
there is a gops attached to one and then calling its gops->func(), create
a fgraph_array_bitmask that has a bit set when an index in the array is
reserved (via the simple LRU algorithm). Then only the bits set in this
bitmask need to be looked at, so only the elements in the array that have
ops registered are examined.

Note, we do not care about races. If a bit is set before the gops is
assigned, it only wastes time looking at the element and ignoring it (as
it did before this bitmask was added).
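
(Illustration only, not part of the patch: roughly how the entry path uses
the bitmask. The bit is set by fgraph_lru_alloc_index() and cleared by
fgraph_lru_release_index(); a stale bit only costs one wasted check of the
stub entry.)

	static void sketch_call_entryfuncs(struct ftrace_graph_ent *trace)
	{
		int i;

		for_each_set_bit(i, &fgraph_array_bitmask,
				 sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) {
			struct fgraph_ops *gops = fgraph_array[i];

			/* the slot may have been released after the bit was read */
			if (gops == &fgraph_stub)
				continue;

			gops->entryfunc(trace, gops);
		}
	}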

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/fgraph.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 5e8e13ffcfb6..1aae521e5997 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -173,6 +173,7 @@ DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
 int ftrace_graph_active;
 
 static struct fgraph_ops *fgraph_array[FGRAPH_ARRAY_SIZE];
+static unsigned long fgraph_array_bitmask;
 
 /* LRU index table for fgraph_array */
 static int fgraph_lru_table[FGRAPH_ARRAY_SIZE];
@@ -197,6 +198,8 @@ static int fgraph_lru_release_index(int idx)
 
fgraph_lru_table[fgraph_lru_last] = idx;
fgraph_lru_last = (fgraph_lru_last + 1) % FGRAPH_ARRAY_SIZE;
+
+   clear_bit(idx, &fgraph_array_bitmask);
return 0;
 }
 
@@ -211,6 +214,8 @@ static int fgraph_lru_alloc_index(void)
 
fgraph_lru_table[fgraph_lru_next] = -1;
fgraph_lru_next = (fgraph_lru_next + 1) % FGRAPH_ARRAY_SIZE;
+
+   set_bit(idx, &fgraph_array_bitmask);
return idx;
 }
 
@@ -632,7 +637,8 @@ int function_graph_enter(unsigned long ret, unsigned long 
func,
if (offset < 0)
goto out;
 
-   for (i = 0; i < FGRAPH_ARRAY_SIZE; i++) {
+   for_each_set_bit(i, &fgraph_array_bitmask,
+sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) {
struct fgraph_ops *gops = fgraph_array[i];
int save_curr_ret_stack;
 
-- 
2.43.0





[PATCH 19/20] function_graph: Use for_each_set_bit() in __ftrace_return_to_handler()

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (Google)" 

Instead of iterating through the entire fgraph_array[] and checking whether one
of the bitmap bits is set in order to know whether to call that entry's
retfunc() function, use for_each_set_bit() on the bitmap itself. This will
only iterate over the bits that are set.
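
(Illustration only, not part of the patch: the shape of the loop after this
change. "bitmap" is the word read from the task's shadow stack frame, with one
bit per fgraph_array index that asked for its retfunc to be called.)

	static void sketch_call_retfuncs(struct ftrace_graph_ret *trace,
					 unsigned long bitmap)
	{
		int i;

		for_each_set_bit(i, &bitmap, sizeof(bitmap) * BITS_PER_BYTE) {
			struct fgraph_ops *gops = fgraph_array[i];

			if (gops == &fgraph_stub)
				continue;

			gops->retfunc(trace, gops);
		}
	}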

Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/fgraph.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 4d503b3e45ad..5e8e13ffcfb6 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -827,11 +827,10 @@ static unsigned long __ftrace_return_to_handler(struct 
fgraph_ret_regs *ret_regs
 #endif
 
bitmap = get_bitmap_bits(current, offset);
-   for (i = 0; i < FGRAPH_ARRAY_SIZE; i++) {
+
+   for_each_set_bit(i, &bitmap, sizeof(bitmap) * BITS_PER_BYTE) {
struct fgraph_ops *gops = fgraph_array[i];
 
-   if (!(bitmap & BIT(i)))
-   continue;
if (gops == &fgraph_stub)
continue;
 
-- 
2.43.0





[PATCH 17/20] function_graph: Add selftest for passing local variables

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Add a boot-up selftest that passes variables from a function entry to a
function exit, and makes sure that they do get passed around.

Co-developed with Masami Hiramatsu:
Link: https://lore.kernel.org/linux-trace-kernel/171509110271.162236.11047551496319744627.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/trace_selftest.c | 169 ++
 1 file changed, 169 insertions(+)

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index f8f55fd79e53..fcdc744c245e 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -756,6 +756,173 @@ trace_selftest_startup_function(struct tracer *trace, 
struct trace_array *tr)
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+#define BYTE_NUMBER 123
+#define SHORT_NUMBER 12345
+#define WORD_NUMBER 1234567890
+#define LONG_NUMBER 1234567890123456789LL
+
+static int fgraph_store_size __initdata;
+static const char *fgraph_store_type_name __initdata;
+static char *fgraph_error_str __initdata;
+static char fgraph_error_str_buf[128] __initdata;
+
+static __init int store_entry(struct ftrace_graph_ent *trace,
+ struct fgraph_ops *gops)
+{
+   const char *type = fgraph_store_type_name;
+   int size = fgraph_store_size;
+   void *p;
+
+   p = fgraph_reserve_data(gops->idx, size);
+   if (!p) {
+   snprintf(fgraph_error_str_buf, sizeof(fgraph_error_str_buf),
+"Failed to reserve %s\n", type);
+   fgraph_error_str = fgraph_error_str_buf;
+   return 0;
+   }
+
+   switch (fgraph_store_size) {
+   case 1:
+   *(char *)p = BYTE_NUMBER;
+   break;
+   case 2:
+   *(short *)p = SHORT_NUMBER;
+   break;
+   case 4:
+   *(int *)p = WORD_NUMBER;
+   break;
+   case 8:
+   *(long long *)p = LONG_NUMBER;
+   break;
+   }
+
+   return 1;
+}
+
+static __init void store_return(struct ftrace_graph_ret *trace,
+   struct fgraph_ops *gops)
+{
+   const char *type = fgraph_store_type_name;
+   long long expect = 0;
+   long long found = -1;
+   int size;
+   char *p;
+
+   p = fgraph_retrieve_data(gops->idx, &size);
+   if (!p) {
+   snprintf(fgraph_error_str_buf, sizeof(fgraph_error_str_buf),
+"Failed to retrieve %s\n", type);
+   fgraph_error_str = fgraph_error_str_buf;
+   return;
+   }
+   if (fgraph_store_size > size) {
+   snprintf(fgraph_error_str_buf, sizeof(fgraph_error_str_buf),
+"Retrieved size %d is smaller than expected %d\n",
+size, (int)fgraph_store_size);
+   fgraph_error_str = fgraph_error_str_buf;
+   return;
+   }
+
+   switch (fgraph_store_size) {
+   case 1:
+   expect = BYTE_NUMBER;
+   found = *(char *)p;
+   break;
+   case 2:
+   expect = SHORT_NUMBER;
+   found = *(short *)p;
+   break;
+   case 4:
+   expect = WORD_NUMBER;
+   found = *(int *)p;
+   break;
+   case 8:
+   expect = LONG_NUMBER;
+   found = *(long long *)p;
+   break;
+   }
+
+   if (found != expect) {
+   snprintf(fgraph_error_str_buf, sizeof(fgraph_error_str_buf),
+"%s returned not %lld but %lld\n", type, expect, 
found);
+   fgraph_error_str = fgraph_error_str_buf;
+   return;
+   }
+   fgraph_error_str = NULL;
+}
+
+static struct fgraph_ops store_bytes __initdata = {
+   .entryfunc  = store_entry,
+   .retfunc= store_return,
+};
+
+static int __init test_graph_storage_type(const char *name, int size)
+{
+   char *func_name;
+   int len;
+   int ret;
+
+   fgraph_store_type_name = name;
+   fgraph_store_size = size;
+
+   snprintf(fgraph_error_str_buf, sizeof(fgraph_error_str_buf),
+"Failed to execute storage %s\n", name);
+   fgraph_error_str = fgraph_error_str_buf;
+
+   pr_cont("PASSED\n");
+   pr_info("Testing fgraph storage of %d byte%s: ", size, size > 1 ? "s" : 
"");
+
+   func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
+   len = strlen(func_name);
+
+   ret = ftrace_set_filter(&store_bytes.ops, func_name, len, 1);
+   if (ret && ret != -ENODEV) {
+   pr_cont("*Could not set filter* ");
+   return -1;
+   }
+
+   ret = register_ftrace_graph(&store_bytes);
+   if (ret) {
+   pr_warn("Failed to init store_bytes fgraph tracing\n");
+   return -1;

[PATCH 16/20] function_graph: Implement fgraph_reserve_data() and fgraph_retrieve_data()

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Add functions that can be called by a fgraph_ops entryfunc and retfunc to
store state between the entry of the function being traced and the exit of
that same function. The fgraph_ops entryfunc() may call
fgraph_reserve_data() to store up to 32 words onto the task's shadow
ret_stack, and this can then be retrieved by fgraph_retrieve_data() called
by the corresponding retfunc().
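
(Illustration only, not part of the patch: a hypothetical user of the new
API. The entryfunc reserves a slot sized in bytes, and the matching retfunc
gets the same slot back on function exit; trace_clock_local() is used purely
for the example.)

	static int my_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops)
	{
		u64 *ts = fgraph_reserve_data(gops->idx, sizeof(*ts));

		if (!ts)
			return 0;	/* shadow stack exhausted: skip this function */

		*ts = trace_clock_local();
		return 1;		/* ask for the retfunc to be called */
	}

	static void my_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops)
	{
		int size;
		u64 *ts = fgraph_retrieve_data(gops->idx, &size);

		if (ts)
			pr_info("%ps took %llu ns\n", (void *)trace->func,
				trace_clock_local() - *ts);
	}

	static struct fgraph_ops my_gops = {
		.entryfunc	= my_entry,
		.retfunc	= my_return,
	};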

Co-developed with Masami Hiramatsu:
Link: https://lore.kernel.org/linux-trace-kernel/171509109089.162236.11372474169781184034.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/ftrace.h |   3 +
 kernel/trace/fgraph.c  | 196 +++--
 2 files changed, 190 insertions(+), 9 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 80eb1ab3cae3..1f6a6dc1e140 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1046,6 +1046,9 @@ struct fgraph_ops {
int idx;
 };
 
+void *fgraph_reserve_data(int idx, int size_bytes);
+void *fgraph_retrieve_data(int idx, int *size_bytes);
+
 /*
  * Stack of return addresses for functions
  * of a thread.
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 0d536a48f696..4d503b3e45ad 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -32,12 +32,11 @@
  * holds a bitmask and a type (called "bitmap"). The bitmap is defined as:
  *
  * bits:  0 -  9   offset in words from the previous ftrace_ret_stack
- * Currently, this will always be set to FGRAPH_FRAME_OFFSET
- * to get to the fgraph frame.
  *
  * bits: 10 - 11   Type of storage
  *   0 - reserved
  *   1 - bitmap of fgraph_array index
+ *   2 - reserved data
  *
  * For type with "bitmap of fgraph_array index" (FGRAPH_TYPE_BITMAP):
  *  bits: 12 - 27  The bitmap of fgraph_ops fgraph_array index
@@ -50,10 +49,15 @@
  * The top of the ret_stack (when not empty) will always have a reference
  * word that points to the last fgraph frame that was saved.
  *
+ * For reserved data:
+ *  bits: 12 - 17  The size in words that is stored
+ *  bits: 18 - 23  The index of fgraph_array, which shows who is stored
+ *
  * That is, at the end of function_graph_enter, if the first and forth
  * fgraph_ops on the fgraph_array[] (index 0 and 3) needs their retfunc called
- * on the return of the function being traced, this is what will be on the
- * task's shadow ret_stack: (the stack grows upward)
+ * on the return of the function being traced, and the forth fgraph_ops
+ * stored two words of data, this is what will be on the task's shadow
+ * ret_stack: (the stack grows upward)
  *
  *  ret_stack[SHADOW_STACK_OFFSET]
  * | SHADOW_STACK_TASK_VARS(ret_stack)[15]  |
@@ -63,11 +67,21 @@
  * ...
  * || <- task->curr_ret_stack
  * ++
+ * | (3 << 12) | (3 << 10) | FGRAPH_FRAME_OFFSET|
+ * | *or put another way*   |
+ * | (3 << FGRAPH_DATA_INDEX_SHIFT)| \  | This is for fgraph_ops[3].
+ * | ((2 - 1) << FGRAPH_DATA_SHIFT)| \  | The data size is 2 words.
+ * | (FGRAPH_TYPE_DATA << FGRAPH_TYPE_SHIFT)| \ |
+ * | (offset2:FGRAPH_FRAME_OFFSET+3)| <- the offset2 is from here
+ * ++ ( It is 4 words from the ret_stack)
+ * |STORED DATA WORD 2  |
+ * |STORED DATA WORD 1  |
+ * ++
  * | (9 << 12) | (1 << 10) | FGRAPH_FRAME_OFFSET|
  * | *or put another way*   |
  * | (BIT(3)|BIT(0)) << FGRAPH_INDEX_SHIFT | \  |
  * | FGRAPH_TYPE_BITMAP << FGRAPH_TYPE_SHIFT| \ |
- * | (offset:FGRAPH_FRAME_OFFSET)   | <- the offset is from here
+ * | (offset1:FGRAPH_FRAME_OFFSET)  | <- the offset1 is from here
  * ++
  * | struct ftrace_ret_stack|
  * |   (stores the saved ret pointer)   | <- the offset points here
@@ -101,6 +115,7 @@
 enum {
FGRAPH_TYPE_RESERVED= 0,
FGRAPH_TYPE_BITMAP  = 1,
+   FGRAPH_TYPE_DATA= 2,
 };
 
 /*
@@ -111,6 +126,26 @@ enum {
 #define FGRAPH_INDEX_MASK  GENMASK(FGRAPH_INDEX_BITS - 1, 0)
 #define FGRAPH_INDEX_SHIFT (FGRAPH_TYPE_SHIFT + FGRAPH_TYPE_BITS)
 
+/*
+ * For DATA type:
+ *  FGRAPH_DATA (12-17) bits hold the size of data (in words)
+ *  FGRAPH_INDEX (18-23) bits hold the index for which gops->idx the data is for
+ *
+ * Note:
+ *  data_size == 0 means 1 word, and 31 (=2^5 - 1) means 32 words.
+ */
+#define FGRAPH_DATA_BITS   5
+#define FGRAPH_DATA_MASK   GENMASK(FGRAPH_DATA_BITS - 1, 0)
+#define FGRAPH_DATA_SHIFT  (FGRAPH_TYPE_SHIFT + 

[PATCH 15/20] function_graph: Move graph notrace bit to shadow stack global var

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Using task->trace_recursion for the function graph no-trace logic was a bit
of an abuse of that variable. Now that there are per-task variables on the
shadow stack for registered graph tracers, use those instead.
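
(Illustration only, not part of the patch: how the entry and return callbacks
use the per-task variable with the TRACE_GRAPH_NOTRACE mask defined below,
instead of task->trace_recursion.)

	static int sketch_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops)
	{
		unsigned long *task_var = fgraph_get_task_var(gops);

		if (*task_var & TRACE_GRAPH_NOTRACE)
			return 0;		/* inside a notrace function */

		if (ftrace_graph_notrace_addr(trace->func)) {
			*task_var |= TRACE_GRAPH_NOTRACE;
			return 1;		/* retfunc must run to clear the bit */
		}
		return 1;
	}

	static void sketch_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops)
	{
		unsigned long *task_var = fgraph_get_task_var(gops);

		if (*task_var & TRACE_GRAPH_NOTRACE)
			*task_var &= ~TRACE_GRAPH_NOTRACE;
	}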

Link: https://lore.kernel.org/linux-trace-kernel/171509107907.162236.6564679266777519065.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/trace_recursion.h  |  7 ---
 kernel/trace/trace.h |  9 +
 kernel/trace/trace_functions_graph.c | 10 ++
 3 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index fdfb6f66718a..ae04054a1be3 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -44,13 +44,6 @@ enum {
  */
TRACE_IRQ_BIT,
 
-   /*
-* To implement set_graph_notrace, if this bit is set, we ignore
-* function graph tracing of called functions, until the return
-* function is called to clear it.
-*/
-   TRACE_GRAPH_NOTRACE_BIT,
-
/* Used to prevent recursion recording from recursing. */
TRACE_RECORD_RECURSION_BIT,
 };
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5398b6889027..c91953fb58f5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -919,8 +919,17 @@ enum {
 
TRACE_GRAPH_DEPTH_START_BIT,
TRACE_GRAPH_DEPTH_END_BIT,
+
+   /*
+* To implement set_graph_notrace, if this bit is set, we ignore
+* function graph tracing of called functions, until the return
+* function is called to clear it.
+*/
+   TRACE_GRAPH_NOTRACE_BIT,
 };
 
+#define TRACE_GRAPH_NOTRACE    (1 << TRACE_GRAPH_NOTRACE_BIT)
+
 static inline unsigned long ftrace_graph_depth(unsigned long *task_var)
 {
return (*task_var >> TRACE_GRAPH_DEPTH_START_BIT) & 3;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 66cce73e94f8..13d0387ac6a6 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -130,6 +130,7 @@ static inline int ftrace_graph_ignore_irqs(void)
 int trace_graph_entry(struct ftrace_graph_ent *trace,
  struct fgraph_ops *gops)
 {
+   unsigned long *task_var = fgraph_get_task_var(gops);
struct trace_array *tr = gops->private;
struct trace_array_cpu *data;
unsigned long flags;
@@ -138,7 +139,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
int ret;
int cpu;
 
-   if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT))
+   if (*task_var & TRACE_GRAPH_NOTRACE)
return 0;
 
/*
@@ -149,7 +150,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
 * returning from the function.
 */
if (ftrace_graph_notrace_addr(trace->func)) {
-   trace_recursion_set(TRACE_GRAPH_NOTRACE_BIT);
+   *task_var |= TRACE_GRAPH_NOTRACE_BIT;
/*
 * Need to return 1 to have the return called
 * that will clear the NOTRACE bit.
@@ -240,6 +241,7 @@ void __trace_graph_return(struct trace_array *tr,
 void trace_graph_return(struct ftrace_graph_ret *trace,
struct fgraph_ops *gops)
 {
+   unsigned long *task_var = fgraph_get_task_var(gops);
struct trace_array *tr = gops->private;
struct trace_array_cpu *data;
unsigned long flags;
@@ -249,8 +251,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace,
 
ftrace_graph_addr_finish(gops, trace);
 
-   if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) {
-   trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT);
+   if (*task_var & TRACE_GRAPH_NOTRACE) {
+   *task_var &= ~TRACE_GRAPH_NOTRACE;
return;
}
 
-- 
2.43.0





[PATCH 14/20] function_graph: Move graph depth stored data to shadow stack global var

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Using task->trace_recursion for the function graph depth logic was a bit of
an abuse of that variable. Now that there are per-task variables on the
shadow stack for registered graph tracers, use those instead.

Link: https://lore.kernel.org/linux-trace-kernel/171509106728.162236.2398372644430125344.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/trace_recursion.h | 29 
 kernel/trace/trace.h| 34 +++--
 2 files changed, 32 insertions(+), 31 deletions(-)

diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index 02e6afc6d7fe..fdfb6f66718a 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -44,25 +44,6 @@ enum {
  */
TRACE_IRQ_BIT,
 
-   /*
-* In the very unlikely case that an interrupt came in
-* at a start of graph tracing, and we want to trace
-* the function in that interrupt, the depth can be greater
-* than zero, because of the preempted start of a previous
-* trace. In an even more unlikely case, depth could be 2
-* if a softirq interrupted the start of graph tracing,
-* followed by an interrupt preempting a start of graph
-* tracing in the softirq, and depth can even be 3
-* if an NMI came in at the start of an interrupt function
-* that preempted a softirq start of a function that
-* preempted normal context Luckily, it can't be
-* greater than 3, so the next two bits are a mask
-* of what the depth is when we set TRACE_GRAPH_FL
-*/
-
-   TRACE_GRAPH_DEPTH_START_BIT,
-   TRACE_GRAPH_DEPTH_END_BIT,
-
/*
 * To implement set_graph_notrace, if this bit is set, we ignore
 * function graph tracing of called functions, until the return
@@ -78,16 +59,6 @@ enum {
 #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0)
 #define trace_recursion_test(bit)  ((current)->trace_recursion & (1<<(bit)))
 
-#define trace_recursion_depth() \
-   (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3)
-#define trace_recursion_set_depth(depth) \
-   do {\
-   current->trace_recursion &= \
-   ~(3 << TRACE_GRAPH_DEPTH_START_BIT);\
-   current->trace_recursion |= \
-   ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT;   \
-   } while (0)
-
 #define TRACE_CONTEXT_BITS 4
 
 #define TRACE_FTRACE_START TRACE_FTRACE_BIT
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6f889cc6097e..5398b6889027 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -900,8 +900,38 @@ extern void free_fgraph_ops(struct trace_array *tr);
 
 enum {
TRACE_GRAPH_FL  = 1,
+
+   /*
+* In the very unlikely case that an interrupt came in
+* at a start of graph tracing, and we want to trace
+* the function in that interrupt, the depth can be greater
+* than zero, because of the preempted start of a previous
+* trace. In an even more unlikely case, depth could be 2
+* if a softirq interrupted the start of graph tracing,
+* followed by an interrupt preempting a start of graph
+* tracing in the softirq, and depth can even be 3
+* if an NMI came in at the start of an interrupt function
+* that preempted a softirq start of a function that
+* preempted normal context Luckily, it can't be
+* greater than 3, so the next two bits are a mask
+* of what the depth is when we set TRACE_GRAPH_FL
+*/
+
+   TRACE_GRAPH_DEPTH_START_BIT,
+   TRACE_GRAPH_DEPTH_END_BIT,
 };
 
+static inline unsigned long ftrace_graph_depth(unsigned long *task_var)
+{
+   return (*task_var >> TRACE_GRAPH_DEPTH_START_BIT) & 3;
+}
+
+static inline void ftrace_graph_set_depth(unsigned long *task_var, int depth)
+{
+   *task_var &= ~(3 << TRACE_GRAPH_DEPTH_START_BIT);
+   *task_var |= (depth & 3) << TRACE_GRAPH_DEPTH_START_BIT;
+}
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 extern struct ftrace_hash __rcu *ftrace_graph_hash;
 extern struct ftrace_hash __rcu *ftrace_graph_notrace_hash;
@@ -934,7 +964,7 @@ ftrace_graph_addr(unsigned long *task_var, struct 
ftrace_graph_ent *trace)
 * when the depth is zero.
 */
*task_var |= TRACE_GRAPH_FL;
-   trace_recursion_set_depth(trace->depth);
+   ftrace_graph_set_depth(task_var, trace->depth);
 
/*
 * If no irqs are to be traced, but a set_graph_function
@@ -959,7 +989,7 @@ 

[PATCH 13/20] function_graph: Move set_graph_function tests to shadow stack global var

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Using task->trace_recursion for the set_graph_function logic was a bit of an
abuse of that variable. Now that there are per-task variables on the shadow
stack for registered graph tracers, use those instead.

Link: https://lore.kernel.org/linux-trace-kernel/171509105520.162236.10339831553995971290.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/trace_recursion.h  |  5 +
 kernel/trace/trace.h | 32 ++--
 kernel/trace/trace_functions_graph.c |  6 +++---
 kernel/trace/trace_irqsoff.c |  4 ++--
 kernel/trace/trace_sched_wakeup.c|  4 ++--
 5 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h
index 24ea8ac049b4..02e6afc6d7fe 100644
--- a/include/linux/trace_recursion.h
+++ b/include/linux/trace_recursion.h
@@ -44,9 +44,6 @@ enum {
  */
TRACE_IRQ_BIT,
 
-   /* Set if the function is in the set_graph_function file */
-   TRACE_GRAPH_BIT,
-
/*
 * In the very unlikely case that an interrupt came in
 * at a start of graph tracing, and we want to trace
@@ -60,7 +57,7 @@ enum {
 * that preempted a softirq start of a function that
 * preempted normal context Luckily, it can't be
 * greater than 3, so the next two bits are a mask
-* of what the depth is when we set TRACE_GRAPH_BIT
+* of what the depth is when we set TRACE_GRAPH_FL
 */
 
TRACE_GRAPH_DEPTH_START_BIT,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ced61eacf40f..6f889cc6097e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -898,11 +898,16 @@ extern void init_array_fgraph_ops(struct trace_array *tr, 
struct ftrace_ops *ops
 extern int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops);
 extern void free_fgraph_ops(struct trace_array *tr);
 
+enum {
+   TRACE_GRAPH_FL  = 1,
+};
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 extern struct ftrace_hash __rcu *ftrace_graph_hash;
 extern struct ftrace_hash __rcu *ftrace_graph_notrace_hash;
 
-static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
+static inline int
+ftrace_graph_addr(unsigned long *task_var, struct ftrace_graph_ent *trace)
 {
unsigned long addr = trace->func;
int ret = 0;
@@ -924,12 +929,11 @@ static inline int ftrace_graph_addr(struct 
ftrace_graph_ent *trace)
}
 
if (ftrace_lookup_ip(hash, addr)) {
-
/*
 * This needs to be cleared on the return functions
 * when the depth is zero.
 */
-   trace_recursion_set(TRACE_GRAPH_BIT);
+   *task_var |= TRACE_GRAPH_FL;
trace_recursion_set_depth(trace->depth);
 
/*
@@ -949,11 +953,14 @@ static inline int ftrace_graph_addr(struct 
ftrace_graph_ent *trace)
return ret;
 }
 
-static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
+static inline void
+ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftrace_graph_ret *trace)
 {
-   if (trace_recursion_test(TRACE_GRAPH_BIT) &&
+   unsigned long *task_var = fgraph_get_task_var(gops);
+
+   if ((*task_var & TRACE_GRAPH_FL) &&
trace->depth == trace_recursion_depth())
-   trace_recursion_clear(TRACE_GRAPH_BIT);
+   *task_var &= ~TRACE_GRAPH_FL;
 }
 
 static inline int ftrace_graph_notrace_addr(unsigned long addr)
@@ -980,7 +987,7 @@ static inline int ftrace_graph_notrace_addr(unsigned long 
addr)
 }
 
 #else
-static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace)
+static inline int ftrace_graph_addr(unsigned long *task_var, struct ftrace_graph_ent *trace)
 {
return 1;
 }
@@ -989,17 +996,20 @@ static inline int ftrace_graph_notrace_addr(unsigned long 
addr)
 {
return 0;
 }
-static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace)
+static inline void ftrace_graph_addr_finish(struct fgraph_ops *gops, struct ftrace_graph_ret *trace)
 { }
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
 extern unsigned int fgraph_max_depth;
 
-static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace)
+static inline bool
+ftrace_graph_ignore_func(struct fgraph_ops *gops, struct ftrace_graph_ent *trace)
 {
+   unsigned long *task_var = fgraph_get_task_var(gops);
+
/* trace it when it is-nested-in or is a function enabled. */
-   return !(trace_recursion_test(TRACE_GRAPH_BIT) ||
-ftrace_graph_addr(trace)) ||
+   return !((*task_var & TRACE_GRAPH_FL) ||
+ftrace_graph_addr(task_var, trace)) ||
(trace->depth < 0) ||
(fgraph_max_depth && trace->depth >= fgraph_max_depth);
 }
diff --git a/kernel/trace/trace_functions_graph.c 

[PATCH 12/20] function_graph: Add "task variables" per task for fgraph_ops

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Add a "task variables" array on the tasks shadow ret_stack that is the
size of longs for each possible registered fgraph_ops. That's a total
of 16, taking up 8 * 16 = 128 bytes (out of a page size 4k).

This will allow for fgraph_ops to do specific features on a per task basis
having a way to maintain state for each task.
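
(Illustration only, not part of the patch: a hypothetical fgraph_ops callback
pair using its per-task slot through fgraph_get_task_var(), here to keep a
per-task nesting count that is private to this fgraph_ops.)

	static int count_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops)
	{
		unsigned long *nested = fgraph_get_task_var(gops);

		(*nested)++;		/* private to this gops and this task */
		return 1;
	}

	static void count_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops)
	{
		unsigned long *nested = fgraph_get_task_var(gops);

		if (*nested)
			(*nested)--;
	}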

Co-developed with Masami Hiramatsu:
Link: https://lore.kernel.org/linux-trace-kernel/171509104383.162236.12239656156685718550.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/ftrace.h |  1 +
 kernel/trace/fgraph.c  | 74 +-
 2 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index e5b41683ffb9..80eb1ab3cae3 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1087,6 +1087,7 @@ ftrace_graph_get_ret_stack(struct task_struct *task, int 
skip);
 
 unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
unsigned long ret, unsigned long *retp);
+unsigned long *fgraph_get_task_var(struct fgraph_ops *gops);
 
 /*
  * Sometimes we don't want to trace a function with the function
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 8e029d5e94f6..0d536a48f696 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -55,6 +55,10 @@
  * on the return of the function being traced, this is what will be on the
  * task's shadow ret_stack: (the stack grows upward)
  *
+ *  ret_stack[SHADOW_STACK_OFFSET]
+ * | SHADOW_STACK_TASK_VARS(ret_stack)[15]  |
+ * ...
+ * | SHADOW_STACK_TASK_VARS(ret_stack)[0]   |
  *  ret_stack[SHADOW_STACK_MAX_OFFSET]
  * ...
  * || <- task->curr_ret_stack
@@ -117,11 +121,19 @@ enum {
 #define SHADOW_STACK_SIZE  (PAGE_SIZE)
 #define SHADOW_STACK_OFFSET(SHADOW_STACK_SIZE / sizeof(long))
 /* Leave on a buffer at the end */
-#define SHADOW_STACK_MAX_OFFSET (SHADOW_STACK_OFFSET - (FGRAPH_FRAME_OFFSET + 1))
+#define SHADOW_STACK_MAX_OFFSET \
+   (SHADOW_STACK_OFFSET - (FGRAPH_FRAME_OFFSET + 1 + FGRAPH_ARRAY_SIZE))
 
 /* RET_STACK():Return the frame from a given @offset from task @t */
 #define RET_STACK(t, offset) ((struct ftrace_ret_stack *)(&(t)->ret_stack[offset]))
 
+/*
+ * Each fgraph_ops has a reservered unsigned long at the end (top) of the
+ * ret_stack to store task specific state.
+ */
+#define SHADOW_STACK_TASK_VARS(ret_stack) \
+   ((unsigned long *)(&(ret_stack)[SHADOW_STACK_OFFSET - FGRAPH_ARRAY_SIZE]))
+
 DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
 int ftrace_graph_active;
 
@@ -212,6 +224,44 @@ static void return_run(struct ftrace_graph_ret *trace, 
struct fgraph_ops *ops)
 {
 }
 
+static void ret_stack_set_task_var(struct task_struct *t, int idx, long val)
+{
+   unsigned long *gvals = SHADOW_STACK_TASK_VARS(t->ret_stack);
+
+   gvals[idx] = val;
+}
+
+static unsigned long *
+ret_stack_get_task_var(struct task_struct *t, int idx)
+{
+   unsigned long *gvals = SHADOW_STACK_TASK_VARS(t->ret_stack);
+
+   return &gvals[idx];
+}
+
+static void ret_stack_init_task_vars(unsigned long *ret_stack)
+{
+   unsigned long *gvals = SHADOW_STACK_TASK_VARS(ret_stack);
+
+   memset(gvals, 0, sizeof(*gvals) * FGRAPH_ARRAY_SIZE);
+}
+
+/**
+ * fgraph_get_task_var - retrieve a task specific state variable
+ * @gops: The ftrace_ops that owns the task specific variable
+ *
+ * Every registered fgraph_ops has a task state variable
+ * reserved on the task's ret_stack. This function returns the
+ * address to that variable.
+ *
+ * Returns the address to the fgraph_ops @gops tasks specific
+ * unsigned long variable.
+ */
+unsigned long *fgraph_get_task_var(struct fgraph_ops *gops)
+{
+   return ret_stack_get_task_var(current, gops->idx);
+}
+
 /*
  * @offset: The offset into @t->ret_stack to find the ret_stack entry
  * @frame_offset: Where to place the offset into @t->ret_stack of that entry
@@ -803,6 +853,7 @@ static int alloc_retstack_tasklist(unsigned long 
**ret_stack_list)
 
if (t->ret_stack == NULL) {
atomic_set(&t->trace_overrun, 0);
+   ret_stack_init_task_vars(ret_stack_list[start]);
t->curr_ret_stack = 0;
t->curr_ret_depth = -1;
/* Make sure the tasks see the 0 first: */
@@ -863,6 +914,7 @@ static void
 graph_init_task(struct task_struct *t, unsigned long *ret_stack)
 {
atomic_set(&t->trace_overrun, 0);
+   ret_stack_init_task_vars(ret_stack);
t->ftrace_timestamp = 0;
t->curr_ret_stack = 0;
t->curr_ret_depth = -1;
@@ -961,6 +1013,24 @@ static int start_graph_tracing(void)
return ret;
 }
 
+static void init_task_vars(int idx)
+{
+   struct 

[PATCH 10/20] function_graph: Have the instances use their own ftrace_ops for filtering

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Allow instances to have their own ftrace_ops as part of the fgraph_ops, which
makes the function_graph tracer filter on the set_ftrace_filter file of the
instance and not the top instance.

Note that this also requires updating ftrace_graph_func() to call the new
function_graph_enter_ops() instead of function_graph_enter(), so that it
avoids pushing onto the shadow stack multiple times for the same function.
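
(Illustration only, not part of the patch: the relationship this sets up,
with field order illustrative. Each fgraph_ops now embeds the ftrace_ops used
for the instance's filtering, and the arch ftrace_graph_func() hooks below
recover the owning fgraph_ops from the ftrace_ops they are handed; the
to_fgraph_ops() helper name is made up for the sketch.)

	struct fgraph_ops {
		trace_func_graph_ent_t	entryfunc;
		trace_func_graph_ret_t	retfunc;
		struct ftrace_ops	ops;	/* instance's own filter ops */
		void			*private;
		int			idx;
	};

	static inline struct fgraph_ops *to_fgraph_ops(struct ftrace_ops *op)
	{
		return container_of(op, struct fgraph_ops, ops);
	}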

Co-developed with Masami Hiramatsu:
Link: https://lore.kernel.org/linux-trace-kernel/171509102088.162236.15758883237657317789.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 arch/arm64/kernel/ftrace.c   |  21 -
 arch/loongarch/kernel/ftrace_dyn.c   |  15 ++-
 arch/powerpc/kernel/trace/ftrace.c   |   3 +-
 arch/riscv/kernel/ftrace.c   |  15 ++-
 arch/x86/kernel/ftrace.c |  19 +++-
 include/linux/ftrace.h   |   6 ++
 kernel/trace/fgraph.c| 131 ---
 kernel/trace/ftrace.c|   4 +-
 kernel/trace/trace.h |  16 ++--
 kernel/trace/trace_functions.c   |   2 +-
 kernel/trace/trace_functions_graph.c |   8 +-
 11 files changed, 190 insertions(+), 50 deletions(-)

diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c
index a650f5e11fc5..b96740829798 100644
--- a/arch/arm64/kernel/ftrace.c
+++ b/arch/arm64/kernel/ftrace.c
@@ -481,7 +481,26 @@ void prepare_ftrace_return(unsigned long self_addr, 
unsigned long *parent,
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
   struct ftrace_ops *op, struct ftrace_regs *fregs)
 {
-   prepare_ftrace_return(ip, &fregs->lr, fregs->fp);
+   struct fgraph_ops *gops = container_of(op, struct fgraph_ops, ops);
+   unsigned long frame_pointer = fregs->fp;
+   unsigned long *parent = &fregs->lr;
+   int bit;
+
+   if (unlikely(ftrace_graph_is_dead()))
+   return;
+
+   if (unlikely(atomic_read(&current->tracing_graph_pause)))
+   return;
+
+   bit = ftrace_test_recursion_trylock(ip, *parent);
+   if (bit < 0)
+   return;
+
+   if (!function_graph_enter_ops(*parent, ip, frame_pointer,
+ (void *)frame_pointer, gops))
+   *parent = (unsigned long)&return_to_handler;
+
+   ftrace_test_recursion_unlock(bit);
 }
 #else
 /*
diff --git a/arch/loongarch/kernel/ftrace_dyn.c b/arch/loongarch/kernel/ftrace_dyn.c
index bff058317062..77acb69ad153 100644
--- a/arch/loongarch/kernel/ftrace_dyn.c
+++ b/arch/loongarch/kernel/ftrace_dyn.c
@@ -241,10 +241,21 @@ void prepare_ftrace_return(unsigned long self_addr, 
unsigned long *parent)
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
   struct ftrace_ops *op, struct ftrace_regs *fregs)
 {
+   struct fgraph_ops *gops = container_of(op, struct fgraph_ops, ops);
+   unsigned long return_hooker = (unsigned long)&return_to_handler;
 struct pt_regs *regs = &fregs->regs;
-   unsigned long *parent = (unsigned long *)&regs->regs[1];
+   unsigned long *parent;
+   unsigned long old;
+
+   parent = (unsigned long *)&regs->regs[1];
 
-   prepare_ftrace_return(ip, (unsigned long *)parent);
+   if (unlikely(atomic_read(&current->tracing_graph_pause)))
+   return;
+
+   old = *parent;
+
+   if (!function_graph_enter_ops(old, ip, 0, parent, gops))
+   *parent = return_hooker;
 }
 #else
 static int ftrace_modify_graph_caller(bool enable)
diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c
index d8d6b4fd9a14..4a9294821c0d 100644
--- a/arch/powerpc/kernel/trace/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -421,6 +421,7 @@ int __init ftrace_dyn_arch_init(void)
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
   struct ftrace_ops *op, struct ftrace_regs *fregs)
 {
+   struct fgraph_ops *gops = container_of(op, struct fgraph_ops, ops);
unsigned long sp = fregs->regs.gpr[1];
int bit;
 
@@ -434,7 +435,7 @@ void ftrace_graph_func(unsigned long ip, unsigned long 
parent_ip,
if (bit < 0)
goto out;
 
-   if (!function_graph_enter(parent_ip, ip, 0, (unsigned long *)sp))
+   if (!function_graph_enter_ops(parent_ip, ip, 0, (unsigned long *)sp, gops))
parent_ip = ppc_function_entry(return_to_handler);
 
ftrace_test_recursion_unlock(bit);
diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
index 4f4987a6d83d..7bcb5f321523 100644
--- a/arch/riscv/kernel/ftrace.c
+++ b/arch/riscv/kernel/ftrace.c
@@ -218,10 +218,23 @@ void prepare_ftrace_return(unsigned long *parent, 
unsigned long self_addr,
 void ftrace_graph_func(unsigned long ip, unsigned long parent_ip,
   struct ftrace_ops *op, struct ftrace_regs *fregs)
 {
+   struct fgraph_ops *gops 

[PATCH 11/20] function_graph: Use a simple LRU for fgraph_array index number

2024-05-24 Thread Steven Rostedt
From: "Masami Hiramatsu (Google)" 

Since the fgraph_array index is used for the bitmap on the shadow
stack, some entries may remain there after a function_graph instance is
removed. Thus, if another instance reuses a fgraph_array index soon
after it is released, fgraph may mistakenly call the newer callback
for entries that were pushed by the older instance.
To avoid reusing a fgraph_array index soon after releasing it, introduce
a simple LRU table for managing the index numbers. This will reduce the
possibility of this confusion.
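
(Illustration only, not part of the patch: the reuse ordering the ring of
free indexes provides, assuming FGRAPH_ARRAY_SIZE is 16. A released index
goes to the tail and allocation takes from the head, so a just-freed index
is not handed out again until the other free ones have been used.)

	static void lru_reuse_example(void)
	{
		int a, b;

		fgraph_lru_init();		/* free list: 0, 1, ..., 15 */

		a = fgraph_lru_alloc_index();	/* a == 0 */
		fgraph_lru_release_index(a);	/* 0 goes to the back of the ring */

		b = fgraph_lru_alloc_index();	/* b == 1; index 0 will not be
						 * reused until the remaining
						 * free indexes are consumed */
	}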

Link: https://lore.kernel.org/linux-trace-kernel/171509103267.162236.6885097397289135378.stgit@devnote2

Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/fgraph.c | 71 ++-
 1 file changed, 50 insertions(+), 21 deletions(-)

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 67fa7fbf6aac..8e029d5e94f6 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -125,10 +125,48 @@ enum {
 DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
 int ftrace_graph_active;
 
-static int fgraph_array_cnt;
-
 static struct fgraph_ops *fgraph_array[FGRAPH_ARRAY_SIZE];
 
+/* LRU index table for fgraph_array */
+static int fgraph_lru_table[FGRAPH_ARRAY_SIZE];
+static int fgraph_lru_next;
+static int fgraph_lru_last;
+
+/* Initialize fgraph_lru_table with unused index */
+static void fgraph_lru_init(void)
+{
+   int i;
+
+   for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
+   fgraph_lru_table[i] = i;
+}
+
+/* Release the used index to the LRU table */
+static int fgraph_lru_release_index(int idx)
+{
+   if (idx < 0 || idx >= FGRAPH_ARRAY_SIZE ||
+   WARN_ON_ONCE(fgraph_lru_table[fgraph_lru_last] != -1))
+   return -1;
+
+   fgraph_lru_table[fgraph_lru_last] = idx;
+   fgraph_lru_last = (fgraph_lru_last + 1) % FGRAPH_ARRAY_SIZE;
+   return 0;
+}
+
+/* Allocate a new index from LRU table */
+static int fgraph_lru_alloc_index(void)
+{
+   int idx = fgraph_lru_table[fgraph_lru_next];
+
+   /* No id is available */
+   if (idx == -1)
+   return -1;
+
+   fgraph_lru_table[fgraph_lru_next] = -1;
+   fgraph_lru_next = (fgraph_lru_next + 1) % FGRAPH_ARRAY_SIZE;
+   return idx;
+}
+
 /* Get the FRAME_OFFSET from the word from the @offset on ret_stack */
 static inline int get_frame_offset(struct task_struct *t, int offset)
 {
@@ -375,7 +413,7 @@ int function_graph_enter(unsigned long ret, unsigned long 
func,
if (offset < 0)
goto out;
 
-   for (i = 0; i < fgraph_array_cnt; i++) {
+   for (i = 0; i < FGRAPH_ARRAY_SIZE; i++) {
struct fgraph_ops *gops = fgraph_array[i];
 
if (gops == &fgraph_stub)
@@ -927,7 +965,7 @@ int register_ftrace_graph(struct fgraph_ops *gops)
 {
int command = 0;
int ret = 0;
-   int i;
+   int i = -1;
 
mutex_lock(&ftrace_lock);
 
@@ -943,21 +981,16 @@ int register_ftrace_graph(struct fgraph_ops *gops)
/* The array must always have real data on it */
for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
fgraph_array[i] = &fgraph_stub;
+   fgraph_lru_init();
}
 
-   /* Look for an available spot */
-   for (i = 0; i < FGRAPH_ARRAY_SIZE; i++) {
-   if (fgraph_array[i] == &fgraph_stub)
-   break;
-   }
-   if (i >= FGRAPH_ARRAY_SIZE) {
+   i = fgraph_lru_alloc_index();
+   if (i < 0 || WARN_ON_ONCE(fgraph_array[i] != &fgraph_stub)) {
ret = -ENOSPC;
goto out;
}
 
fgraph_array[i] = gops;
-   if (i + 1 > fgraph_array_cnt)
-   fgraph_array_cnt = i + 1;
gops->idx = i;
 
ftrace_graph_active++;
@@ -981,6 +1014,7 @@ int register_ftrace_graph(struct fgraph_ops *gops)
if (ret) {
fgraph_array[i] = &fgraph_stub;
ftrace_graph_active--;
+   fgraph_lru_release_index(i);
}
 out:
mutex_unlock(_lock);
@@ -990,25 +1024,20 @@ int register_ftrace_graph(struct fgraph_ops *gops)
 void unregister_ftrace_graph(struct fgraph_ops *gops)
 {
int command = 0;
-   int i;
 
mutex_lock(&ftrace_lock);
 
if (unlikely(!ftrace_graph_active))
goto out;
 
-   if (unlikely(gops->idx < 0 || gops->idx >= fgraph_array_cnt))
+   if (unlikely(gops->idx < 0 || gops->idx >= FGRAPH_ARRAY_SIZE ||
+fgraph_array[gops->idx] != gops))
goto out;
 
-   WARN_ON_ONCE(fgraph_array[gops->idx] != gops);
+   if (fgraph_lru_release_index(gops->idx) < 0)
+   goto out;
 
fgraph_array[gops->idx] = &fgraph_stub;
-   if (gops->idx + 1 == fgraph_array_cnt) {
-   i = gops->idx;
-   while (i >= 0 && fgraph_array[i] == &fgraph_stub)
-   i--;
-   fgraph_array_cnt = i + 1;
-   }
 

[PATCH 09/20] ftrace: Allow ftrace startup flags to exist without dynamic ftrace

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Some of the flags for ftrace_startup() may be exposed even when
CONFIG_DYNAMIC_FTRACE is not configured in. This is fine, as the difference
between dynamic ftrace and static ftrace is handled within the internals of
ftrace itself. There is no need to have use cases fail to compile because
dynamic ftrace is disabled.

This change is needed to move some of the logic of what is passed to
ftrace_startup() out of the parameters of ftrace_startup().

Link: https://lore.kernel.org/linux-trace-kernel/171509100890.162236.436235034254912.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/ftrace.h | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index f7a948337d39..942a1f767280 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -509,6 +509,15 @@ static inline void stack_tracer_disable(void) { }
 static inline void stack_tracer_enable(void) { }
 #endif
 
+enum {
+   FTRACE_UPDATE_CALLS = (1 << 0),
+   FTRACE_DISABLE_CALLS= (1 << 1),
+   FTRACE_UPDATE_TRACE_FUNC= (1 << 2),
+   FTRACE_START_FUNC_RET   = (1 << 3),
+   FTRACE_STOP_FUNC_RET= (1 << 4),
+   FTRACE_MAY_SLEEP= (1 << 5),
+};
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 void ftrace_arch_code_modify_prepare(void);
@@ -603,15 +612,6 @@ void ftrace_set_global_notrace(unsigned char *buf, int 
len, int reset);
 void ftrace_free_filter(struct ftrace_ops *ops);
 void ftrace_ops_set_global_filter(struct ftrace_ops *ops);
 
-enum {
-   FTRACE_UPDATE_CALLS = (1 << 0),
-   FTRACE_DISABLE_CALLS= (1 << 1),
-   FTRACE_UPDATE_TRACE_FUNC= (1 << 2),
-   FTRACE_START_FUNC_RET   = (1 << 3),
-   FTRACE_STOP_FUNC_RET= (1 << 4),
-   FTRACE_MAY_SLEEP= (1 << 5),
-};
-
 /*
  * The FTRACE_UPDATE_* enum is used to pass information back
  * from the ftrace_update_record() and ftrace_test_record()
-- 
2.43.0





[PATCH 08/20] ftrace: Allow function_graph tracer to be enabled in instances

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Now that function graph tracing can handle more than one user, allow it to
be enabled in the ftrace instances. Note that the filtering of the functions
is still shared with the top level set_ftrace_filter and friends, as well as
the graph and nograph files.

Co-developed with Masami Hiramatsu:
Link: https://lore.kernel.org/linux-trace-kernel/171509099743.162236.1699959255446248163.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/ftrace.h   |  1 +
 kernel/trace/ftrace.c|  1 +
 kernel/trace/trace.h | 13 +-
 kernel/trace/trace_functions.c   |  8 
 kernel/trace/trace_functions_graph.c | 65 +---
 kernel/trace/trace_selftest.c|  4 +-
 6 files changed, 64 insertions(+), 28 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 032974f55c5b..f7a948337d39 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1041,6 +1041,7 @@ extern int ftrace_graph_entry_stub(struct 
ftrace_graph_ent *trace, struct fgraph
 struct fgraph_ops {
trace_func_graph_ent_t  entryfunc;
trace_func_graph_ret_t  retfunc;
+   void*private;
int idx;
 };
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d18387c0642d..b85f00b0ffe7 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -7327,6 +7327,7 @@ __init void ftrace_init_global_array_ops(struct 
trace_array *tr)
tr->ops = _ops;
tr->ops->private = tr;
ftrace_init_trace_array(tr);
+   init_array_fgraph_ops(tr);
 }
 
 void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2575ec243350..a5070f9b977b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -397,6 +397,9 @@ struct trace_array {
struct ftrace_ops   *ops;
struct trace_pid_list   __rcu *function_pids;
struct trace_pid_list   __rcu *function_no_pids;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+   struct fgraph_ops   *gops;
+#endif
 #ifdef CONFIG_DYNAMIC_FTRACE
/* All of these are protected by the ftrace_lock */
struct list_headfunc_probes;
@@ -681,7 +684,6 @@ void print_trace_header(struct seq_file *m, struct 
trace_iterator *iter);
 
 void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops 
*gops);
 int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops);
-void set_graph_array(struct trace_array *tr);
 
 void tracing_start_cmdline_record(void);
 void tracing_stop_cmdline_record(void);
@@ -892,6 +894,9 @@ extern int __trace_graph_entry(struct trace_array *tr,
 extern void __trace_graph_return(struct trace_array *tr,
 struct ftrace_graph_ret *trace,
 unsigned int trace_ctx);
+extern void init_array_fgraph_ops(struct trace_array *tr);
+extern int allocate_fgraph_ops(struct trace_array *tr);
+extern void free_fgraph_ops(struct trace_array *tr);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 extern struct ftrace_hash __rcu *ftrace_graph_hash;
@@ -1004,6 +1009,12 @@ print_graph_function_flags(struct trace_iterator *iter, 
u32 flags)
 {
return TRACE_TYPE_UNHANDLED;
 }
+static inline void init_array_fgraph_ops(struct trace_array *tr) { }
+static inline int allocate_fgraph_ops(struct trace_array *tr)
+{
+   return 0;
+}
+static inline void free_fgraph_ops(struct trace_array *tr) { }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
 extern struct list_head ftrace_pids;
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 9f1bfbe105e8..8e8da0d0ee52 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -80,6 +80,7 @@ void ftrace_free_ftrace_ops(struct trace_array *tr)
 int ftrace_create_function_files(struct trace_array *tr,
 struct dentry *parent)
 {
+   int ret;
/*
 * The top level array uses the "global_ops", and the files are
 * created on boot up.
@@ -90,6 +91,12 @@ int ftrace_create_function_files(struct trace_array *tr,
if (!tr->ops)
return -EINVAL;
 
+   ret = allocate_fgraph_ops(tr);
+   if (ret) {
+   kfree(tr->ops);
+   return ret;
+   }
+
ftrace_create_filter_files(tr->ops, parent);
 
return 0;
@@ -99,6 +106,7 @@ void ftrace_destroy_function_files(struct trace_array *tr)
 {
ftrace_destroy_filter_files(tr->ops);
ftrace_free_ftrace_ops(tr);
+   free_fgraph_ops(tr);
 }
 
 static ftrace_func_t select_trace_function(u32 flags_val)
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b7b142b65299..9ccc904a7703 100644
--- 

[PATCH 07/20] ftrace/function_graph: Pass fgraph_ops to function graph callbacks

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Pass the fgraph_ops structure to the function graph callbacks. This will
allow callbacks to add a descriptor to a fgraph_ops private field that will
be added in the future and use it in the callbacks. This will be useful
when more than one callback can be registered to the function graph tracer.
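
(Illustration only, not part of the patch: a hypothetical pair of callbacks
using the fgraph_ops argument. The "private" field referenced here is the one
the message above says will be added by a later patch.)

	struct instance_data {
		atomic64_t	calls;
	};

	static int instance_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops)
	{
		struct instance_data *data = gops->private;

		atomic64_inc(&data->calls);	/* per registered fgraph_ops */
		return 1;
	}

	static void instance_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops)
	{
		/* the callback now knows which registered user it serves */
	}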

Co-developed with Masami Hiramatsu:
Link: https://lore.kernel.org/linux-trace-kernel/171509098588.162236.4787930115997357578.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/ftrace.h   | 10 +++---
 kernel/trace/fgraph.c| 16 +---
 kernel/trace/ftrace.c|  6 --
 kernel/trace/trace.h |  4 ++--
 kernel/trace/trace_functions_graph.c | 11 +++
 kernel/trace/trace_irqsoff.c |  6 --
 kernel/trace/trace_sched_wakeup.c|  6 --
 kernel/trace/trace_selftest.c|  5 +++--
 8 files changed, 40 insertions(+), 24 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 4d4c146fbfbc..032974f55c5b 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1026,11 +1026,15 @@ struct ftrace_graph_ret {
unsigned long long rettime;
 } __packed;
 
+struct fgraph_ops;
+
 /* Type of the callback handlers for tracing function graph*/
-typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *); /* return */
-typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *); /* entry */
+typedef void (*trace_func_graph_ret_t)(struct ftrace_graph_ret *,
+  struct fgraph_ops *); /* return */
+typedef int (*trace_func_graph_ent_t)(struct ftrace_graph_ent *,
+ struct fgraph_ops *); /* entry */
 
-extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace);
+extern int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace, struct fgraph_ops *gops);
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 2b52afa03ab4..54ed2ed2036b 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -164,13 +164,13 @@ set_bitmap(struct task_struct *t, int offset, unsigned 
long bitmap)
 }
 
 /* ftrace_graph_entry set to this to tell some archs to run function graph */
-static int entry_run(struct ftrace_graph_ent *trace)
+static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops)
 {
return 0;
 }
 
 /* ftrace_graph_return set to this to tell some archs to run function graph */
-static void return_run(struct ftrace_graph_ret *trace)
+static void return_run(struct ftrace_graph_ret *trace, struct fgraph_ops *ops)
 {
 }
 
@@ -234,12 +234,14 @@ int __weak ftrace_disable_ftrace_graph_caller(void)
 }
 #endif
 
-int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
+int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace,
+   struct fgraph_ops *gops)
 {
return 0;
 }
 
-static void ftrace_graph_ret_stub(struct ftrace_graph_ret *trace)
+static void ftrace_graph_ret_stub(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops)
 {
 }
 
@@ -379,7 +381,7 @@ int function_graph_enter(unsigned long ret, unsigned long 
func,
if (gops == &fgraph_stub)
continue;
 
-   if (gops->entryfunc(&trace))
+   if (gops->entryfunc(&trace, gops))
bitmap |= BIT(i);
}
 
@@ -527,7 +529,7 @@ static unsigned long __ftrace_return_to_handler(struct 
fgraph_ret_regs *ret_regs
if (gops == &fgraph_stub)
continue;
 
-   gops->retfunc(&trace);
+   gops->retfunc(&trace, gops);
}
 
/*
@@ -681,7 +683,7 @@ void ftrace_graph_sleep_time_control(bool enable)
  * Simply points to ftrace_stub, but with the proper protocol.
  * Defined by the linker script in linux/vmlinux.lds.h
  */
-extern void ftrace_stub_graph(struct ftrace_graph_ret *);
+void ftrace_stub_graph(struct ftrace_graph_ret *trace, struct fgraph_ops *gops);
 
 /* The callbacks that hook a function */
 trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 789950a4f977..d18387c0642d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -815,7 +815,8 @@ void ftrace_graph_graph_time_control(bool enable)
fgraph_graph_time = enable;
 }
 
-static int profile_graph_entry(struct ftrace_graph_ent *trace)
+static int profile_graph_entry(struct ftrace_graph_ent *trace,
+  struct fgraph_ops *gops)
 {
struct ftrace_ret_stack *ret_stack;
 
@@ -832,7 +833,8 @@ static int profile_graph_entry(struct ftrace_graph_ent 
*trace)
return 1;
 }
 
-static void profile_graph_return(struct ftrace_graph_ret *trace)
+static void profile_graph_return(struct ftrace_graph_ret 

[PATCH 05/20] function_graph: Handle tail calls for stack unwinding

2024-05-24 Thread Steven Rostedt
From: "Masami Hiramatsu (Google)" 

For a tail-call, there will be 2 or more ftrace_ret_stacks on the
ret_stack, each of which records "return_to_handler" as the return address,
except for the last one. But on the real stack, there should be only 1 entry,
because the tail-call reuses the return address on the stack and jumps to the
next function.

In ftrace_graph_ret_addr(), which is used for stack unwinding, skip tail
calls as a real stack unwinder would do.

Link: https://lore.kernel.org/linux-trace-kernel/171509096221.162236.8806372072523195752.stgit@devnote2

Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/fgraph.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index aae51f746828..8de2a2662281 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -594,16 +594,26 @@ unsigned long ftrace_graph_ret_addr(struct task_struct 
*task, int *idx,
unsigned long ret, unsigned long *retp)
 {
struct ftrace_ret_stack *ret_stack;
+   unsigned long return_handler = (unsigned long)dereference_kernel_function_descriptor(return_to_handler);
int i = task->curr_ret_stack;
 
-   if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
+   if (ret != return_handler)
return ret;
 
while (i > 0) {
ret_stack = get_ret_stack(current, i, &i);
if (!ret_stack)
break;
-   if (ret_stack->retp == retp)
+   /*
+* For the tail-call, there would be 2 or more ftrace_ret_stacks on
+* the ret_stack, which records "return_to_handler" as the return
+* address except for the last one.
+* But on the real stack, there should be 1 entry because tail-call
+* reuses the return address on the stack and jump to the next function.
+* Thus we will continue to find real return address.
+*/
+   if (ret_stack->retp == retp &&
+   ret_stack->ret != return_handler)
return ret_stack->ret;
}
 
@@ -614,10 +624,11 @@ unsigned long ftrace_graph_ret_addr(struct task_struct 
*task, int *idx,
unsigned long ret, unsigned long *retp)
 {
struct ftrace_ret_stack *ret_stack;
+   unsigned long return_handler = (unsigned long)dereference_kernel_function_descriptor(return_to_handler);
int offset = task->curr_ret_stack;
int i;
 
-   if (ret != (unsigned long)dereference_kernel_function_descriptor(return_to_handler))
+   if (ret != return_handler)
return ret;
 
if (!idx)
@@ -626,6 +637,8 @@ unsigned long ftrace_graph_ret_addr(struct task_struct 
*task, int *idx,
i = *idx;
do {
ret_stack = get_ret_stack(task, offset, &offset);
+   if (ret_stack && ret_stack->ret == return_handler)
+   continue;
i--;
} while (i >= 0 && ret_stack);
 
-- 
2.43.0





[PATCH 06/20] function_graph: Remove logic around ftrace_graph_entry and return

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

The function pointers ftrace_graph_entry and ftrace_graph_return are no
longer called via the function_graph tracer. Instead, an array structure is
now used that will allow for multiple users of the function_graph
infrastructure. The variables are still used by the architecture code for
non-dynamic ftrace configs, where a test is made against them to see if
they point to the default stub function or not. This is how static
function tracing knows whether to call into the function graph
infrastructure or not.

Two new stub functions are added: entry_run() and return_run().
ftrace_graph_entry and ftrace_graph_return are set to them respectively
when the function graph tracer is enabled, which triggers the
architecture-specific function graph code to be executed.

This also requires checking the global_ops hash for all calls into the
function_graph tracer.
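
As a rough illustration of that stub test (an assumption-laden C sketch; the
real check lives in per-architecture code, often assembly), a static-ftrace
build can decide whether to enter the fgraph machinery like this:

/* Hedged sketch: with !DYNAMIC_FTRACE, the arch entry code only falls
 * through to the function graph path when the global pointers no longer
 * point at the default stubs (they are set to entry_run()/return_run()
 * while the graph tracer is active). */
static inline bool fgraph_wanted(void)
{
	return ftrace_graph_entry != ftrace_graph_entry_stub ||
	       ftrace_graph_return != ftrace_stub_graph;
}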

Co-developed with Masami Hiramatsu:
Link: 
https://lore.kernel.org/linux-trace-kernel/171509097408.162236.17387844142114638932.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/fgraph.c  | 67 --
 kernel/trace/ftrace.c  |  2 -
 kernel/trace/ftrace_internal.h |  2 -
 3 files changed, 15 insertions(+), 56 deletions(-)

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 8de2a2662281..2b52afa03ab4 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -163,6 +163,17 @@ set_bitmap(struct task_struct *t, int offset, unsigned 
long bitmap)
(FGRAPH_TYPE_BITMAP << FGRAPH_TYPE_SHIFT) | FGRAPH_FRAME_OFFSET;
 }
 
+/* ftrace_graph_entry set to this to tell some archs to run function graph */
+static int entry_run(struct ftrace_graph_ent *trace)
+{
+   return 0;
+}
+
+/* ftrace_graph_return set to this to tell some archs to run function graph */
+static void return_run(struct ftrace_graph_ret *trace)
+{
+}
+
 /*
  * @offset: The offset into @t->ret_stack to find the ret_stack entry
  * @frame_offset: Where to place the offset into @t->ret_stack of that entry
@@ -675,7 +686,6 @@ extern void ftrace_stub_graph(struct ftrace_graph_ret *);
 /* The callbacks that hook a function */
 trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph;
 trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
-static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;
 
 /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
 static int alloc_retstack_tasklist(unsigned long **ret_stack_list)
@@ -758,46 +768,6 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
}
 }
 
-static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
-{
-   if (!ftrace_ops_test(&global_ops, trace->func, NULL))
-   return 0;
-   return __ftrace_graph_entry(trace);
-}
-
-/*
- * The function graph tracer should only trace the functions defined
- * by set_ftrace_filter and set_ftrace_notrace. If another function
- * tracer ops is registered, the graph tracer requires testing the
- * function against the global ops, and not just trace any function
- * that any ftrace_ops registered.
- */
-void update_function_graph_func(void)
-{
-   struct ftrace_ops *op;
-   bool do_test = false;
-
-   /*
-* The graph and global ops share the same set of functions
-* to test. If any other ops is on the list, then
-* the graph tracing needs to test if its the function
-* it should call.
-*/
-   do_for_each_ftrace_op(op, ftrace_ops_list) {
-   if (op != &global_ops && op != &graph_ops &&
-   op != &ftrace_list_end) {
-   do_test = true;
-   /* in double loop, break out with goto */
-   goto out;
-   }
-   } while_for_each_ftrace_op(op);
- out:
-   if (do_test)
-   ftrace_graph_entry = ftrace_graph_entry_test;
-   else
-   ftrace_graph_entry = __ftrace_graph_entry;
-}
-
 static DEFINE_PER_CPU(unsigned long *, idle_ret_stack);
 
 static void
@@ -939,18 +909,12 @@ int register_ftrace_graph(struct fgraph_ops *gops)
ftrace_graph_active--;
goto out;
}
-
-   ftrace_graph_return = gops->retfunc;
-
/*
-* Update the indirect function to the entryfunc, and the
-* function that gets called to the entry_test first. Then
-* call the update fgraph entry function to determine if
-* the entryfunc should be called directly or not.
+* Some archs just test to see if these are not
+* the default function
 */
-   __ftrace_graph_entry = gops->entryfunc;
-   ftrace_graph_entry = ftrace_graph_entry_test;
-   

[PATCH 04/20] function_graph: Allow multiple users to attach to function graph

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Allow multiple users to attach to the function graph tracer at the same
time. Only 16 simultaneous users can attach to the tracer. This is because
there's an array that stores the pointers to the attached fgraph_ops. When
a function being traced is entered, each of the fgraph_ops entryfunc
callbacks is called, and if it returns non-zero, its index into the array
is added to the shadow stack.

On exit of the function being traced, the shadow stack will contain the
indexes of the fgraph_ops in the array that want their retfunc to be
called.

Because a function may sleep for a long time (if a task sleeps itself),
the return of the function may literally be days later. If the fgraph_ops
is removed, its place in the array is replaced with an fgraph_ops that
contains the stub functions, and that is what will be called when the
function finally returns.

If another fgraph_ops is added that happens to get the same index into the
array, its return function may be called. But that's actually the way
things currently work with the old function graph tracer: if one tracer is
removed and another is added, the new one will get the return calls of the
functions traced by the previous one, so this is not a regression. It can
be fixed by adding a counter that is incremented each time the array item
is updated, saving that on the shadow stack as well, so that the retfunc
won't be called if the saved counter does not match the one in the array.

Note, being able to filter functions when both are called is not completely
handled yet, but that shouldn't be too hard to manage.
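
A hedged sketch of that entry-side idea (simplified, with a hypothetical
helper name; the shadow-stack bookkeeping and error handling of the real
patch are omitted):

/* Ask every registered fgraph_ops whether it wants the return callback
 * for this function, and collect the answers as a bitmap of array
 * indexes that is then pushed onto the shadow stack. */
static unsigned long fgraph_call_entries(struct ftrace_graph_ent *trace)
{
	unsigned long bitmap = 0;
	int i;

	for (i = 0; i < FGRAPH_ARRAY_SIZE; i++) {
		struct fgraph_ops *gops = fgraph_array[i];

		if (gops == &fgraph_stub)
			continue;

		if (gops->entryfunc(trace))
			bitmap |= BIT(i);
	}

	return bitmap;
}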

Co-developed with Masami Hiramatsu:
Link: 
https://lore.kernel.org/linux-trace-kernel/171509096221.162236.8806372072523195752.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/ftrace.h |   3 +-
 kernel/trace/fgraph.c  | 379 -
 2 files changed, 304 insertions(+), 78 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index e3a83ebd1b33..4d4c146fbfbc 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1037,6 +1037,7 @@ extern int ftrace_graph_entry_stub(struct 
ftrace_graph_ent *trace);
 struct fgraph_ops {
trace_func_graph_ent_t  entryfunc;
trace_func_graph_ret_t  retfunc;
+   int idx;
 };
 
 /*
@@ -1071,7 +1072,7 @@ function_graph_enter(unsigned long ret, unsigned long 
func,
 unsigned long frame_pointer, unsigned long *retp);
 
 struct ftrace_ret_stack *
-ftrace_graph_get_ret_stack(struct task_struct *task, int idx);
+ftrace_graph_get_ret_stack(struct task_struct *task, int skip);
 
 unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx,
unsigned long ret, unsigned long *retp);
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index d2ce5d651cf0..aae51f746828 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -7,6 +7,7 @@
  *
  * Highly modified by Steven Rostedt (VMware).
  */
+#include 
 #include 
 #include 
 #include 
@@ -28,35 +29,177 @@
 /*
  * FGRAPH_FRAME_SIZE:  Size in bytes of the meta data on the shadow stack
  * FGRAPH_FRAME_OFFSET:Size in long words of the meta data frame
- * SHADOW_STACK_SIZE:  The size in bytes of the entire shadow stack
- * SHADOW_STACK_OFFSET:The size in long words of the shadow stack
- * SHADOW_STACK_MAX_OFFSET: The max offset of the stack for a new frame to be 
added
  */
 #define FGRAPH_FRAME_SIZE  sizeof(struct ftrace_ret_stack)
 #define FGRAPH_FRAME_OFFSETDIV_ROUND_UP(FGRAPH_FRAME_SIZE, sizeof(long))
-#define SHADOW_STACK_SIZE (PAGE_SIZE)
-#define SHADOW_STACK_OFFSET\
-   (ALIGN(SHADOW_STACK_SIZE, sizeof(long)) / sizeof(long))
-/* Leave on a buffer at the end */
-#define SHADOW_STACK_MAX_INDEX (SHADOW_STACK_OFFSET - FGRAPH_FRAME_OFFSET)
 
 /*
- * RET_STACK():Return the frame from a given @offset from task 
@t
- * RET_STACK_INC():Reserve one frame size on the stack.
- * RET_STACK_DEC():Remove one frame size from the stack.
+ * On entry to a function (via function_graph_enter()), a new fgraph frame
+ * (ftrace_ret_stack) is pushed onto the stack as well as a word that
+ * holds a bitmask and a type (called "bitmap"). The bitmap is defined as:
+ *
+ * bits:  0 -  9   offset in words from the previous ftrace_ret_stack
+ * Currently, this will always be set to 
FGRAPH_FRAME_OFFSET
+ * to get to the fgraph frame.
+ *
+ * bits: 10 - 11   Type of storage
+ *   0 - reserved
+ *   1 - bitmap of fgraph_array index
+ *
+ * For type with "bitmap of fgraph_array index" (FGRAPH_TYPE_BITMAP):
+ *  bits: 12 - 27  The bitmap of fgraph_ops fgraph_array index
+ * That is, it's a bitmask of 0-15 (16 bits)
+ *  

[PATCH 03/20] function_graph: Add an array structure that will allow multiple callbacks

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Add an array structure that will eventually allow the function graph tracer
to have up to 16 simultaneous callbacks attached. It's an array of 16
fgraph_ops pointers, and a slot is assigned when one is registered. On entry
of a function, the entryfunc of the first item in the array is called, and
it returns non-zero if it wants the return callback to be called on exit of
the function.

The array will simplify the process of having more than one callback
attached to the same function, as a callback's index into the array can be
stored on the shadow stack. We only need to save the index, because this
will allow the fgraph_ops to be freed before the function returns (which may
happen if the function calls schedule and sleeps for a long time).
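
As a usage illustration, a minimal fgraph user at this stage of the series
might look like the sketch below (the callback and variable names are made
up; the callback signatures here still take only the trace argument, since
passing fgraph_ops to the callbacks comes later in the series):

/* Hypothetical minimal user of the new array-based registration. */
static int my_entry(struct ftrace_graph_ent *trace)
{
	return 1;	/* non-zero: call my_return() when the function exits */
}

static void my_return(struct ftrace_graph_ret *trace)
{
}

static struct fgraph_ops my_gops = {
	.entryfunc	= my_entry,
	.retfunc	= my_return,
};

/* In some init path:  register_ftrace_graph(&my_gops);
 * and on teardown:    unregister_ftrace_graph(&my_gops); */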

Co-developed with Masami Hiramatsu:
Link: 
https://lore.kernel.org/linux-trace-kernel/171509095075.162236.8272148192748284581.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/fgraph.c | 114 ++
 1 file changed, 81 insertions(+), 33 deletions(-)

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index fdb206aeffe3..d2ce5d651cf0 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -52,6 +52,11 @@
 DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
 int ftrace_graph_active;
 
+static int fgraph_array_cnt;
+#define FGRAPH_ARRAY_SIZE  16
+
+static struct fgraph_ops *fgraph_array[FGRAPH_ARRAY_SIZE];
+
 /* Both enabled by default (can be cleared by function_graph tracer flags */
 static bool fgraph_sleep_time = true;
 
@@ -75,6 +80,20 @@ int __weak ftrace_disable_ftrace_graph_caller(void)
 }
 #endif
 
+int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
+{
+   return 0;
+}
+
+static void ftrace_graph_ret_stub(struct ftrace_graph_ret *trace)
+{
+}
+
+static struct fgraph_ops fgraph_stub = {
+   .entryfunc = ftrace_graph_entry_stub,
+   .retfunc = ftrace_graph_ret_stub,
+};
+
 /**
  * ftrace_graph_stop - set to permanently disable function graph tracing
  *
@@ -161,7 +180,7 @@ int function_graph_enter(unsigned long ret, unsigned long 
func,
goto out;
 
/* Only trace if the calling function expects to */
-   if (!ftrace_graph_entry(&trace))
+   if (!fgraph_array[0]->entryfunc(&trace))
goto out_ret;
 
return 0;
@@ -276,7 +295,7 @@ static unsigned long __ftrace_return_to_handler(struct 
fgraph_ret_regs *ret_regs
trace.retval = fgraph_ret_regs_return_value(ret_regs);
 #endif
trace.rettime = trace_clock_local();
-   ftrace_graph_return(&trace);
+   fgraph_array[0]->retfunc(&trace);
/*
 * The ftrace_graph_return() may still access the current
 * ret_stack structure, we need to make sure the update of
@@ -412,11 +431,6 @@ void ftrace_graph_sleep_time_control(bool enable)
fgraph_sleep_time = enable;
 }
 
-int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
-{
-   return 0;
-}
-
 /*
  * Simply points to ftrace_stub, but with the proper protocol.
  * Defined by the linker script in linux/vmlinux.lds.h
@@ -654,37 +668,54 @@ static int start_graph_tracing(void)
 int register_ftrace_graph(struct fgraph_ops *gops)
 {
int ret = 0;
+   int i;
 
mutex_lock(&ftrace_lock);
 
-   /* we currently allow only one tracer registered at a time */
-   if (ftrace_graph_active) {
+   if (!fgraph_array[0]) {
+   /* The array must always have real data on it */
+   for (i = 0; i < FGRAPH_ARRAY_SIZE; i++)
+   fgraph_array[i] = &fgraph_stub;
+   }
+
+   /* Look for an available spot */
+   for (i = 0; i < FGRAPH_ARRAY_SIZE; i++) {
+   if (fgraph_array[i] == &fgraph_stub)
+   break;
+   }
+   if (i >= FGRAPH_ARRAY_SIZE) {
ret = -EBUSY;
goto out;
}
 
-   register_pm_notifier(&ftrace_suspend_notifier);
+   fgraph_array[i] = gops;
+   if (i + 1 > fgraph_array_cnt)
+   fgraph_array_cnt = i + 1;
 
ftrace_graph_active++;
-   ret = start_graph_tracing();
-   if (ret) {
-   ftrace_graph_active--;
-   goto out;
-   }
 
-   ftrace_graph_return = gops->retfunc;
+   if (ftrace_graph_active == 1) {
+   register_pm_notifier(&ftrace_suspend_notifier);
+   ret = start_graph_tracing();
+   if (ret) {
+   ftrace_graph_active--;
+   goto out;
+   }
+
+   ftrace_graph_return = gops->retfunc;
 
-   /*
-* Update the indirect function to the entryfunc, and the
-* function that gets called to the entry_test first. Then
-* call the update fgraph entry function to determine if
-* the entryfunc should be called directly or not.
-*/
-   __ftrace_graph_entry = 

[PATCH 01/20] function_graph: Convert ret_stack to a series of longs

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

In order to make it possible to have multiple callbacks registered with the
function_graph tracer, the ret_stack needs to be converted from an array of
ftrace_ret_stack structures to an array of longs. This will allow storing
the list of callbacks on the stack for the return side of the functions.
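
As a rough illustration of the resulting layout (a sketch with assumptions,
not code from the patch): each frame now occupies FGRAPH_FRAME_OFFSET longs
in the flat array, and curr_ret_stack counts longs rather than frames.

/*
 * Shadow stack as a flat array of longs:
 *
 *   ret_stack: [ frame 0 ........ ][ frame 1 ........ ][ free ... ]
 *               ^ index 0          ^ FGRAPH_FRAME_OFFSET
 *
 * A frame is still read back as a struct ftrace_ret_stack overlaid on
 * the longs, which is what the RET_STACK() macro in the patch does:
 */
struct ftrace_ret_stack *frame =
	(struct ftrace_ret_stack *)&current->ret_stack[index];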

Link: 
https://lore.kernel.org/linux-trace-kernel/171509092742.162236.4427737821399314856.stgit@devnote2

Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 include/linux/sched.h |   2 +-
 kernel/trace/fgraph.c | 136 +-
 2 files changed, 83 insertions(+), 55 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 61591ac6eab6..352939dab3a5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1402,7 +1402,7 @@ struct task_struct {
int curr_ret_depth;
 
/* Stack of return addresses for return function tracing: */
-   struct ftrace_ret_stack *ret_stack;
+   unsigned long   *ret_stack;
 
/* Timestamp for last schedule: */
unsigned long long  ftrace_timestamp;
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index a130b2d898f7..c62e6db718a0 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -25,6 +25,30 @@
 #define ASSIGN_OPS_HASH(opsname, val)
 #endif
 
+/*
+ * FGRAPH_FRAME_SIZE:  Size in bytes of the meta data on the shadow stack
+ * FGRAPH_FRAME_OFFSET:Size in long words of the meta data frame
+ * SHADOW_STACK_SIZE:  The size in bytes of the entire shadow stack
+ * SHADOW_STACK_OFFSET:The size in long words of the shadow stack
+ * SHADOW_STACK_MAX_OFFSET: The max offset of the stack for a new frame to be 
added
+ */
+#define FGRAPH_FRAME_SIZE  sizeof(struct ftrace_ret_stack)
+#define FGRAPH_FRAME_OFFSET(ALIGN(FGRAPH_FRAME_SIZE, sizeof(long)) / 
sizeof(long))
+#define SHADOW_STACK_SIZE (PAGE_SIZE)
+#define SHADOW_STACK_OFFSET\
+   (ALIGN(SHADOW_STACK_SIZE, sizeof(long)) / sizeof(long))
+/* Leave on a buffer at the end */
+#define SHADOW_STACK_MAX_INDEX (SHADOW_STACK_OFFSET - FGRAPH_FRAME_OFFSET)
+
+/*
+ * RET_STACK():Return the frame from a given @offset from task 
@t
+ * RET_STACK_INC():Reserve one frame size on the stack.
+ * RET_STACK_DEC():Remove one frame size from the stack.
+ */
+#define RET_STACK(t, index) ((struct ftrace_ret_stack 
*)(&(t)->ret_stack[index]))
+#define RET_STACK_INC(c) ({ c += FGRAPH_FRAME_OFFSET; })
+#define RET_STACK_DEC(c) ({ c -= FGRAPH_FRAME_OFFSET; })
+
 DEFINE_STATIC_KEY_FALSE(kill_ftrace_graph);
 int ftrace_graph_active;
 
@@ -69,6 +93,7 @@ static int
 ftrace_push_return_trace(unsigned long ret, unsigned long func,
 unsigned long frame_pointer, unsigned long *retp)
 {
+   struct ftrace_ret_stack *ret_stack;
unsigned long long calltime;
int index;
 
@@ -85,23 +110,25 @@ ftrace_push_return_trace(unsigned long ret, unsigned long 
func,
smp_rmb();
 
/* The return trace stack is full */
-   if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
+   if (current->curr_ret_stack >= SHADOW_STACK_MAX_INDEX) {
atomic_inc(&current->trace_overrun);
return -EBUSY;
}
 
calltime = trace_clock_local();
 
-   index = ++current->curr_ret_stack;
+   index = current->curr_ret_stack;
+   RET_STACK_INC(current->curr_ret_stack);
+   ret_stack = RET_STACK(current, index);
barrier();
-   current->ret_stack[index].ret = ret;
-   current->ret_stack[index].func = func;
-   current->ret_stack[index].calltime = calltime;
+   ret_stack->ret = ret;
+   ret_stack->func = func;
+   ret_stack->calltime = calltime;
 #ifdef HAVE_FUNCTION_GRAPH_FP_TEST
-   current->ret_stack[index].fp = frame_pointer;
+   ret_stack->fp = frame_pointer;
 #endif
 #ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR
-   current->ret_stack[index].retp = retp;
+   ret_stack->retp = retp;
 #endif
return 0;
 }
@@ -137,7 +164,7 @@ int function_graph_enter(unsigned long ret, unsigned long 
func,
 
return 0;
  out_ret:
-   current->curr_ret_stack--;
+   RET_STACK_DEC(current->curr_ret_stack);
  out:
current->curr_ret_depth--;
return -EBUSY;
@@ -148,11 +175,13 @@ static void
 ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
unsigned long frame_pointer)
 {
+   struct ftrace_ret_stack *ret_stack;
int index;
 
index = current->curr_ret_stack;
+   RET_STACK_DEC(index);
 
-   if (unlikely(index < 0 || index >= FTRACE_RETFUNC_DEPTH)) {
+   if (unlikely(index < 0 || index > SHADOW_STACK_MAX_INDEX)) {
ftrace_graph_stop();
WARN_ON(1);
/* Might as 

[PATCH 02/20] fgraph: Use BUILD_BUG_ON() to make sure we have structures divisible by long

2024-05-24 Thread Steven Rostedt
From: "Steven Rostedt (VMware)" 

Instead of using "ALIGN()", use BUILD_BUG_ON() as the structures should
always be divisible by sizeof(long).

Co-developed with Masami Hiramatsu:
Link: 
https://lore.kernel.org/linux-trace-kernel/171509093949.162236.14518699447151894536.stgit@devnote2
Link: 
http://lkml.kernel.org/r/2019052444.gi2...@hirez.programming.kicks-ass.net

Suggested-by: Peter Zijlstra 
Signed-off-by: Steven Rostedt (VMware) 
Signed-off-by: Masami Hiramatsu (Google) 
Signed-off-by: Steven Rostedt (Google) 
---
 kernel/trace/fgraph.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index c62e6db718a0..fdb206aeffe3 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -33,7 +33,7 @@
  * SHADOW_STACK_MAX_OFFSET: The max offset of the stack for a new frame to be 
added
  */
 #define FGRAPH_FRAME_SIZE  sizeof(struct ftrace_ret_stack)
-#define FGRAPH_FRAME_OFFSET(ALIGN(FGRAPH_FRAME_SIZE, sizeof(long)) / 
sizeof(long))
+#define FGRAPH_FRAME_OFFSETDIV_ROUND_UP(FGRAPH_FRAME_SIZE, sizeof(long))
 #define SHADOW_STACK_SIZE (PAGE_SIZE)
 #define SHADOW_STACK_OFFSET\
(ALIGN(SHADOW_STACK_SIZE, sizeof(long)) / sizeof(long))
@@ -103,6 +103,8 @@ ftrace_push_return_trace(unsigned long ret, unsigned long 
func,
if (!current->ret_stack)
return -EBUSY;
 
+   BUILD_BUG_ON(SHADOW_STACK_SIZE % sizeof(long));
+
/*
 * We must make sure the ret_stack is tested before we read
 * anything else.
@@ -326,6 +328,8 @@ ftrace_graph_get_ret_stack(struct task_struct *task, int 
idx)
 {
int index = task->curr_ret_stack;
 
+   BUILD_BUG_ON(FGRAPH_FRAME_SIZE % sizeof(long));
+
index -= FGRAPH_FRAME_OFFSET * (idx + 1);
if (index < 0)
return NULL;
-- 
2.43.0





[PATCH 00/20] function_graph: Allow multiple users for function graph tracing

2024-05-24 Thread Steven Rostedt
[
  Resend for some of you as I missed a comma in the Cc and
  quilt died sending this out.
]

This is a continuation of the function graph multi user code.
I wrote a proof of concept back in 2019 of this code[1] and
Masami started cleaning it up. I started from Masami's work v10
that can be found here:

 
https://lore.kernel.org/linux-trace-kernel/171509088006.162236.7227326999861366050.stgit@devnote2/

This is *only* the code that allows multiple users of function
graph tracing. This is not the fprobe work that Masami is working
to add on top of it. As Masami took my proof of concept, there
were still several things I disliked about that code. Instead of
having Masami clean it up even more, I decided to take over just
my code and change it up a bit.

The biggest changes from where Masami left off are mostly renaming more
variables, macros, and function names. I fixed up the current comments
and added more to make the code a bit more understandable.

At the end of the series, I added two patches to optimize the entry
and exit. On entry, there was a loop that iterated the 16 elements
of the fgraph_array[] looking for any that may have a gops registered
to it. It's quite a waste to do that loop if there's only one
registered user. To fix that, I added a fgraph_array_bitmask that has
its bits set that correspond to the elements of the array. Then
a simple for_each_set_bit() is used for the iteration. I do the same
thing at the exit callback of the function where it iterates over the
bits of the bitmap saved on the ret_stack.
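
A hedged sketch of that entry-side loop (the helper name is made up and the
shadow-stack bookkeeping is omitted; the point is only the
for_each_set_bit() iteration over fgraph_array_bitmask):

/* Visit only the fgraph_array[] slots whose bits are set in
 * fgraph_array_bitmask, instead of scanning all 16 entries, and collect
 * which callbacks want a return callback into a bitmap for the shadow
 * stack. */
static unsigned long fgraph_run_entries(struct ftrace_graph_ent *trace)
{
	unsigned long bitmap = 0;
	int i;

	for_each_set_bit(i, &fgraph_array_bitmask,
			 sizeof(fgraph_array_bitmask) * BITS_PER_BYTE) {
		struct fgraph_ops *gops = fgraph_array[i];

		if (gops->entryfunc(trace, gops))
			bitmap |= BIT(i);
	}

	return bitmap;
}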

I also noticed that Masami added code to handle tail calls in the
unwinder and had that in one of my patches. I took that code out
and made it a separate patch with Masami as the author.

The diff between this and Masami's last update is at the end of this email.

Based on Linus commit: 0eb03c7e8e2a4cc3653eb5eeb2d2001182071215

[1] https://lore.kernel.org/all/20190525031633.811342...@goodmis.org/

Masami Hiramatsu (Google) (3):
  function_graph: Handle tail calls for stack unwinding
  function_graph: Use a simple LRU for fgraph_array index number
  ftrace: Add multiple fgraph storage selftest

Steven Rostedt (Google) (2):
  function_graph: Use for_each_set_bit() in __ftrace_return_to_handler()
  function_graph: Use bitmask to loop on fgraph entry

Steven Rostedt (VMware) (15):
  function_graph: Convert ret_stack to a series of longs
  fgraph: Use BUILD_BUG_ON() to make sure we have structures divisible by 
long
  function_graph: Add an array structure that will allow multiple callbacks
  function_graph: Allow multiple users to attach to function graph
  function_graph: Remove logic around ftrace_graph_entry and return
  ftrace/function_graph: Pass fgraph_ops to function graph callbacks
  ftrace: Allow function_graph tracer to be enabled in instances
  ftrace: Allow ftrace startup flags to exist without dynamic ftrace
  function_graph: Have the instances use their own ftrace_ops for filtering
  function_graph: Add "task variables" per task for fgraph_ops
  function_graph: Move set_graph_function tests to shadow stack global var
  function_graph: Move graph depth stored data to shadow stack global var
  function_graph: Move graph notrace bit to shadow stack global var
  function_graph: Implement fgraph_reserve_data() and fgraph_retrieve_data()
  function_graph: Add selftest for passing local variables


 arch/arm64/kernel/ftrace.c   |  21 +-
 arch/loongarch/kernel/ftrace_dyn.c   |  15 +-
 arch/powerpc/kernel/trace/ftrace.c   |   3 +-
 arch/riscv/kernel/ftrace.c   |  15 +-
 arch/x86/kernel/ftrace.c |  19 +-
 include/linux/ftrace.h   |  42 +-
 include/linux/sched.h|   2 +-
 include/linux/trace_recursion.h  |  39 --
 kernel/trace/fgraph.c| 994 ---
 kernel/trace/ftrace.c|  11 +-
 kernel/trace/ftrace_internal.h   |   2 -
 kernel/trace/trace.h |  94 +++-
 kernel/trace/trace_functions.c   |   8 +
 kernel/trace/trace_functions_graph.c |  96 ++--
 kernel/trace/trace_irqsoff.c |  10 +-
 kernel/trace/trace_sched_wakeup.c|  10 +-
 kernel/trace/trace_selftest.c| 259 -
 17 files changed, 1330 insertions(+), 310 deletions(-)


diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 3313e4b83aa2..1aae521e5997 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -18,29 +18,36 @@
 #include "ftrace_internal.h"
 #include "trace.h"
 
-#define FGRAPH_FRAME_SIZE sizeof(struct ftrace_ret_stack)
-#define FGRAPH_FRAME_OFFSET DIV_ROUND_UP(FGRAPH_FRAME_SIZE, sizeof(long))
 
 /*
- * On entry to a function (via function_graph_enter()), a new ftrace_ret_stack
- * is allocated on the task's ret_stack with bitmap entry, then each
- * fgraph_ops on the fgraph_array[]'s entryfunc is called and if that returns
- * non-zero, the index into the 





Re: [PATCH v10 07/36] function_graph: Allow multiple users to attach to function graph

2024-05-24 Thread Steven Rostedt
On Tue,  7 May 2024 23:09:22 +0900
"Masami Hiramatsu (Google)"  wrote:

> @@ -109,6 +244,21 @@ ftrace_push_return_trace(unsigned long ret, unsigned 
> long func,
>   if (!current->ret_stack)
>   return -EBUSY;
>  
> + /*
> +  * At first, check whether the previous fgraph callback is pushed by
> +  * the fgraph on the same function entry.
> +  * But if @func is the self tail-call function, we also need to ensure
> +  * the ret_stack is not for the previous call by checking whether the
> +  * bit of @fgraph_idx is set or not.
> +  */
> + ret_stack = get_ret_stack(current, current->curr_ret_stack, &offset);
> + if (ret_stack && ret_stack->func == func &&
> + get_fgraph_type(current, offset + FGRAPH_FRAME_OFFSET) == 
> FGRAPH_TYPE_BITMAP &&
> + !is_fgraph_index_set(current, offset + FGRAPH_FRAME_OFFSET, 
> fgraph_idx))
> + return offset + FGRAPH_FRAME_OFFSET;
> +
> + val = (FGRAPH_TYPE_RESERVED << FGRAPH_TYPE_SHIFT) | FGRAPH_FRAME_OFFSET;
> +
>   BUILD_BUG_ON(SHADOW_STACK_SIZE % sizeof(long));

I'm trying to figure out what the above is trying to do. This gets called
once in function_graph_enter() (or function_graph_enter_ops()). What
exactly are you trying to catch here?

Is it from this email:

  
https://lore.kernel.org/all/20231110105154.df937bf9f200a0c16806c...@kernel.org/

As that's the last version before you added the above code.

But you also noticed it may not be needed, but triggered a crash without it
in v3:

  
https://lore.kernel.org/all/20231205234511.3839128259dfec153ea7d...@kernel.org/

I removed this code in my version and it runs just fine. Perhaps there was
another bug that this was hiding that you fixed in later versions?

-- Steve



Re: [PATCH v10 00/36] tracing: fprobe: function_graph: Multi-function graph and fprobe on fgraph

2024-05-24 Thread Steven Rostedt
On Tue,  7 May 2024 23:08:00 +0900
"Masami Hiramatsu (Google)"  wrote:

> Steven Rostedt (VMware) (15):
>   function_graph: Convert ret_stack to a series of longs
>   fgraph: Use BUILD_BUG_ON() to make sure we have structures divisible by 
> long
>   function_graph: Add an array structure that will allow multiple 
> callbacks
>   function_graph: Allow multiple users to attach to function graph
>   function_graph: Remove logic around ftrace_graph_entry and return
>   ftrace/function_graph: Pass fgraph_ops to function graph callbacks
>   ftrace: Allow function_graph tracer to be enabled in instances
>   ftrace: Allow ftrace startup flags exist without dynamic ftrace
>   function_graph: Have the instances use their own ftrace_ops for 
> filtering
>   function_graph: Add "task variables" per task for fgraph_ops
>   function_graph: Move set_graph_function tests to shadow stack global var
>   function_graph: Move graph depth stored data to shadow stack global var
>   function_graph: Move graph notrace bit to shadow stack global var
>   function_graph: Implement fgraph_reserve_data() and 
> fgraph_retrieve_data()
>   function_graph: Add selftest for passing local variables

Hi Masami,

While reviewing these patches, I realized there are several things I dislike
about the patches I wrote. So I took these patches and started cleaning
them up a little. Mostly renaming functions and adding comments.

As this is a major change to the function graph tracer, and I feel nervous
about building something on top of this, how about I take over these
patches and push them out for the next merge window. I'm hoping to get them
into linux-next by v6.10-rc2 (I spent the day working on them, and it's
mostly minor tweaks).

Then I can push it out to 6.11 and get some good testing against it. Then
we can add your stuff on top and get that merged in 6.12.

If all goes well, I'm hoping to get a series on just these patches (and
your selftest addition) by tonight.

Thoughts?

-- Steve



Re: [PATCH RFC 1/2] dt-bindings: soc: qcom,smsm: Allow specifying mboxes instead of qcom,ipc

2024-05-24 Thread Luca Weiss
On Donnerstag, 23. Mai 2024 08:19:11 MESZ Krzysztof Kozlowski wrote:
> On 23/05/2024 08:16, Luca Weiss wrote:
> > On Donnerstag, 23. Mai 2024 08:02:13 MESZ Krzysztof Kozlowski wrote:
> >> On 22/05/2024 19:34, Luca Weiss wrote:
> >>> On Mittwoch, 22. Mai 2024 08:49:43 MESZ Krzysztof Kozlowski wrote:
>  On 21/05/2024 22:35, Luca Weiss wrote:
> > On Dienstag, 21. Mai 2024 10:58:07 MESZ Krzysztof Kozlowski wrote:
> >> On 20/05/2024 17:11, Luca Weiss wrote:
> >>> Hi Krzysztof
> >>>
> >>> Ack, sounds good.
> >>>
> >>> Maybe also from you, any opinion between these two binding styles?
> >>>
> >>> So first using index of mboxes for the numbering, where for the known
> >>> usages the first element (and sometimes the 3rd - ipc-2) are empty <>.
> >>>
> >>> The second variant is using mbox-names to get the correct channel-mbox
> >>> mapping.
> >>>
> >>> -   qcom,ipc-1 = < 8 13>;
> >>> -   qcom,ipc-2 = < 8 9>;
> >>> -   qcom,ipc-3 = < 8 19>;
> >>> +   mboxes = <0>, < 13>, < 9>, < 19>;
> >>>
> >>> vs.
> >>>
> >>> -   qcom,ipc-1 = < 8 13>;
> >>> -   qcom,ipc-2 = < 8 9>;
> >>> -   qcom,ipc-3 = < 8 19>;
> >>> +   mboxes = < 13>, < 9>, < 19>;
> >>> +   mbox-names = "ipc-1", "ipc-2", "ipc-3";
> >>
> >> Sorry, don't get, ipc-1 is the first mailbox, so why would there be <0>
> >> in first case?
> >
> > Actually not, ipc-0 would be permissible by the driver, used for the 
> > 0th host
> >
> > e.g. from:
> >
> > /* Iterate over all hosts to check whom wants a kick */
> > for (host = 0; host < smsm->num_hosts; host++) {
> hostp = &smsm->hosts[host];
> >
> > Even though no mailbox is specified in any upstream dts for this 0th 
> > host I
> > didn't want the bindings to restrict that, that's why in the first 
> > example
> > there's an empty element (<0>) for the 0th smsm host
> >
> >> Anyway, the question is if you need to know that some
> >> mailbox is missing. But then it is weird to name them "ipc-1" etc.
> >
> > In either case we'd just query the mbox (either by name or index) and 
> > then
> > see if it's there? Not quite sure I understand the sentence..
> > Pretty sure either binding would work the same way.
> 
>  The question is: does the driver care only about having some mailboxes
>  or the driver cares about each specific mailbox? IOW, is skipping ipc-0
>  important for the driver?
> >>>
> >>> There's nothing special from driver side about any mailbox. Some SoCs have
> >>> a mailbox for e.g. hosts 1&2&3, some have only 1&3, and apq8064 even has
> >>> 1&2&3&4.
> >>>
> >>> And if the driver doesn't find a mailbox for a host, it just ignores it
> >>> but then of course it can't 'ring' the mailbox for that host when 
> >>> necessary.
> >>>
> >>> Not sure how much more I can add here, to be fair I barely understand what
> >>> this driver is doing myself apart from the obvious.
> >>
> >> From what you said, it looks like it is enough to just list mailboxes,
> >> e.g. for ipc-1, ipc-2 and ipc-4 (so no ipc-0 and ipc-3):
> > 
> > No, for sure we need also the possibility to list ipc-3.
> 
> ? You can list it, what's the problem?

Maybe we're talking past each other...

You asked why this wouldn't work:

  e.g. for ipc-1, ipc-2 and ipc-4 (so no ipc-0 and ipc-3):
  mboxes = < 13>, < 9>, < 19>;

How would we know that the 3rd mailbox ( 19) is for the 4th host
(previous ipc-4)?

1. If we use mboxes with indexes, we'd need to have <0> placeholder values
for the "smsm hosts" that we don't have a mailbox for - at least
for the 2nd smsm host (qcom,ipc-2) on a bunch of SoCs.

2. If we use mboxes with mbox-names then we could skip that, since we
can directly specify which "smsm host" a given mailbox is for.

My only question really is whether 1. or 2. is a better idea.

Is this clearer now or still not?


> 
> > 
> > And my point is that I'm not sure if any platform will ever need ipc-0, but
> > the code to use that if it ever exists is there - the driver always
> > tries getting an mbox (currently just syscon of course) for every host
> > from 0 to n.
> > 
> > These are the current (non-mbox-API) mboxes provided to smsm:
> > 
> > $ git grep qcom,ipc- arch/
> > arch/arm/boot/dts/qcom/qcom-apq8064.dtsi:   qcom,ipc-1 = < 
> > 8 4>;
> > arch/arm/boot/dts/qcom/qcom-apq8064.dtsi:   qcom,ipc-2 = < 
> > 8 14>;
> > arch/arm/boot/dts/qcom/qcom-apq8064.dtsi:   qcom,ipc-3 = < 
> > 8 23>;
> > arch/arm/boot/dts/qcom/qcom-apq8064.dtsi:   qcom,ipc-4 = 
> > <_sic_non_secure 0x4094 0>;
> > arch/arm/boot/dts/qcom/qcom-msm8974.dtsi:   qcom,ipc-1 = < 
> > 8 13>;
> > arch/arm/boot/dts/qcom/qcom-msm8974.dtsi:   qcom,ipc-2 = < 
> > 8 9>;
> > 

Re: Bug in Kernel 6.8.x, 6.9.x Causing Trace/Panic During Shutdown/Reboot

2024-05-24 Thread Steven Rostedt
On Fri, 24 May 2024 12:50:08 +0200
"Linux regression tracking (Thorsten Leemhuis)"  
wrote:

> > - Affected Versions: Before kernel version 6.8.10, the bug caused a
> > quick display of a kernel trace dump before the shutdown/reboot
> > completed. Starting from version 6.8.10 and continuing into version
> > 6.9.0 and 6.9.1, this issue has escalated to a kernel panic,
> > preventing the shutdown or reboot from completing and leaving the
> > machine stuck.

Ah, I bet it was this commit: baa23a8d4360d ("tracefs: Reset permissions on
remount if permissions are options"), which added a "iput" callback to the
dentry without calling iput, leaving stale inodes around.

This is fixed with:

  0bcfd9aa4dafa ("tracefs: Clear EVENT_INODE flag in tracefs_drop_inode()")

Try adding just that patch. It will at least make it go back to what was
happening before 6.8.10 (I hope!).

-- Steve



Re: Bug in Kernel 6.8.x, 6.9.x Causing Trace/Panic During Shutdown/Reboot

2024-05-24 Thread Steven Rostedt
On Fri, 24 May 2024 12:50:08 +0200
"Linux regression tracking (Thorsten Leemhuis)"  
wrote:

> [CCing a few people]
> 

Thanks for the Cc.

> On 24.05.24 12:31, Ilkka Naulapää wrote:
> > 
> > I have encountered a critical bug in the Linux vanilla kernel that
> > leads to a kernel panic during the shutdown or reboot process. The
> > issue arises after all services, including `journald`, have been
> > stopped. As a result, the machine fails to complete the shutdown or
> > reboot procedure, effectively causing the system to hang and not shut
> > down or reboot.  

To understand this, did you do anything with tracing? Before shutting down,
is there anything in /sys/kernel/tracing/instances directory?
Were any of the files/directories permissions in /sys/kernel/tracing changed?

> 
> Thx for the report. Not my area of expertise, so take this with a grain
> of salt. But given the versions you mention in your report and the
> screenshot that mentioned tracefs_free_inode I suspect this is caused by
> baa23a8d4360d ("tracefs: Reset permissions on remount if permissions are
> options"). A few fixes for it will soon hit mainline and are meant to be
> backported to affected stable trees:
> 
> https://lore.kernel.org/all/20240523212406.254317...@goodmis.org/
> https://lore.kernel.org/all/20240523174419.1e588...@gandalf.local.home/
> 
> You might want to try them – or recheck once they hit the stable trees
> you are about. If they don't work, please report back.

There's been quite a bit of updates in this code, but this looks new to me.
I have more fixes that were just pulled by Linus today.

  https://git.kernel.org/torvalds/c/0eb03c7e8e2a4cc3653eb5eeb2d2001182071215

I'm not sure how relevant that is for this. But if you can reproduce it
with that commit, then this is a new bug.

-- Steve



[PATCH v5 2/2] misc: fastrpc: use coherent pool for untranslated Compute Banks

2024-05-24 Thread Dylan Van Assche
Use fastrpc_remote_heap_alloc to allocate from the FastRPC device
instead of the Compute Bank when the session ID is 0. This ensures
that the allocation is inside the coherent DMA pool which is already
accessible to the DSP. This is necessary to support FastRPC devices
which do not have dedicated Compute Banks such as the SLPI on the SDM845.
The latter uses an allocated CMA region instead of FastRPC Compute Banks.

Signed-off-by: Dylan Van Assche 
Reviewed-by: Caleb Connolly 
---
 drivers/misc/fastrpc.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c
index c06667b29055..f53d20e2e07e 100644
--- a/drivers/misc/fastrpc.c
+++ b/drivers/misc/fastrpc.c
@@ -953,7 +953,10 @@ static int fastrpc_get_args(u32 kernel, struct 
fastrpc_invoke_ctx *ctx)
 
ctx->msg_sz = pkt_size;
 
-   err = fastrpc_buf_alloc(ctx->fl, dev, pkt_size, &ctx->buf);
+   if (ctx->fl->sctx->sid)
+   err = fastrpc_buf_alloc(ctx->fl, dev, pkt_size, &ctx->buf);
+   else
+   err = fastrpc_remote_heap_alloc(ctx->fl, dev, pkt_size, &ctx->buf);
if (err)
return err;
 
-- 
2.45.1




[PATCH v5 1/2] misc: fastrpc: support complete DMA pool access to the DSP

2024-05-24 Thread Dylan Van Assche
To support FastRPC Context Banks which aren't mapped via the SMMU,
make the whole reserved memory region available to the DSP to allow
access to coherent buffers.

This is performed by assigning the memory to the DSP via a hypervisor
call to set the correct permissions for the Virtual Machines on the DSP.
This is only necessary when a memory region is provided for SLPI DSPs
so guard this with a domain ID check.

Signed-off-by: Dylan Van Assche 
Reviewed-by: Caleb Connolly 
---
 drivers/misc/fastrpc.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/drivers/misc/fastrpc.c b/drivers/misc/fastrpc.c
index 4c67e2c5a82e..c06667b29055 100644
--- a/drivers/misc/fastrpc.c
+++ b/drivers/misc/fastrpc.c
@@ -2255,6 +2255,8 @@ static int fastrpc_rpmsg_probe(struct rpmsg_device *rpdev)
int i, err, domain_id = -1, vmcount;
const char *domain;
bool secure_dsp;
+   struct device_node *rmem_node;
+   struct reserved_mem *rmem;
unsigned int vmids[FASTRPC_MAX_VMIDS];
 
err = of_property_read_string(rdev->of_node, "label", &domain);
@@ -2297,6 +2299,23 @@ static int fastrpc_rpmsg_probe(struct rpmsg_device 
*rpdev)
}
}
 
+   rmem_node = of_parse_phandle(rdev->of_node, "memory-region", 0);
+   if (domain_id == SDSP_DOMAIN_ID && rmem_node) {
+   u64 src_perms;
+
+   rmem = of_reserved_mem_lookup(rmem_node);
+   if (!rmem) {
+   err = -EINVAL;
+   goto fdev_error;
+   }
+
+   src_perms = BIT(QCOM_SCM_VMID_HLOS);
+
+   qcom_scm_assign_mem(rmem->base, rmem->size, &src_perms,
+   data->vmperms, data->vmcount);
+
+   }
+
secure_dsp = !(of_property_read_bool(rdev->of_node, 
"qcom,non-secure-domain"));
data->secure = secure_dsp;
 
-- 
2.45.1




[PATCH v5 0/2] misc: fastrpc: FastRPC reserved memory assignment for SDM845 SLPI

2024-05-24 Thread Dylan Van Assche
* About *

The Qualcomm SDM845 SoC has a separate SLPI (Sensor Low Power Island)
DSP for sensors connected to the SoC which is responsible for exposing
sensors to userspace, power saving, and other features. 
While sensors are connected to GPIOs of the SoC, they cannot be used
because the hypervisor blocks direct access to the sensors, thus the 
DSP must be used to access any sensor on this SoC. The SLPI DSP uses a
GLink edge (dsps) to communicate with the host and has a FastRPC interface
to load files from the host filesystem such as sensor configuration files.
The FastRPC interface does not use regular FastRPC Compute Banks
but instead uses an allocated CMA region through which communication happens.

* Changes *

This patch series adds support to the FastRPC driver for assigning a coherent
memory region to a DSP via the hypervisor with the correct permissions.
This is necessary to support the SLPI found in the Qualcomm SDM845 SoC which
does not have dedicated FastRPC Compute Banks, in contrast to newer SoCs,
but uses a memory region instead when allocating buffers.

* Related patches *

1. Remoteproc changes to support the SLPI DSP in SDM845 (v3), needs to be 
applied:
https://lore.kernel.org/linux-remoteproc/20230330164633.117335-1...@dylanvanassche.be
2. DTS changes (v5), already applied:
https://lore.kernel.org/linux-devicetree/20230406173148.28309-1...@dylanvanassche.be

This series does not depend on any of the above series, but all of them are
necessary to enable the feature in the end.

* Changelog *

Changes in v5:
- Adjusted to FastRPC driver changes in 6.9.X.

Changes in v4:
- Fixed possible memory leak when driver encounters an error during probing.

Changes in v3:
- Dropped debug prints.
- Added Reviewed-By tags from v2.

Changes in v2:

- Removed double blank lines
- Dropped dt-bindings property as it is not needed for driver behavior
- Add additional patch to allocate buffers via CMA memory for DSPs
  without dedicated FastRPC Compute Banks.

Dylan Van Assche (2):
  misc: fastrpc: support complete DMA pool access to the DSP
  misc: fastrpc: use coherent pool for untranslated Compute Banks

 drivers/misc/fastrpc.c | 24 +++-
 1 file changed, 23 insertions(+), 1 deletion(-)

-- 
2.45.1




Re: [PATCH v10 03/36] x86: tracing: Add ftrace_regs definition in the header

2024-05-24 Thread Steven Rostedt
On Fri, 24 May 2024 10:37:54 +0900
Masami Hiramatsu (Google)  wrote:
> > >  
> > >  #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
> > >  struct ftrace_regs {
> > > + /*
> > > +  * On the x86_64, the ftrace_regs saves;
> > > +  * rax, rcx, rdx, rdi, rsi, r8, r9, rbp, rip and rsp.
> > > +  * Also orig_ax is used for passing direct trampoline address.
> > > +  * x86_32 doesn't support ftrace_regs.  
> > 
> > Should add a comment that if fregs->regs.cs is set, then all of the pt_regs
> > is valid.  
> 
> But what about rbx and r1*? Only regs->cs should be care for pt_regs?
> Or, did you mean "the ftrace_regs is valid"?

Yeah, on x86_64 ftrace_regs uses regs.cs to denote if it is valid or not:

static __always_inline struct pt_regs *
arch_ftrace_get_regs(struct ftrace_regs *fregs)
{
/* Only when FL_SAVE_REGS is set, cs will be non zero */
if (!fregs->regs.cs)
return NULL;
return &fregs->regs;
}


-- Steve



Re: How to properly fix reading user pointers in bpf in android kernel 4.9?

2024-05-24 Thread Bagas Sanjaya
[also Cc: bpf maintainers and get_maintainer output]

On Thu, May 23, 2024 at 07:52:22PM +0300, Marcel wrote:
> This seems to have been a long-standing problem with the Linux kernel in 
> general. bpf_probe_read should have worked for both kernel and user pointers 
> but it fails with an access error when reading a user one instead. 
> 
> I know there's a patch upstream that fixes this by introducing new helpers 
> for reading kernel and userspace pointers, and I tried to backport them 
> to my kernel but with no success. Tools like bcc fail to use them and instead 
> report that the arguments sent to the helpers are invalid. I assume this 
> is due to the arguments ARG_CONST_STACK_SIZE and ARG_PTR_TO_RAW_STACK handling 
> data differently in the 4.9 android version and the upstream version, but I'm 
> not sure that this is the cause. I left the patch I did below with a link 
> to the kernel I'm working on; maybe someone can take a look and give me a 
> hand (the patch isn't applied yet)

What upstream patch? Has it already been in mainline?

> 
> 
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 744b4763b80e..de94c13b7193 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -559,6 +559,43 @@ enum bpf_func_id {
> */
> BPF_FUNC_probe_read_user,
>  
> +   /**
> +   * int bpf_probe_read_kernel(void *dst, int size, void *src)
> +   * Read a kernel pointer safely.
> +   * Return: 0 on success or negative error
> +   */
> +   BPF_FUNC_probe_read_kernel,
> +
> + /**
> +  * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
> +  * Copy a NUL terminated string from user unsafe address. In case 
> the string
> +  * length is smaller than size, the target is not padded with 
> further NUL
> +  * bytes. In case the string length is larger than size, just 
> count-1
> +  * bytes are copied and the last byte is set to NUL.
> +  * @dst: destination address
> +  * @size: maximum number of bytes to copy, including the trailing 
> NUL
> +  * @unsafe_ptr: unsafe address
> +  * Return:
> +  *   > 0 length of the string including the trailing NUL on success
> +  *   < 0 error
> +  */
> + BPF_FUNC_probe_read_user_str,
> +
> + /**
> +  * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr)
> +  * Copy a NUL terminated string from unsafe address. In case the 
> string
> +  * length is smaller than size, the target is not padded with 
> further NUL
> +  * bytes. In case the string length is larger than size, just 
> count-1
> +  * bytes are copied and the last byte is set to NUL.
> +  * @dst: destination address
> +  * @size: maximum number of bytes to copy, including the trailing 
> NUL
> +  * @unsafe_ptr: unsafe address
> +  * Return:
> +  *   > 0 length of the string including the trailing NUL on success
> +  *   < 0 error
> +  */
> + BPF_FUNC_probe_read_kernel_str,
> +
>   __BPF_FUNC_MAX_ID,
>  };
>  
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index a1e37a5d8c88..3478ca744a45 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -94,7 +94,7 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
>   .arg3_type  = ARG_ANYTHING,
>  };
>  
> -BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, const void *, 
> unsafe_ptr)
> +BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size, const void  __user 
> *, unsafe_ptr)
>  {
>   int ret;
>  
> @@ -115,6 +115,27 @@ static const struct bpf_func_proto 
> bpf_probe_read_user_proto = {
>  };
>  
>  
> +BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size, const void *, 
> unsafe_ptr)
> +{
> + int ret;
> +
> + ret = probe_kernel_read(dst, unsafe_ptr, size);
> + if (unlikely(ret < 0))
> + memset(dst, 0, size);
> +
> + return ret;
> +}
> +
> +static const struct bpf_func_proto bpf_probe_read_kernel_proto = {
> + .func   = bpf_probe_read_kernel,
> + .gpl_only   = true,
> + .ret_type   = RET_INTEGER,
> + .arg1_type  = ARG_PTR_TO_RAW_STACK,
> + .arg2_type  = ARG_CONST_STACK_SIZE,
> + .arg3_type  = ARG_ANYTHING,
> +};
> +
> +
>  BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
>  u32, size)
>  {
> @@ -487,6 +508,69 @@ static const struct bpf_func_proto 
> bpf_probe_read_str_proto = {
>   .arg3_type  = ARG_ANYTHING,
>  };
>  
> +
> +
> +BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size,
> +const void __user *, unsafe_ptr)
> +{
> + int ret;
> +
> + /*
> +  * The strncpy_from_unsafe() call will likely not fill the entire
> +  * buffer, but that's okay in this circumstance as we're probing
> +  * 

Re: Bug in Kernel 6.8.x, 6.9.x Causing Trace/Panic During Shutdown/Reboot

2024-05-24 Thread Linux regression tracking (Thorsten Leemhuis)
[CCing a few people]

On 24.05.24 12:31, Ilkka Naulapää wrote:
> 
> I have encountered a critical bug in the Linux vanilla kernel that
> leads to a kernel panic during the shutdown or reboot process. The
> issue arises after all services, including `journald`, have been
> stopped. As a result, the machine fails to complete the shutdown or
> reboot procedure, effectively causing the system to hang and not shut
> down or reboot.

Thx for the report. Not my area of expertise, so take this with a grain
of salt. But given the versions you mention in your report and the
screenshot that mentioned tracefs_free_inode I suspect this is caused by
baa23a8d4360d ("tracefs: Reset permissions on remount if permissions are
options"). A few fixes for it will soon hit mainline and are meant to be
backported to affected stable trees:

https://lore.kernel.org/all/20240523212406.254317...@goodmis.org/
https://lore.kernel.org/all/20240523174419.1e588...@gandalf.local.home/

You might want to try them – or recheck once they hit the stable trees
you are about. If they don't work, please report back.

Ciao, Thorsten (wearing his 'the Linux kernel's regression tracker' hat)
--
Everything you wanna know about Linux kernel regression tracking:
https://linux-regtracking.leemhuis.info/about/#tldr
If I did something stupid, please tell me, as explained on that page.

> Here are the details of the issue:
> 
> - Affected Versions: Before kernel version 6.8.10, the bug caused a
> quick display of a kernel trace dump before the shutdown/reboot
> completed. Starting from version 6.8.10 and continuing into version
> 6.9.0 and 6.9.1, this issue has escalated to a kernel panic,
> preventing the shutdown or reboot from completing and leaving the
> machine stuck.
> 
> - Symptoms:
>   - In normal shutdown/reboot scenarios, the kernel trace dump briefly
> appears as the last message on the screen.
>   - In rescue mode, the kernel panic message is displayed. Normally it
> is not shown.
> 
> Since `journald` is stopped before this issue occurs, no textual logs
> are available. However, I have captured two pictures illustrating
> these related issues, which I am attaching to this email for your
> reference. Also added my custom kernel config.
> 
> Thank you for your attention to this matter. Please let me know if any
> additional information is required to assist in diagnosing and
> resolving this bug.
> 
> Best regards,
> 
> Ilkka Naulapää



Re: [PATCH] livepatch: introduce klp_func called interface

2024-05-24 Thread zhang warden



> On May 23, 2024, at 22:22, Dan Carpenter  wrote:
> 
> Always run your patches through checkpatch.
> 
> So this patch is so that testers can see if a function has been called?
> Can you not get the same information from gcov or ftrace?
> 
> There are style issues with the patch, but it's not so important until
> the design is agreed on.
> 
> regards,
> dan carpenter

Hi, Dan.

This patch has format issues, as Markus said. A newer version of this patch
has been sent, which was checked with ./scripts/checkpatch.pl.

Thanks for your suggestions.

Regards,
Wardenjohn




Re: [PATCH] livepatch: introduce klp_func called interface

2024-05-24 Thread zhang warden



> On May 21, 2024, at 16:04, Petr Mladek  wrote:
> 
> Another motivation to use ftrace for testing is that it does not
> affect the performance in production.
> 
> We should keep klp_ftrace_handler() as fast as possible so that we
> could livepatch also performance sensitive functions.
> 

How about using unlikely() for the branch test? If we use unlikely(), maybe there
is no negative effect on klp_ftrace_handler() once this function is called.
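
For what it's worth, a minimal sketch of that idea (the "called" flag is the
member proposed by this patch, not an existing kernel field):

/* Hedged sketch inside klp_ftrace_handler(): mark the function as hit
 * while keeping the common path through the handler as cheap as possible. */
if (unlikely(!func->called))
	func->called = true;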

Regards,
Wardenjohn




Re: [RFC PATCH 00/20] Introduce the famfs shared-memory file system

2024-05-24 Thread Miklos Szeredi
On Fri, 24 May 2024 at 02:47, John Groves  wrote:

> Apologies, but I'm short on time at the moment - going into a long holiday
> weekend in the US with family plans. I should be focused again by middle of
> next week.

NP.

Obviously I'll need to test it before anything is merged, other than
that this is not urgent at all...

> But can you check /proc/cmdline to see if the memmap arg got through without
> getting mangled? The '$' tends to get fubar'd. You might need \$, or I've seen
> the need for \\\$. If it's un-mangled, there should be a dax device.

/proc/cmdline shows the option correctly:

root@kvm:~# cat /proc/cmdline
root=/dev/vda console=hvc0 memmap=4G$4G

> If that doesn't work, it's worth trying '!' instead, which I think would give
> you a pmem device - if the arg gets through (but ! is less likely to get
> horked). That pmem device can be converted to devdax...

That doesn't work either.  No device created in /dev  (dax or pmem).

free(1) does show that the reserved memory is gone in both cases, so
something does happen.

Attaching my .config as well.

Thanks,
Miklos


.config
Description: Binary data


[PATCH v4 2/2] LoongArch: Add steal time support in guest side

2024-05-24 Thread Bibo Mao
A percpu struct kvm_steal_time is added here. Its size is 64 bytes and it is
also aligned to 64 bytes, so that the whole structure stays within one
physical page.

When a vcpu is onlined, the function pv_enable_steal_time() is called. This
function passes the guest physical address of struct kvm_steal_time and
tells the hypervisor to enable steal time. When a vcpu is offlined, the
physical address is set to 0, which tells the hypervisor to disable steal
time.

Here is the output of vmstat in the guest when there is a workload on both
the host and the guest. The "st" column shows the steal time statistics.

procs ---memory-- -io -system-- --cpu-
 r  b   swpd   free  inact active   bibo   in   cs us sy id wa st
15  1  0 7583616 184112  72208200  162   52 31  6 43  0 20
17  0  0 7583616 184704  721920 0 6318 6885  5 60  8  5 22
16  0  0 7583616 185392  721440 0 1766 1081  0 49  0  1 50
16  0  0 7583616 184816  723040 0 6300 6166  4 62 12  2 20
18  0  0 7583632 184480  722400 0 2814 1754  2 58  4  1 35

Signed-off-by: Bibo Mao 
---
 .../admin-guide/kernel-parameters.txt |   2 +-
 arch/loongarch/Kconfig|  11 ++
 arch/loongarch/include/asm/paravirt.h |   5 +
 arch/loongarch/kernel/paravirt.c  | 133 ++
 arch/loongarch/kernel/time.c  |   2 +
 5 files changed, 152 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index ef25e06ec08c..3435eaab392b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4057,7 +4057,7 @@
prediction) vulnerability. System may allow data
leaks with this option.
 
-   no-steal-acc[X86,PV_OPS,ARM64,PPC/PSERIES,RISCV,EARLY] Disable
+   no-steal-acc[X86,PV_OPS,ARM64,PPC/PSERIES,RISCV,LOONGARCH,EARLY] 
Disable
paravirtualized steal time accounting. steal time is
computed, but won't influence scheduler behaviour
 
diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 4bda59525a13..a1c4b0080039 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -641,6 +641,17 @@ config PARAVIRT
  over full virtualization.  However, when run without a hypervisor
  the kernel is theoretically slower and slightly larger.
 
+config PARAVIRT_TIME_ACCOUNTING
+   bool "Paravirtual steal time accounting"
+   select PARAVIRT
+   help
+ Select this option to enable fine granularity task steal time
+ accounting. Time spent executing other tasks in parallel with
+ the current vCPU is discounted from the vCPU power. To account for
+ that, there can be a small performance impact.
+
+ If in doubt, say N here.
+
 endmenu
 
 config ARCH_SELECT_MEMORY_MODEL
diff --git a/arch/loongarch/include/asm/paravirt.h 
b/arch/loongarch/include/asm/paravirt.h
index 0965710f47f2..dddec49671ae 100644
--- a/arch/loongarch/include/asm/paravirt.h
+++ b/arch/loongarch/include/asm/paravirt.h
@@ -18,6 +18,7 @@ static inline u64 paravirt_steal_clock(int cpu)
 }
 
 int __init pv_ipi_init(void);
+int __init pv_time_init(void);
 
 #else
 
@@ -26,5 +27,9 @@ static inline int pv_ipi_init(void)
return 0;
 }
 
+static inline int pv_time_init(void)
+{
+   return 0;
+}
 #endif // CONFIG_PARAVIRT
 #endif
diff --git a/arch/loongarch/kernel/paravirt.c b/arch/loongarch/kernel/paravirt.c
index 1633ed4f692f..68f3cd75862a 100644
--- a/arch/loongarch/kernel/paravirt.c
+++ b/arch/loongarch/kernel/paravirt.c
@@ -4,11 +4,14 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
 struct static_key paravirt_steal_enabled;
 struct static_key paravirt_steal_rq_enabled;
+static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
+static int has_steal_clock;
 
 static u64 native_steal_clock(int cpu)
 {
@@ -17,6 +20,57 @@ static u64 native_steal_clock(int cpu)
 
 DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock);
 
+static bool steal_acc = true;
+static int __init parse_no_stealacc(char *arg)
+{
+   steal_acc = false;
+   return 0;
+}
+early_param("no-steal-acc", parse_no_stealacc);
+
+static u64 para_steal_clock(int cpu)
+{
+   u64 steal;
+   struct kvm_steal_time *src;
+   int version;
+
+   src = &per_cpu(steal_time, cpu);
+   do {
+
+   version = src->version;
+   /* Make sure that the version is read before the steal */
+   virt_rmb();
+   steal = src->steal;
+   /* Make sure that the steal is read before the next version */
+   virt_rmb();
+
+   } while ((version & 1) || (version != src->version));
+   return steal;
+}
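
The reader above pairs with a seqcount-style writer on the hypervisor side:
the version is made odd while the record is being updated and even again once
it is consistent, so the guest retries if it sees an odd or changed version.
A minimal sketch of that writer follows; the function name and barriers are
illustrative, not taken from the KVM-side patch.

static void update_steal_time(struct kvm_steal_time *st, u64 new_steal)
{
	st->version += 1;	/* now odd: update in progress */
	smp_wmb();		/* publish the version before the data */
	st->steal = new_steal;
	smp_wmb();		/* publish the data before the final version */
	st->version += 1;	/* even again: record is consistent */
}
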
+
+static int pv_enable_steal_time(void)
+{
+   int cpu = smp_processor_id();
+   struct kvm_steal_time *st;
+   unsigned 

[PATCH v4 1/2] LoongArch: KVM: Add steal time support in kvm side

2024-05-24 Thread Bibo Mao
Steal time support is added on the KVM side. A VM can query the features
provided by the KVM hypervisor, and a new feature bit, KVM_FEATURE_STEAL_TIME,
is reported there. As on x86, the steal time structure is kept in guest
memory, and a new hypercall function, KVM_HCALL_FUNC_NOTIFY, lets the guest
ask KVM to enable the feature.

One vCPU attribute ioctl command, KVM_LOONGARCH_VCPU_PVTIME_CTRL, is added to
save and restore the base address of the steal time structure when a VM is
migrated.
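
To illustrate the migration flow, a user-space VMM could save the steal time
GPA on the source and restore it on the destination through the generic vCPU
device-attribute ioctls together with the new KVM_LOONGARCH_VCPU_PVTIME_CTRL
group. The sketch below is only an illustration under that assumption; error
handling and vCPU fd setup are omitted, and it is not taken from any existing
VMM.

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustrative only: save/restore the per-vCPU steal time GPA. */
static int pvtime_get_gpa(int vcpu_fd, uint64_t *gpa)
{
	struct kvm_device_attr attr = {
		.group = KVM_LOONGARCH_VCPU_PVTIME_CTRL,
		.attr  = KVM_LOONGARCH_VCPU_PVTIME_GPA,
		.addr  = (uint64_t)(uintptr_t)gpa,
	};

	return ioctl(vcpu_fd, KVM_GET_DEVICE_ATTR, &attr);
}

static int pvtime_set_gpa(int vcpu_fd, uint64_t *gpa)
{
	struct kvm_device_attr attr = {
		.group = KVM_LOONGARCH_VCPU_PVTIME_CTRL,
		.attr  = KVM_LOONGARCH_VCPU_PVTIME_GPA,
		.addr  = (uint64_t)(uintptr_t)gpa,
	};

	return ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);
}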

Signed-off-by: Bibo Mao 
---
 arch/loongarch/include/asm/kvm_host.h  |   7 ++
 arch/loongarch/include/asm/kvm_para.h  |  10 ++
 arch/loongarch/include/asm/kvm_vcpu.h  |   4 +
 arch/loongarch/include/asm/loongarch.h |   1 +
 arch/loongarch/include/uapi/asm/kvm.h  |   4 +
 arch/loongarch/kvm/Kconfig |   1 +
 arch/loongarch/kvm/exit.c  |  38 +++-
 arch/loongarch/kvm/vcpu.c  | 124 +
 8 files changed, 187 insertions(+), 2 deletions(-)

diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h
index c87b6ea0ec47..2eb2f7572023 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -30,6 +30,7 @@
 #define KVM_PRIVATE_MEM_SLOTS  0
 
 #define KVM_HALT_POLL_NS_DEFAULT   50
+#define KVM_REQ_STEAL_UPDATE   KVM_ARCH_REQ(1)
 
 #define KVM_GUESTDBG_SW_BP_MASK\
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)
@@ -201,6 +202,12 @@ struct kvm_vcpu_arch {
struct kvm_mp_state mp_state;
/* cpucfg */
u32 cpucfg[KVM_MAX_CPUCFG_REGS];
+   /* paravirt steal time */
+   struct {
+   u64 guest_addr;
+   u64 last_steal;
+   struct gfn_to_hva_cache cache;
+   } st;
 };
 
 static inline unsigned long readl_sw_gcsr(struct loongarch_csrs *csr, int reg)
diff --git a/arch/loongarch/include/asm/kvm_para.h b/arch/loongarch/include/asm/kvm_para.h
index 4ba2312e5f8c..a9ba8185d4af 100644
--- a/arch/loongarch/include/asm/kvm_para.h
+++ b/arch/loongarch/include/asm/kvm_para.h
@@ -14,6 +14,7 @@
 
 #define KVM_HCALL_SERVICE	HYPERCALL_ENCODE(HYPERVISOR_KVM, KVM_HCALL_CODE_SERVICE)
 #define  KVM_HCALL_FUNC_IPI	1
+#define  KVM_HCALL_FUNC_NOTIFY 2
 
 #define KVM_HCALL_SWDBG	HYPERCALL_ENCODE(HYPERVISOR_KVM, KVM_HCALL_CODE_SWDBG)
 
@@ -24,6 +25,15 @@
 #define KVM_HCALL_INVALID_CODE -1UL
 #define KVM_HCALL_INVALID_PARAMETER	-2UL
 
+#define KVM_STEAL_PHYS_VALID   BIT_ULL(0)
+#define KVM_STEAL_PHYS_MASK	GENMASK_ULL(63, 6)
+struct kvm_steal_time {
+   __u64 steal;
+   __u32 version;
+   __u32 flags;
+   __u32 pad[12];
+};
+
 /*
  * Hypercall interface for KVM hypervisor
  *
diff --git a/arch/loongarch/include/asm/kvm_vcpu.h b/arch/loongarch/include/asm/kvm_vcpu.h
index 590a92cb5416..d7e51300a89f 100644
--- a/arch/loongarch/include/asm/kvm_vcpu.h
+++ b/arch/loongarch/include/asm/kvm_vcpu.h
@@ -120,4 +120,8 @@ static inline void kvm_write_reg(struct kvm_vcpu *vcpu, int num, unsigned long v
vcpu->arch.gprs[num] = val;
 }
 
+static inline bool kvm_pvtime_supported(void)
+{
+   return !!sched_info_on();
+}
 #endif /* __ASM_LOONGARCH_KVM_VCPU_H__ */
diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h
index eb09adda54b7..7a4633ef284b 100644
--- a/arch/loongarch/include/asm/loongarch.h
+++ b/arch/loongarch/include/asm/loongarch.h
@@ -169,6 +169,7 @@
 #define  KVM_SIGNATURE "KVM\0"
 #define CPUCFG_KVM_FEATURE (CPUCFG_KVM_BASE + 4)
 #define  KVM_FEATURE_IPI   BIT(1)
+#define  KVM_FEATURE_STEAL_TIME	BIT(2)
 
 #ifndef __ASSEMBLY__
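
The new KVM_FEATURE_STEAL_TIME bit is what the guest is expected to probe,
after the usual KVM signature check, before issuing the notify hypercall. A
hedged sketch of such a probe, assuming the read_cpucfg() accessor from this
header:

/* Illustrative guest-side probe; the KVM_SIGNATURE check is assumed to
 * have been done already. */
static bool kvm_steal_time_supported(void)
{
	return !!(read_cpucfg(CPUCFG_KVM_FEATURE) & KVM_FEATURE_STEAL_TIME);
}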
 
diff --git a/arch/loongarch/include/uapi/asm/kvm.h b/arch/loongarch/include/uapi/asm/kvm.h
index f9abef382317..ddc5cab0ffd0 100644
--- a/arch/loongarch/include/uapi/asm/kvm.h
+++ b/arch/loongarch/include/uapi/asm/kvm.h
@@ -81,7 +81,11 @@ struct kvm_fpu {
 #define LOONGARCH_REG_64(TYPE, REG)	(TYPE | KVM_REG_SIZE_U64 | (REG << LOONGARCH_REG_SHIFT))
 #define KVM_IOC_CSRID(REG)		LOONGARCH_REG_64(KVM_REG_LOONGARCH_CSR, REG)
 #define KVM_IOC_CPUCFG(REG)		LOONGARCH_REG_64(KVM_REG_LOONGARCH_CPUCFG, REG)
+
+/* Device Control API on vcpu fd */
 #define KVM_LOONGARCH_VCPU_CPUCFG  0
+#define KVM_LOONGARCH_VCPU_PVTIME_CTRL 1
+#define  KVM_LOONGARCH_VCPU_PVTIME_GPA 0
 
 struct kvm_debug_exit_arch {
 };
diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig
index c4ef2b4d9797..248744b4d086 100644
--- a/arch/loongarch/kvm/Kconfig
+++ b/arch/loongarch/kvm/Kconfig
@@ -29,6 +29,7 @@ config KVM
select KVM_MMIO
select HAVE_KVM_READONLY_MEM
select KVM_XFER_TO_GUEST_WORK
+   select SCHED_INFO
help
  Support hosting virtualized guest machines using
  hardware virtualization extensions. You will need
diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c

[PATCH v4 0/2] LoongArch: Add steal time support

2024-05-24 Thread Bibo Mao
The para-virt steal time feature is added on both the KVM side and the guest
kernel side. As on other architectures, the steal time structure is kept in
guest memory, and a pseudo register is used to save/restore the base address
of that structure, so that when a VM is migrated the KVM module on the target
machine knows where the steal time data lives.

---
v3 ... v4:
  1. To resolve a compile dependency problem, select the SCHED_INFO option
     when KVM is enabled.
  2. Put the newly added option PARAVIRT_TIME_ACCOUNTING in the same submenu
     as PARAVIRT in arch/loongarch/Kconfig.

v2 ... v3:
  1. Resolve code conflicts against kernel 6.9.0.
  2. Add support for the kernel parameter no-steal-acc on LoongArch and
     document it in Documentation/admin-guide/kernel-parameters.txt.
  3. Add strict checking of the PV steal time GPA address in
     kvm_save_notify() and kvm_loongarch_pvtime_set_attr().

v1 ... v2:
  1. Add the PARAVIRT_TIME_ACCOUNTING Kconfig option to
     arch/loongarch/Kconfig.
  2. Rename functions, e.g. replace pv_register_steal_time with
     pv_enable_steal_time.

---

Bibo Mao (2):
  LoongArch: KVM: Add steal time support in kvm side
  LoongArch: Add steal time support in guest side

 .../admin-guide/kernel-parameters.txt |   2 +-
 arch/loongarch/Kconfig|  11 ++
 arch/loongarch/include/asm/kvm_host.h |   7 +
 arch/loongarch/include/asm/kvm_para.h |  10 ++
 arch/loongarch/include/asm/kvm_vcpu.h |   4 +
 arch/loongarch/include/asm/loongarch.h|   1 +
 arch/loongarch/include/asm/paravirt.h |   5 +
 arch/loongarch/include/uapi/asm/kvm.h |   4 +
 arch/loongarch/kernel/paravirt.c  | 133 ++
 arch/loongarch/kernel/time.c  |   2 +
 arch/loongarch/kvm/Kconfig|   1 +
 arch/loongarch/kvm/exit.c |  38 -
 arch/loongarch/kvm/vcpu.c | 124 
 13 files changed, 339 insertions(+), 3 deletions(-)


base-commit: 6e51b4b5bbc07e52b226017936874715629932d1
-- 
2.39.3