[PATCH v3 1/1] arm64: Implement stack trace termination record

2021-04-20 Thread madvenka
From: "Madhavan T. Venkataraman" 

Reliable stacktracing requires that we identify when a stacktrace is
terminated early. We can do this by ensuring all tasks have a final
frame record at a known location on their task stack, and checking
that this is the final frame record in the chain.

Kernel Tasks
============

All tasks except the idle task have a pt_regs structure right after the
task stack. This is called the task pt_regs. The pt_regs structure has a
special stackframe field. Make this stackframe field the final frame in the
task stack. This needs to be done in copy_thread() which initializes a new
task's pt_regs and initial CPU context.
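
As a rough illustration of that step (a sketch based on the description
above, not the verbatim process.c hunk, which is not visible in the
truncated diff below; the helper name is hypothetical), copy_thread() only
needs to point the new task's saved frame pointer at the stackframe field
of its pt_regs:

	/*
	 * Sketch: make childregs->stackframe the final frame record of a
	 * newly created task. childregs is task_pt_regs(p) and its
	 * stackframe field is expected to hold {0, 0}.
	 */
	static void set_task_final_frame(struct task_struct *p,
					 struct pt_regs *childregs)
	{
		p->thread.cpu_context.fp = (unsigned long)childregs->stackframe;
	}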

For the idle task, there is no task pt_regs. For our purpose, we need one.
So, create a pt_regs just like other kernel tasks and make
pt_regs->stackframe the final frame in the idle task stack. This needs to be
done at two places:

- On the primary CPU, the boot task runs. It calls start_kernel()
  and eventually becomes the idle task for the primary CPU. Just
  before start_kernel() is called, set up the final frame.

- On each secondary CPU, a startup task runs that calls
  secondary_start_kernel() and eventually becomes the idle task
  on the secondary CPU. Just before secondary_start_kernel() is
  called, set up the final frame.

User Tasks
==========

User tasks are initially set up like kernel tasks when they are created.
Then, they return to userland after fork via ret_from_fork(). After that,
they enter the kernel only on an EL0 exception. (In arm64, system calls are
also EL0 exceptions). The EL0 exception handler stores state in the task
pt_regs and calls different functions based on the type of exception. The
stack trace for an EL0 exception must end at the task pt_regs. So, make
task pt_regs->stackframe the final frame in the EL0 exception stack.

In summary, task pt_regs->stackframe is where a successful stack trace ends.

Stack trace termination
=======================

In the unwinder, terminate the stack trace successfully when
task_pt_regs(task)->stackframe is reached. For stack traces in the kernel,
this will correctly terminate the stack trace at the right place.
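
The corresponding stacktrace.c hunk is not visible in the truncated diff
below; based on the description above, the check could look roughly like
the following sketch (the helper name and the exact return convention are
assumptions):

	/* Sketch: does fp point at the final frame record of tsk's stack? */
	static bool unwind_is_final_frame(struct task_struct *tsk,
					  unsigned long fp)
	{
		return fp == (unsigned long)task_pt_regs(tsk)->stackframe;
	}

unwind_frame() would then stop unwinding and report successful termination
(for example by returning -ENOENT to its callers) once this is true for the
frame pointer it is about to follow.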

However, debuggers may terminate the stack trace when FP == 0. In the
pt_regs->stackframe, the PC is 0 as well. So, stack traces taken in the
debugger may print an extra record 0x0 at the end. While this is not
pretty, it does no harm. This is a small price to pay for having
reliable stack trace termination in the kernel. That said, gdb does not
show the extra record, probably because it uses DWARF rather than frame
pointers for stack traces.

Reviewed-by: Mark Brown 
Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/kernel/entry.S  |  8 +---
 arch/arm64/kernel/head.S   | 29 +++--
 arch/arm64/kernel/process.c|  5 +
 arch/arm64/kernel/stacktrace.c | 10 +-
 4 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 6acfc5e6b5e0..e677b9a2b8f8 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -263,16 +263,18 @@ alternative_else_nop_endif
stp lr, x21, [sp, #S_LR]
 
/*
-* For exceptions from EL0, terminate the callchain here.
+* For exceptions from EL0, terminate the callchain here at
+* task_pt_regs(current)->stackframe.
+*
 * For exceptions from EL1, create a synthetic frame record so the
 * interrupted code shows up in the backtrace.
 */
.if \el == 0
-   mov x29, xzr
+   stp xzr, xzr, [sp, #S_STACKFRAME]
.else
stp x29, x22, [sp, #S_STACKFRAME]
-   add x29, sp, #S_STACKFRAME
.endif
+   add x29, sp, #S_STACKFRAME
 
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
 alternative_if_not ARM64_HAS_PAN
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 840bda1869e9..743c019a42c7 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -393,6 +393,23 @@ SYM_FUNC_START_LOCAL(__create_page_tables)
ret x28
 SYM_FUNC_END(__create_page_tables)
 
+   /*
+* The boot task becomes the idle task for the primary CPU. The
+* CPU startup task on each secondary CPU becomes the idle task
+* for the secondary CPU.
+*
+* The idle task does not require pt_regs. But create a dummy
+* pt_regs so that task_pt_regs(idle_task)->stackframe can be
+* set up to be the final frame on the idle task stack just like
+* all the other kernel tasks. This helps the unwinder to
+* terminate the stack trace at a well-known stack offset.
+*/
+   .macro setup_final_frame
+   sub sp, sp, #PT_REGS_SIZE
+   stp xzr, xzr, [sp, #S_STACKFRAME]
+   add x29, sp, #S_STACKFRAME
+   .endm
+
 /*
 * The following fragment of code is executed with the MMU enabled.

[PATCH v3 0/1] arm64: Implement stack trace termination record

2021-04-20 Thread madvenka
From: "Madhavan T. Venkataraman" 

Reliable stacktracing requires that we identify when a stacktrace is
terminated early. We can do this by ensuring all tasks have a final
frame record at a known location on their task stack, and checking
that this is the final frame record in the chain.

All tasks have a pt_regs structure right after the task stack in the stack
page. The pt_regs structure contains a stackframe field. Make this stackframe
field the final frame in the task stack so all stack traces end at a fixed
stack offset.

For kernel tasks, this is simple to understand. For user tasks, there is
some extra detail. User tasks get created via fork() et al. Once they return
from fork, they enter the kernel only on an EL0 exception. In arm64,
system calls are also EL0 exceptions.

The EL0 exception handler uses the task pt_regs mentioned above to save
register state and call different exception functions. All stack traces
from EL0 exception code must end at the pt_regs. So, make pt_regs->stackframe
the final frame in the EL0 exception stack.

To summarize, task_pt_regs(task)->stackframe will always be the final frame
in a stack trace.

Sample stack traces
===================

Only the last couple of frames of each stack trace are shown below, to
illustrate how the stack trace ends.

Primary CPU idle task
=====================

 ...
[0.077109]   rest_init+0x108/0x144
[0.077188]   arch_call_rest_init+0x18/0x24
[0.077220]   start_kernel+0x3ac/0x3e4
[0.077293]   __primary_switched+0xac/0xb0

Secondary CPU idle task
=======================

...
[0.077264]   secondary_start_kernel+0x228/0x388
[0.077326]   __secondary_switched+0x80/0x84

Sample kernel thread
====================

 ...
[   24.543250]   kernel_init+0xa4/0x164
[   24.561850]   ret_from_fork+0x10/0x18

Write system call (EL0 exception)
=================================

(using a test driver called callfd)

[ 1160.628723]   callfd_stack+0x3c/0x70
[ 1160.628768]   callfd_op+0x35c/0x3a8
[ 1160.628791]   callfd_write+0x5c/0xc8
[ 1160.628813]   vfs_write+0x104/0x3b8
[ 1160.628837]   ksys_write+0xd0/0x188
[ 1160.628859]   __arm64_sys_write+0x4c/0x60
[ 1160.628883]   el0_svc_common.constprop.0+0xa8/0x240
[ 1160.628904]   do_el0_svc+0x40/0xa8
[ 1160.628921]   el0_svc+0x2c/0x78
[ 1160.628942]   el0_sync_handler+0xb0/0xb8
[ 1160.628962]   el0_sync+0x17c/0x180

NULL pointer dereference exception (EL1 exception)
==================================================

[ 1160.637984]   callfd_stack+0x3c/0x70
[ 1160.638015]   die_kernel_fault+0x80/0x108
[ 1160.638042]   do_page_fault+0x520/0x600
[ 1160.638075]   do_translation_fault+0xa8/0xdc
[ 1160.638102]   do_mem_abort+0x68/0x100
[ 1160.638120]   el1_abort+0x40/0x60
[ 1160.638138]   el1_sync_handler+0xac/0xc8
[ 1160.638157]   el1_sync+0x74/0x100
[ 1160.638174]   0x0   <=== NULL pointer dereference
[ 1160.638189]   callfd_write+0x5c/0xc8
[ 1160.638211]   vfs_write+0x104/0x3b8
[ 1160.638234]   ksys_write+0xd0/0x188
[ 1160.638278]   __arm64_sys_write+0x4c/0x60
[ 1160.638325]   el0_svc_common.constprop.0+0xa8/0x240
[ 1160.638358]   do_el0_svc+0x40/0xa8
[ 1160.638379]   el0_svc+0x2c/0x78
[ 1160.638409]   el0_sync_handler+0xb0/0xb8
[ 1160.638452]   el0_sync+0x17c/0x180

Timer interrupt (EL1 exception)
===============================

Secondary CPU idle task interrupted by the timer interrupt:

[ 1160.702949] callfd_callback:
[ 1160.703006]   callfd_stack+0x3c/0x70
[ 1160.703060]   callfd_callback+0x30/0x40
[ 1160.703087]   call_timer_fn+0x48/0x220
[ 1160.703113]   run_timer_softirq+0x7cc/0xc70
[ 1160.703144]   __do_softirq+0x1ec/0x608
[ 1160.703166]   irq_exit+0x138/0x180
[ 1160.703193]   __handle_domain_irq+0x8c/0xf0
[ 1160.703218]   gic_handle_irq+0xec/0x410
[ 1160.703253]   el1_irq+0xc0/0x180
[ 1160.703278]   arch_local_irq_enable+0xc/0x28
[ 1160.703329]   default_idle_call+0x54/0x1d8
[ 1160.703355]   do_idle+0x2d8/0x350
[ 1160.703388]   cpu_startup_entry+0x2c/0x98
[ 1160.703412]   secondary_start_kernel+0x238/0x388
[ 1160.703446]   __secondary_switched+0x80/0x84
---
Changelog:

v3:
- Added Reviewed-by: Mark Brown .
- Fixed an extra space after a cast reported by checkpatch --strict.
- Synced with mainline tip.

v2:
- Changed some wordings as suggested by Mark Rutland.
- Removed the synthetic return PC for idle tasks. Changed the
  branches to start_kernel() and secondary_start_kernel() to
  calls so that they will have a proper return PC.

v1:
- Set up task_pt_regs(current)->stackframe as the final frame
  when a new task is initialized in copy_thread().
- Create pt_regs for the idle tasks and set up pt_regs->stackframe
  as the final frame for the idle tasks.
- Set up task_pt_regs(current)->stackframe as the final frame in
  the EL0 exception handler so the EL0 exception stack trace ends
  there.
- Terminate the stack trace successfully in unwind_frame() when
  the FP reaches task_pt_regs(current)->stackframe.

[RFC PATCH v2 4/4] arm64: Mark stack trace as unreliable if kretprobed functions are present

2021-04-05 Thread madvenka
From: "Madhavan T. Venkataraman" 

When a kretprobe is active for a function, the function's return address
in its stack frame is modified to point to the kretprobe trampoline. When
the function returns, the frame is popped and control is transferred
to the trampoline. The trampoline eventually returns to the original return
address.

If a stack walk is done within the function (or any functions that get
called from there), the stack trace will only show the trampoline and
not the original caller.

Also, if the trampoline itself and the functions it calls do a stack trace,
that stack trace will also have the same problem. Detect this as well.

If the trampoline is detected in the stack trace, mark the stack trace
as unreliable.

Reviewed-by: Mark Brown 
Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/kernel/stacktrace.c | 37 ++
 1 file changed, 37 insertions(+)

diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 7a3c638d4aeb..926bd91ffb3f 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -97,6 +97,36 @@ struct function_range {
  * if return_to_handler is detected on the stack.
  *
  * NOTE: The unwinder must do (1) before (2).
+ *
+ * KPROBES
+ * =======
+ *
+ * There are two types of kprobes:
+ *
+ * (1) Regular kprobes that are placed anywhere in a probed function.
+ * This is implemented by replacing the probed instruction with a
+ * breakpoint. When the breakpoint is hit, the kprobe code emulates
+ * the original instruction in-situ and returns to the next
+ * instruction.
+ *
+ * Breakpoints are EL1 exceptions. When the unwinder detects them,
+ * the stack trace is marked as unreliable as it does not know where
+ * exactly the exception happened. Detection of EL1 exceptions in
+ * a stack trace will be done separately.
+ *
+ * (2) Return kprobes that are placed on the return of a probed function.
+ * In this case, Kprobes sets up an initial breakpoint at the
+ * beginning of the probed function. When the breakpoint is hit,
+ * Kprobes replaces the return address in the stack frame with the
+ * kretprobe_trampoline and records the original return address.
+ * When the probed function returns, control goes to the trampoline
+ * which eventually returns to the original return address.
+ *
+ * Stack traces taken while in the probed function or while in the
+ * trampoline will show kretprobe_trampoline instead of the original
+ * return address. Detect this and mark the stack trace unreliable.
+ * The detection is done by checking if the return PC falls anywhere
+ * in kretprobe_trampoline.
  */
 static struct function_range   special_functions[] = {
/*
@@ -125,6 +155,13 @@ static struct function_range   special_functions[] = {
{ (unsigned long) return_to_handler, 0 },
 #endif
 
+   /*
+* Kprobe trampolines.
+*/
+#ifdef CONFIG_KRETPROBES
+   { (unsigned long) kretprobe_trampoline, 0 },
+#endif
+
{ /* sentinel */ }
 };
 
-- 
2.25.1



[RFC PATCH v2 3/4] arm64: Detect FTRACE cases that make the stack trace unreliable

2021-04-05 Thread madvenka
From: "Madhavan T. Venkataraman" 

When CONFIG_DYNAMIC_FTRACE_WITH_REGS is enabled and tracing is activated
for a function, the ftrace infrastructure is called for the function at
the very beginning. Ftrace creates two frames:

- One for the traced function

- One for the caller of the traced function

That gives a reliable stack trace while executing in the ftrace code. When
ftrace returns to the traced function, the frames are popped and everything
is back to normal.

However, in cases like live patch, a tracer function may redirect execution
to a different function when it returns. A stack trace taken while still in
the tracer function will not show the target function. The target function
is the real function that we want to track.

So, if an FTRACE frame is detected on the stack, just mark the stack trace
as unreliable. The detection is done by checking the return PC against
FTRACE trampolines.

Also, the Function Graph Tracer modifies the return address of a traced
function to a return trampoline to gather tracing data on function return.
Stack traces taken from that trampoline and functions it calls are
unreliable as the original return address may not be available in
that context. Mark the stack trace unreliable accordingly.

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/kernel/entry-ftrace.S | 12 +++
 arch/arm64/kernel/stacktrace.c   | 61 
 2 files changed, 73 insertions(+)

diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S
index b3e4f9a088b1..1f0714a50c71 100644
--- a/arch/arm64/kernel/entry-ftrace.S
+++ b/arch/arm64/kernel/entry-ftrace.S
@@ -86,6 +86,18 @@ SYM_CODE_START(ftrace_caller)
b   ftrace_common
 SYM_CODE_END(ftrace_caller)
 
+/*
+ * A stack trace taken from anywhere in the FTRACE trampoline code should be
+ * considered unreliable as a tracer function (patched at ftrace_call) could
+ * potentially set pt_regs->pc and redirect execution to a function different
+ * than the traced function. E.g., livepatch.
+ *
+ * No stack traces are taken in this FTRACE trampoline assembly code. But
+ * they can be taken from C functions that get called from here. The unwinder
+ * checks if a return address falls in this FTRACE trampoline code. See
+ * stacktrace.c. If function calls in this code are changed, please keep the
+ * special_functions[] array in stacktrace.c in sync.
+ */
 SYM_CODE_START(ftrace_common)
sub x0, x30, #AARCH64_INSN_SIZE // ip (callsite's BL insn)
mov x1, x9  // parent_ip (callsite's LR)
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index fb11e4372891..7a3c638d4aeb 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -51,6 +51,52 @@ struct function_range {
  * unreliable. Breakpoints are used for executing probe code. Stack traces
  * taken while in the probe code will show an EL1 frame and will be considered
  * unreliable. This is correct behavior.
+ *
+ * FTRACE
+ * ======
+ *
+ * When CONFIG_DYNAMIC_FTRACE_WITH_REGS is enabled, the FTRACE trampoline code
+ * is called from a traced function even before the frame pointer prolog.
+ * FTRACE sets up two stack frames (one for the traced function and one for
+ * its caller) so that the unwinder can provide a sensible stack trace for
+ * any tracer function called from the FTRACE trampoline code.
+ *
+ * There are two cases where the stack trace is not reliable.
+ *
+ * (1) The task gets preempted before the two frames are set up. Preemption
+ * involves an interrupt which is an EL1 exception. The unwinder already
+ * handles EL1 exceptions.
+ *
+ * (2) The tracer function that gets called by the FTRACE trampoline code
+ * changes the return PC (e.g., livepatch).
+ *
+ * Not all tracer functions do that. But to err on the side of safety,
+ * consider the stack trace as unreliable in all cases.
+ *
+ * When Function Graph Tracer is used, FTRACE modifies the return address of
+ * the traced function in its stack frame to an FTRACE return trampoline
+ * (return_to_handler). When the traced function returns, control goes to
+ * return_to_handler. return_to_handler calls FTRACE to gather tracing data
+ * and to obtain the original return address. Then, return_to_handler returns
+ * to the original return address.
+ *
+ * There are two cases to consider from a stack trace reliability point of
+ * view:
+ *
+ * (1) Stack traces taken within the traced function (and functions that get
+ * called from there) will show return_to_handler instead of the original
+ * return address. The original return address can be obtained from FTRACE.
+ * The unwinder already obtains it and modifies the return PC in its copy
+ * of the stack frame to the original return address. So, this is handled.
+ *
+ * (2) return_to_handler calls FTRACE as mentioned before. FTRACE discards
+ * the record of the 

[RFC PATCH v2 2/4] arm64: Mark a stack trace unreliable if an EL1 exception frame is detected

2021-04-05 Thread madvenka
From: "Madhavan T. Venkataraman" 

EL1 exceptions can happen on any instruction including instructions in
the frame pointer prolog or epilog. Depending on where exactly they happen,
they could render the stack trace unreliable.

If an EL1 exception frame is found on the stack, mark the stack trace as
unreliable. Now, the EL1 exception frame is not at any well-known offset
on the stack. It can be anywhere on the stack. In order to properly detect
an EL1 exception frame, the return address must be checked against all of
the possible EL1 exception handlers.
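
Based on that handler list (also spelled out in the cover letter), the
special_functions[] additions for this patch would look roughly like the
entries below. This is a sketch in the style of the array used by this
series, not the verbatim hunk, which is cut off in the diff below:

	/*
	 * EL1 exception handlers from arch/arm64/kernel/entry.S. A return
	 * PC inside any of these means an EL1 exception frame is on the
	 * stack, so the stack trace is marked unreliable.
	 */
	{ (unsigned long) el1_sync, 0 },
	{ (unsigned long) el1_irq, 0 },
	{ (unsigned long) el1_error, 0 },
	{ (unsigned long) el1_sync_invalid, 0 },
	{ (unsigned long) el1_irq_invalid, 0 },
	{ (unsigned long) el1_fiq_invalid, 0 },
	{ (unsigned long) el1_error_invalid, 0 },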

Preemption
==========

Interrupts encountered in kernel code are also EL1 exceptions. At the end
of an interrupt, the interrupt handler checks if the current task must be
preempted for any reason. If so, it calls the preemption code which takes
the task off the CPU. A stack trace taken on the task after the preemption
will show the EL1 frame and will be considered unreliable. This is correct
behavior as preemption can happen practically at any point in code.

Probing
=======

Breakpoints encountered in kernel code are also EL1 exceptions. The probing
infrastructure uses breakpoints for executing probe code. While in the probe
code, the stack trace will show an EL1 frame and will be considered
unreliable. This is also correct behavior.

Reviewed-by: Mark Brown 
Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/include/asm/exception.h |  8 +++
 arch/arm64/kernel/entry.S  | 14 +--
 arch/arm64/kernel/stacktrace.c | 37 ++
 3 files changed, 52 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 6546158d2f2d..4ebd2390ef54 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -35,6 +35,14 @@ asmlinkage void el1_sync_handler(struct pt_regs *regs);
 asmlinkage void el0_sync_handler(struct pt_regs *regs);
 asmlinkage void el0_sync_compat_handler(struct pt_regs *regs);
 
+asmlinkage void el1_sync(void);
+asmlinkage void el1_irq(void);
+asmlinkage void el1_error(void);
+asmlinkage void el1_sync_invalid(void);
+asmlinkage void el1_irq_invalid(void);
+asmlinkage void el1_fiq_invalid(void);
+asmlinkage void el1_error_invalid(void);
+
 asmlinkage void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs);
 asmlinkage void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs);
 asmlinkage void enter_from_user_mode(void);
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index a31a0a713c85..9fe3aaeff019 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -630,19 +630,19 @@ SYM_CODE_START_LOCAL(el0_fiq_invalid_compat)
 SYM_CODE_END(el0_fiq_invalid_compat)
 #endif
 
-SYM_CODE_START_LOCAL(el1_sync_invalid)
+SYM_CODE_START(el1_sync_invalid)
inv_entry 1, BAD_SYNC
 SYM_CODE_END(el1_sync_invalid)
 
-SYM_CODE_START_LOCAL(el1_irq_invalid)
+SYM_CODE_START(el1_irq_invalid)
inv_entry 1, BAD_IRQ
 SYM_CODE_END(el1_irq_invalid)
 
-SYM_CODE_START_LOCAL(el1_fiq_invalid)
+SYM_CODE_START(el1_fiq_invalid)
inv_entry 1, BAD_FIQ
 SYM_CODE_END(el1_fiq_invalid)
 
-SYM_CODE_START_LOCAL(el1_error_invalid)
+SYM_CODE_START(el1_error_invalid)
inv_entry 1, BAD_ERROR
 SYM_CODE_END(el1_error_invalid)
 
@@ -650,7 +650,7 @@ SYM_CODE_END(el1_error_invalid)
  * EL1 mode handlers.
  */
.align  6
-SYM_CODE_START_LOCAL_NOALIGN(el1_sync)
+SYM_CODE_START_NOALIGN(el1_sync)
kernel_entry 1
mov x0, sp
bl  el1_sync_handler
@@ -658,7 +658,7 @@ SYM_CODE_START_LOCAL_NOALIGN(el1_sync)
 SYM_CODE_END(el1_sync)
 
.align  6
-SYM_CODE_START_LOCAL_NOALIGN(el1_irq)
+SYM_CODE_START_NOALIGN(el1_irq)
kernel_entry 1
gic_prio_irq_setup pmr=x20, tmp=x1
enable_da_f
@@ -737,7 +737,7 @@ el0_irq_naked:
b   ret_to_user
 SYM_CODE_END(el0_irq)
 
-SYM_CODE_START_LOCAL(el1_error)
+SYM_CODE_START(el1_error)
kernel_entry 1
mrs x1, esr_el1
gic_prio_kentry_setup tmp=x2
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 557657d6e6bd..fb11e4372891 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -14,6 +14,7 @@
 #include 
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -25,8 +26,44 @@ struct function_range {
 
 /*
  * Special functions where the stack trace is unreliable.
+ *
+ * EL1 exceptions
+ * ==============
+ *
+ * EL1 exceptions can happen on any instruction including instructions in
+ * the frame pointer prolog or epilog. Depending on where exactly they happen,
+ * they could render the stack trace unreliable.
+ *
+ * If an EL1 exception frame is found on the stack, mark the stack trace as
+ * unreliable. Now, the EL1 exception frame is not at any well-known offset
+ * on the stack. It can be anywhere on the stack. In order to properly detect
+ * an EL1 exception frame, the return address must be checked against all of
+ * the 

[RFC PATCH v2 1/4] arm64: Implement infrastructure for stack trace reliability checks

2021-04-05 Thread madvenka
From: "Madhavan T. Venkataraman" 

Implement a check_reliability() function that will contain checks for the
presence of various features and conditions that can render the stack trace
unreliable.

Introduce the first reliability check - If a return PC encountered in a
stack trace is not a valid kernel text address, the stack trace is
considered unreliable. It could be some generated code.

Other reliability checks will be added in the future.

These checks will involve checking the return PC to see if it falls inside
any special functions where the stack trace is considered unreliable.
Implement the infrastructure needed for this.

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/include/asm/stacktrace.h |  2 +
 arch/arm64/kernel/stacktrace.c  | 80 +
 2 files changed, 82 insertions(+)

diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h
index eb29b1fe8255..684f65808394 100644
--- a/arch/arm64/include/asm/stacktrace.h
+++ b/arch/arm64/include/asm/stacktrace.h
@@ -59,6 +59,7 @@ struct stackframe {
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
int graph;
 #endif
+   bool reliable;
 };
 
 extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame);
@@ -169,6 +170,7 @@ static inline void start_backtrace(struct stackframe *frame,
bitmap_zero(frame->stacks_done, __NR_STACK_TYPES);
frame->prev_fp = 0;
frame->prev_type = STACK_TYPE_UNKNOWN;
+   frame->reliable = true;
 }
 
 #endif /* __ASM_STACKTRACE_H */
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index ad20981dfda4..557657d6e6bd 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -18,6 +18,84 @@
 #include 
 #include 
 
+struct function_range {
+   unsigned long   start;
+   unsigned long   end;
+};
+
+/*
+ * Special functions where the stack trace is unreliable.
+ */
+static struct function_range   special_functions[] = {
+   { /* sentinel */ }
+};
+
+static bool is_reliable_function(unsigned long pc)
+{
+   static bool inited = false;
+   struct function_range *func;
+
+   if (!inited) {
+   static char sym[KSYM_NAME_LEN];
+   unsigned long size, offset;
+
+   for (func = special_functions; func->start; func++) {
+   if (kallsyms_lookup(func->start, &size, &offset,
+   NULL, sym)) {
+   func->start -= offset;
+   func->end = func->start + size;
+   } else {
+   /*
+* This is just a label. So, we only need to
+* consider that particular location, and size
+* is the size of one AArch64 instruction.
+*/
+   func->end = func->start + 4;
+   }
+   }
+   inited = true;
+   }
+
+   for (func = special_functions; func->start; func++) {
+   if (pc >= func->start && pc < func->end)
+   return false;
+   }
+   return true;
+}
+
+/*
+ * Check for the presence of features and conditions that render the stack
+ * trace unreliable.
+ *
+ * Once all such cases have been addressed, this function can aid live
+ * patching (and this comment can be removed).
+ */
+static void check_reliability(struct stackframe *frame)
+{
+   /*
+* If the stack trace has already been marked unreliable, just return.
+*/
+   if (!frame->reliable)
+   return;
+
+   /*
+* First, make sure that the return address is a proper kernel text
+* address. A NULL or invalid return address probably means there's
+* some generated code which __kernel_text_address() doesn't know
+* about. Mark the stack trace as not reliable.
+*/
+   if (!__kernel_text_address(frame->pc)) {
+   frame->reliable = false;
+   return;
+   }
+
+   /*
+* Check the reliability of the return PC's function.
+*/
+   if (!is_reliable_function(frame->pc))
+   frame->reliable = false;
+}
+
 /*
  * AArch64 PCS assigns the frame pointer to x29.
  *
@@ -108,6 +186,8 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame)
 
frame->pc = ptrauth_strip_insn_pac(frame->pc);
 
+   check_reliability(frame);
+
return 0;
 }
 NOKPROBE_SYMBOL(unwind_frame);
-- 
2.25.1



[RFC PATCH v2 0/4] arm64: Implement stack trace reliability checks

2021-04-05 Thread madvenka
From: "Madhavan T. Venkataraman" 

There are a number of places in kernel code where the stack trace is not
reliable. Enhance the unwinder to check for those cases and mark the
stack trace as unreliable. Once all of the checks are in place, the unwinder
can provide a reliable stack trace. But before this can be used for livepatch,
some other entity needs to guarantee that the frame pointers are all set up
correctly in kernel functions. objtool is currently being worked on to
fill that gap.

Except for the return address check, all the other checks involve checking
the return PC of every frame against certain kernel functions. To do this,
implement some infrastructure code:

- Define a special_functions[] array and populate the array with
  the special functions

- Using kallsyms_lookup(), lookup the symbol table entries for the
  functions and record their address ranges

- Define an is_reliable_function(pc) to match a return PC against
  the special functions.

The unwinder calls is_reliable_function(pc) for every return PC and marks
the stack trace as reliable or unreliable accordingly.
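
As a usage illustration only (this helper and its callers are not part of
these patches; names other than unwind_frame(), start_backtrace() and the
reliable flag are assumptions), a reliable-stacktrace consumer such as
livepatch could walk a blocked task and bail out as soon as any frame is
flagged unreliable:

	/*
	 * Sketch: walk tsk's stack, feeding each return PC to consume(),
	 * and fail if the unwinder marks any frame unreliable. Final-frame
	 * termination handling (from the companion series) is omitted for
	 * brevity.
	 */
	static int walk_reliable_stack(struct task_struct *tsk,
				       bool (*consume)(void *cookie,
						       unsigned long pc),
				       void *cookie)
	{
		struct stackframe frame;

		start_backtrace(&frame, thread_saved_fp(tsk),
				thread_saved_pc(tsk));

		do {
			if (!frame.reliable)
				return -EINVAL;
			if (!consume(cookie, frame.pc))
				return -EINVAL;
		} while (unwind_frame(tsk, &frame) == 0);

		return 0;
	}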

Return address check
====================

Check the return PC of every stack frame to make sure that it is a valid
kernel text address (and not some generated code, for example).

Detect EL1 exception frame
==========================

EL1 exceptions can happen on any instruction including instructions in
the frame pointer prolog or epilog. Depending on where exactly they happen,
they could render the stack trace unreliable.

Add all of the EL1 exception handlers to special_functions[].

- el1_sync()
- el1_irq()
- el1_error()
- el1_sync_invalid()
- el1_irq_invalid()
- el1_fiq_invalid()
- el1_error_invalid()

Detect ftrace frame
===================

When FTRACE executes at the beginning of a traced function, it creates two
frames and calls the tracer function:

- One frame for the traced function

- One frame for the caller of the traced function

That gives a sensible stack trace while executing in the tracer function.
When FTRACE returns to the traced function, the frames are popped and
everything is back to normal.

However, in cases like live patch, the tracer function redirects execution
to a different function. When FTRACE returns, control will go to that target
function. A stack trace taken in the tracer function will not show the target
function. The target function is the real function that we want to track.
So, the stack trace is unreliable.

To detect stack traces from a tracer function, add the following to
special_functions[]:

- ftrace_call + 4

ftrace_call is the label at which the tracer function is patched in. So,
ftrace_call + 4 is its return address. This is what will show up in a
stack trace taken from the tracer function.

When Function Graph Tracing is on, ftrace_graph_caller is patched in
at the label ftrace_graph_call. If a tracer function called before it has
redirected execution as mentioned above, the stack traces taken from within
ftrace_graph_caller will also be unreliable for the same reason as mentioned
above. So, add ftrace_graph_caller to special_functions[] as well.

Also, the Function Graph Tracer modifies the return address of a traced
function to a return trampoline (return_to_handler()) to gather tracing
data on function return. Stack traces taken from the traced function and
functions it calls will not show the original caller of the traced function.
The unwinder handles this case by getting the original caller from FTRACE.

However, stack traces taken from the trampoline itself and functions it calls
are unreliable as the original return address may not be available in
that context. This is because the trampoline calls FTRACE to gather trace
data as well as to obtain the actual return address and FTRACE discards the
record of the original return address along the way.

Add return_to_handler() to special_functions[].
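
Putting the FTRACE-related checks above together, the special_functions[]
entries would look roughly like the sketch below, written in the style of
the array used in these patches; the exact #ifdef nesting in the real diff
may differ:

#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
	/* Return address of the patched-in tracer call (ftrace_call + 4). */
	{ (unsigned long) ftrace_call + 4, 0 },
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
	/* Patched in at ftrace_graph_call when graph tracing is enabled. */
	{ (unsigned long) ftrace_graph_caller, 0 },
#endif
#endif

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
	/* Return trampoline used by the Function Graph Tracer. */
	{ (unsigned long) return_to_handler, 0 },
#endif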

Check for kretprobe
===

For functions with a kretprobe set up, probe code executes on entry
to the function and replaces the return address in the stack frame with a
kretprobe trampoline. Whenever the function returns, control is
transferred to the trampoline. The trampoline eventually returns to the
original return address.

A stack trace taken while executing in the function (or in functions that
get called from the function) will not show the original return address.
Similarly, a stack trace taken while executing in the trampoline itself
(and functions that get called from the trampoline) will not show the
original return address. This means that the caller of the probed function
will not show. This makes the stack trace unreliable.

Add the kretprobe trampoline to special_functions[].

Optprobes
=========

Optprobes may be implemented in the future for arm64. For optprobes,
the relevant trampoline(s) can be added to special_functions[].

[RFC PATCH v2 1/1] arm64: Implement stack trace termination record

2021-04-01 Thread madvenka
From: "Madhavan T. Venkataraman" 

Reliable stacktracing requires that we identify when a stacktrace is
terminated early. We can do this by ensuring all tasks have a final
frame record at a known location on their task stack, and checking
that this is the final frame record in the chain.

Kernel Tasks
============

All tasks except the idle task have a pt_regs structure right after the
task stack. This is called the task pt_regs. The pt_regs structure has a
special stackframe field. Make this stackframe field the final frame in the
task stack. This needs to be done in copy_thread() which initializes a new
task's pt_regs and initial CPU context.

For the idle task, there is no task pt_regs. For our purpose, we need one.
So, create a pt_regs just like other kernel tasks and make
pt_regs->stackframe the final frame in the idle task stack. This needs to be
done at two places:

- On the primary CPU, the boot task runs. It calls start_kernel()
  and eventually becomes the idle task for the primary CPU. Just
  before start_kernel() is called, set up the final frame.

- On each secondary CPU, a startup task runs that calls
  secondary_start_kernel() and eventually becomes the idle task
  on the secondary CPU. Just before secondary_start_kernel() is
  called, set up the final frame.

User Tasks
==========

User tasks are initially set up like kernel tasks when they are created.
Then, they return to userland after fork via ret_from_fork(). After that,
they enter the kernel only on an EL0 exception. (In arm64, system calls are
also EL0 exceptions). The EL0 exception handler stores state in the task
pt_regs and calls different functions based on the type of exception. The
stack trace for an EL0 exception must end at the task pt_regs. So, make
task pt_regs->stackframe the final frame in the EL0 exception stack.

In summary, task pt_regs->stackframe is where a successful stack trace ends.

Stack trace termination
=======================

In the unwinder, terminate the stack trace successfully when
task_pt_regs(task)->stackframe is reached. For stack traces in the kernel,
this will correctly terminate the stack trace at the right place.

However, debuggers terminate the stack trace when FP == 0. In the
pt_regs->stackframe, the PC is 0 as well. So, stack traces taken in the
debugger may print an extra record 0x0 at the end. While this is not
pretty, it does no harm. This is a small price to pay for having
reliable stack trace termination in the kernel.

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/kernel/entry.S  |  8 +---
 arch/arm64/kernel/head.S   | 29 +++--
 arch/arm64/kernel/process.c|  5 +
 arch/arm64/kernel/stacktrace.c | 10 +-
 4 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index a31a0a713c85..e2dc2e998934 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -261,16 +261,18 @@ alternative_else_nop_endif
stp lr, x21, [sp, #S_LR]
 
/*
-* For exceptions from EL0, terminate the callchain here.
+* For exceptions from EL0, terminate the callchain here at
+* task_pt_regs(current)->stackframe.
+*
 * For exceptions from EL1, create a synthetic frame record so the
 * interrupted code shows up in the backtrace.
 */
.if \el == 0
-   mov x29, xzr
+   stp xzr, xzr, [sp, #S_STACKFRAME]
.else
stp x29, x22, [sp, #S_STACKFRAME]
-   add x29, sp, #S_STACKFRAME
.endif
+   add x29, sp, #S_STACKFRAME
 
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
 alternative_if_not ARM64_HAS_PAN
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 840bda1869e9..743c019a42c7 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -393,6 +393,23 @@ SYM_FUNC_START_LOCAL(__create_page_tables)
ret x28
 SYM_FUNC_END(__create_page_tables)
 
+   /*
+* The boot task becomes the idle task for the primary CPU. The
+* CPU startup task on each secondary CPU becomes the idle task
+* for the secondary CPU.
+*
+* The idle task does not require pt_regs. But create a dummy
+* pt_regs so that task_pt_regs(idle_task)->stackframe can be
+* set up to be the final frame on the idle task stack just like
+* all the other kernel tasks. This helps the unwinder to
+* terminate the stack trace at a well-known stack offset.
+*/
+   .macro setup_final_frame
+   sub sp, sp, #PT_REGS_SIZE
+   stp xzr, xzr, [sp, #S_STACKFRAME]
+   add x29, sp, #S_STACKFRAME
+   .endm
+
 /*
  * The following fragment of code is executed with the MMU enabled.
  *
@@ -447,9 +464,9 @@ SYM_FUNC_START_LOCAL(__primary_switched)
 #endif
bl  switch_to_vhe   // Prefer 

[RFC PATCH v2 0/1] arm64: Implement stack trace termination record

2021-04-01 Thread madvenka
From: "Madhavan T. Venkataraman" 

Reliable stacktracing requires that we identify when a stacktrace is
terminated early. We can do this by ensuring all tasks have a final
frame record at a known location on their task stack, and checking
that this is the final frame record in the chain.

All tasks have a pt_regs structure right after the task stack in the stack
page. The pt_regs structure contains a stackframe field. Make this stackframe
field the final frame in the task stack so all stack traces end at a fixed
stack offset.

For kernel tasks, this is simple to understand. For user tasks, there is
some extra detail. User tasks get created via fork() et al. Once they return
from fork, they enter the kernel only on an EL0 exception. In arm64,
system calls are also EL0 exceptions.

The EL0 exception handler uses the task pt_regs mentioned above to save
register state and call different exception functions. All stack traces
from EL0 exception code must end at the pt_regs. So, make pt_regs->stackframe
the final frame in the EL0 exception stack.

To summarize, task_pt_regs(task)->stackframe will always be the final frame
in a stack trace.

Sample stack traces
===================

The final frame for the idle tasks is different from v1. The rest of the
stack traces are the same.

Primary CPU's idle task (changed from v1)
=========================================

[0.022365]   arch_stack_walk+0x0/0xd0
[0.022376]   callfd_stack+0x30/0x60
[0.022387]   rest_init+0xd8/0xf8
[0.022397]   arch_call_rest_init+0x18/0x24
[0.022411]   start_kernel+0x5b8/0x5f4
[0.022424]   __primary_switched+0xa8/0xac

Secondary CPU's idle task (changed from v1)
===========================================

[0.022484]   arch_stack_walk+0x0/0xd0
[0.022494]   callfd_stack+0x30/0x60
[0.022502]   secondary_start_kernel+0x188/0x1e0
[0.022513]   __secondary_switched+0x80/0x84

---
Changelog:

v1
- Set up task_pt_regs(current)->stackframe as the final frame
  when a new task is initialized in copy_thread().

- Create pt_regs for the idle tasks and set up pt_regs->stackframe
  as the final frame for the idle tasks.

- Set up task_pt_regs(current)->stackframe as the final frame in
  the EL0 exception handler so the EL0 exception stack trace ends
  there.

- Terminate the stack trace successfully in unwind_frame() when
  the FP reaches task_pt_regs(current)->stackframe.

- The stack traces (above) in the kernel will terminate at the
  correct place. Debuggers may show an extra record 0x0 at the end
  for pt_regs->stackframe. That said, I did not see that extra frame
  when I did stack traces using gdb.
v2
- Changed some wordings as suggested by Mark Rutland.

- Removed the synthetic return PC for idle tasks. Changed the
  branches to start_kernel() and secondary_start_kernel() to
  calls so that they will have a proper return PC.

Madhavan T. Venkataraman (1):
  arm64: Implement stack trace termination record

 arch/arm64/kernel/entry.S  |  8 +---
 arch/arm64/kernel/head.S   | 29 +++--
 arch/arm64/kernel/process.c|  5 +
 arch/arm64/kernel/stacktrace.c | 10 +-
 4 files changed, 38 insertions(+), 14 deletions(-)


base-commit: 0d02ec6b3136c73c09e7859f0d0e4e2c4c07b49b
-- 
2.25.1



[RFC PATCH v1 4/4] arm64: Mark stack trace as unreliable if kretprobed functions are present

2021-03-30 Thread madvenka
From: "Madhavan T. Venkataraman" 

When a kretprobe is active for a function, the function's return address
in its stack frame is modified to point to the kretprobe trampoline. When
the function returns, the frame is popped and control is transferred
to the trampoline. The trampoline eventually returns to the original return
address.

If a stack walk is done within the function (or any functions that get
called from there), the stack trace will only show the trampoline and
not the original caller.

Also, if the trampoline itself and the functions it calls do a stack trace,
that stack trace will also have the same problem. Detect this as well.

If the trampoline is detected in the stack trace, mark the stack trace
as unreliable.

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/kernel/stacktrace.c | 37 ++
 1 file changed, 37 insertions(+)

diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 8b493a90c9f3..bf5abb0dd876 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -97,6 +97,36 @@ struct function_range {
  * if return_to_handler is detected on the stack.
  *
  * NOTE: The unwinder must do (1) before (2).
+ *
+ * KPROBES
+ * =======
+ *
+ * There are two types of kprobes:
+ *
+ * (1) Regular kprobes that are placed anywhere in a probed function.
+ * This is implemented by replacing the probed instruction with a
+ * breakpoint. When the breakpoint is hit, the kprobe code emulates
+ * the original instruction in-situ and returns to the next
+ * instruction.
+ *
+ * Breakpoints are EL1 exceptions. When the unwinder detects them,
+ * the stack trace is marked as unreliable as it does not know where
+ * exactly the exception happened. Detection of EL1 exceptions in
+ * a stack trace will be done separately.
+ *
+ * (2) Return kprobes that are placed on the return of a probed function.
+ * In this case, Kprobes sets up an initial breakpoint at the
+ * beginning of the probed function. When the breakpoint is hit,
+ * Kprobes replaces the return address in the stack frame with the
+ * kretprobe_trampoline and records the original return address.
+ * When the probed function returns, control goes to the trampoline
+ * which eventually returns to the original return address.
+ *
+ * Stack traces taken while in the probed function or while in the
+ * trampoline will show kretprobe_trampoline instead of the original
+ * return address. Detect this and mark the stack trace unreliable.
+ * The detection is done by checking if the return PC falls anywhere
+ * in kretprobe_trampoline.
  */
 static struct function_range   special_functions[] = {
/*
@@ -121,6 +151,13 @@ static struct function_range   special_functions[] = {
 #endif
 #endif
 
+   /*
+* Kprobe trampolines.
+*/
+#ifdef CONFIG_KRETPROBES
+   { (unsigned long) kretprobe_trampoline, 0 },
+#endif
+
{ 0, 0 }
 };
 
-- 
2.25.1



[RFC PATCH v1 2/4] arm64: Mark a stack trace unreliable if an EL1 exception frame is detected

2021-03-30 Thread madvenka
From: "Madhavan T. Venkataraman" 

EL1 exceptions can happen on any instruction including instructions in
the frame pointer prolog or epilog. Depending on where exactly they happen,
they could render the stack trace unreliable.

If an EL1 exception frame is found on the stack, mark the stack trace as
unreliable. Now, the EL1 exception frame is not at any well-known offset
on the stack. It can be anywhere on the stack. In order to properly detect
an EL1 exception frame, the return address must be checked against all of
the possible EL1 exception handlers.

Preemption
==========

Interrupts encountered in kernel code are also EL1 exceptions. At the end
of an interrupt, the interrupt handler checks if the current task must be
preempted for any reason. If so, it calls the preemption code which takes
the task off the CPU. A stack trace taken on the task after the preemption
will show the EL1 frame and will be considered unreliable. This is correct
behavior as preemption can happen practically at any point in code.

Probing
=======

Breakpoints encountered in kernel code are also EL1 exceptions. The probing
infrastructure uses breakpoints for executing probe code. While in the probe
code, the stack trace will show an EL1 frame and will be considered
unreliable. This is also correct behavior.

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/include/asm/exception.h |  8 +++
 arch/arm64/kernel/entry.S  | 14 +--
 arch/arm64/kernel/stacktrace.c | 37 ++
 3 files changed, 52 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h
index 6546158d2f2d..4ebd2390ef54 100644
--- a/arch/arm64/include/asm/exception.h
+++ b/arch/arm64/include/asm/exception.h
@@ -35,6 +35,14 @@ asmlinkage void el1_sync_handler(struct pt_regs *regs);
 asmlinkage void el0_sync_handler(struct pt_regs *regs);
 asmlinkage void el0_sync_compat_handler(struct pt_regs *regs);
 
+asmlinkage void el1_sync(void);
+asmlinkage void el1_irq(void);
+asmlinkage void el1_error(void);
+asmlinkage void el1_sync_invalid(void);
+asmlinkage void el1_irq_invalid(void);
+asmlinkage void el1_fiq_invalid(void);
+asmlinkage void el1_error_invalid(void);
+
 asmlinkage void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs);
 asmlinkage void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs);
 asmlinkage void enter_from_user_mode(void);
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index a31a0a713c85..9fe3aaeff019 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -630,19 +630,19 @@ SYM_CODE_START_LOCAL(el0_fiq_invalid_compat)
 SYM_CODE_END(el0_fiq_invalid_compat)
 #endif
 
-SYM_CODE_START_LOCAL(el1_sync_invalid)
+SYM_CODE_START(el1_sync_invalid)
inv_entry 1, BAD_SYNC
 SYM_CODE_END(el1_sync_invalid)
 
-SYM_CODE_START_LOCAL(el1_irq_invalid)
+SYM_CODE_START(el1_irq_invalid)
inv_entry 1, BAD_IRQ
 SYM_CODE_END(el1_irq_invalid)
 
-SYM_CODE_START_LOCAL(el1_fiq_invalid)
+SYM_CODE_START(el1_fiq_invalid)
inv_entry 1, BAD_FIQ
 SYM_CODE_END(el1_fiq_invalid)
 
-SYM_CODE_START_LOCAL(el1_error_invalid)
+SYM_CODE_START(el1_error_invalid)
inv_entry 1, BAD_ERROR
 SYM_CODE_END(el1_error_invalid)
 
@@ -650,7 +650,7 @@ SYM_CODE_END(el1_error_invalid)
  * EL1 mode handlers.
  */
.align  6
-SYM_CODE_START_LOCAL_NOALIGN(el1_sync)
+SYM_CODE_START_NOALIGN(el1_sync)
kernel_entry 1
mov x0, sp
bl  el1_sync_handler
@@ -658,7 +658,7 @@ SYM_CODE_START_LOCAL_NOALIGN(el1_sync)
 SYM_CODE_END(el1_sync)
 
.align  6
-SYM_CODE_START_LOCAL_NOALIGN(el1_irq)
+SYM_CODE_START_NOALIGN(el1_irq)
kernel_entry 1
gic_prio_irq_setup pmr=x20, tmp=x1
enable_da_f
@@ -737,7 +737,7 @@ el0_irq_naked:
b   ret_to_user
 SYM_CODE_END(el0_irq)
 
-SYM_CODE_START_LOCAL(el1_error)
+SYM_CODE_START(el1_error)
kernel_entry 1
mrs x1, esr_el1
gic_prio_kentry_setup tmp=x2
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index ff35b3953c39..7662f57d3e88 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -14,6 +14,7 @@
 #include 
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -25,8 +26,44 @@ struct function_range {
 
 /*
  * Special functions where the stack trace is unreliable.
+ *
+ * EL1 exceptions
+ * ==============
+ *
+ * EL1 exceptions can happen on any instruction including instructions in
+ * the frame pointer prolog or epilog. Depending on where exactly they happen,
+ * they could render the stack trace unreliable.
+ *
+ * If an EL1 exception frame is found on the stack, mark the stack trace as
+ * unreliable. Now, the EL1 exception frame is not at any well-known offset
+ * on the stack. It can be anywhere on the stack. In order to properly detect
+ * an EL1 exception frame, the return address must be checked against all of
+ * the possible EL1 exception 

[RFC PATCH v1 3/4] arm64: Detect FTRACE cases that make the stack trace unreliable

2021-03-30 Thread madvenka
From: "Madhavan T. Venkataraman" 

When CONFIG_DYNAMIC_FTRACE_WITH_REGS is enabled and tracing is activated
for a function, the ftrace infrastructure is called for the function at
the very beginning. Ftrace creates two frames:

- One for the traced function

- One for the caller of the traced function

That gives a reliable stack trace while executing in the ftrace code. When
ftrace returns to the traced function, the frames are popped and everything
is back to normal.

However, in cases like live patch, a tracer function may redirect execution
to a different function when it returns. A stack trace taken while still in
the tracer function will not show the target function. The target function
is the real function that we want to track.

So, if an FTRACE frame is detected on the stack, just mark the stack trace
as unreliable. The detection is done by checking the return PC against
FTRACE trampolines.

Also, the Function Graph Tracer modifies the return address of a traced
function to a return trampoline to gather tracing data on function return.
Stack traces taken from that trampoline and functions it calls are
unreliable as the original return address may not be available in
that context. Mark the stack trace unreliable accordingly.

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/kernel/entry-ftrace.S | 10 ++
 arch/arm64/kernel/stacktrace.c   | 57 
 2 files changed, 67 insertions(+)

diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S
index b3e4f9a088b1..5373f88b4c44 100644
--- a/arch/arm64/kernel/entry-ftrace.S
+++ b/arch/arm64/kernel/entry-ftrace.S
@@ -95,6 +95,16 @@ SYM_CODE_START(ftrace_common)
 SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
bl  ftrace_stub
 
+   /*
+* The only call in the FTRACE trampoline code is above. The above
+* instruction is patched to call a tracer function. Its return
+* address is below (ftrace_graph_call). In a stack trace taken from
+* a tracer function, ftrace_graph_call() will show. The unwinder
+* checks this for reliable stack trace. Please see the comments
+* in stacktrace.c. If another call is added in the FTRACE
+* trampoline code, the special_functions[] array in stacktrace.c
+* must be updated.
+*/
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) // ftrace_graph_caller();
nop // If enabled, this will be replaced
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 7662f57d3e88..8b493a90c9f3 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -51,6 +51,52 @@ struct function_range {
  * unreliable. Breakpoints are used for executing probe code. Stack traces
  * taken while in the probe code will show an EL1 frame and will be considered
  * unreliable. This is correct behavior.
+ *
+ * FTRACE
+ * ======
+ *
+ * When CONFIG_DYNAMIC_FTRACE_WITH_REGS is enabled, the FTRACE trampoline code
+ * is called from a traced function even before the frame pointer prolog.
+ * FTRACE sets up two stack frames (one for the traced function and one for
+ * its caller) so that the unwinder can provide a sensible stack trace for
+ * any tracer function called from the FTRACE trampoline code.
+ *
+ * There are two cases where the stack trace is not reliable.
+ *
+ * (1) The task gets preempted before the two frames are set up. Preemption
+ * involves an interrupt which is an EL1 exception. The unwinder already
+ * handles EL1 exceptions.
+ *
+ * (2) The tracer function that gets called by the FTRACE trampoline code
+ * changes the return PC (e.g., livepatch).
+ *
+ * Not all tracer functions do that. But to err on the side of safety,
+ * consider the stack trace as unreliable in all cases.
+ *
+ * When Function Graph Tracer is used, FTRACE modifies the return address of
+ * the traced function in its stack frame to an FTRACE return trampoline
+ * (return_to_handler). When the traced function returns, control goes to
+ * return_to_handler. return_to_handler calls FTRACE to gather tracing data
+ * and to obtain the original return address. Then, return_to_handler returns
+ * to the original return address.
+ *
+ * There are two cases to consider from a stack trace reliability point of
+ * view:
+ *
+ * (1) Stack traces taken within the traced function (and functions that get
+ * called from there) will show return_to_handler instead of the original
+ * return address. The original return address can be obtained from FTRACE.
+ * The unwinder already obtains it and modifies the return PC in its copy
+ * of the stack frame to the original return address. So, this is handled.
+ *
+ * (2) return_to_handler calls FTRACE as mentioned before. FTRACE discards
+ * the record of the original return address along the way as it does not
+ * need to 

[RFC PATCH v1 1/4] arm64: Implement infrastructure for stack trace reliability checks

2021-03-30 Thread madvenka
From: "Madhavan T. Venkataraman" 

Implement a check_reliability() function that will contain checks for the
presence of various features and conditions that can render the stack trace
unreliable.

Introduce the first reliability check - If a return PC encountered in a
stack trace is not a valid kernel text address, the stack trace is
considered unreliable. It could be some generated code.

Other reliability checks will be added in the future.

These checks will involve checking the return PC to see if it falls inside
any special functions where the stack trace is considered unreliable.
Implement the infrastructure needed for this.

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/include/asm/stacktrace.h |  2 +
 arch/arm64/kernel/stacktrace.c  | 80 +
 2 files changed, 82 insertions(+)

diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h
index eb29b1fe8255..684f65808394 100644
--- a/arch/arm64/include/asm/stacktrace.h
+++ b/arch/arm64/include/asm/stacktrace.h
@@ -59,6 +59,7 @@ struct stackframe {
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
int graph;
 #endif
+   bool reliable;
 };
 
 extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame);
@@ -169,6 +170,7 @@ static inline void start_backtrace(struct stackframe *frame,
bitmap_zero(frame->stacks_done, __NR_STACK_TYPES);
frame->prev_fp = 0;
frame->prev_type = STACK_TYPE_UNKNOWN;
+   frame->reliable = true;
 }
 
 #endif /* __ASM_STACKTRACE_H */
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index ad20981dfda4..ff35b3953c39 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -18,6 +18,84 @@
 #include 
 #include 
 
+struct function_range {
+   unsigned long   start;
+   unsigned long   end;
+};
+
+/*
+ * Special functions where the stack trace is unreliable.
+ */
+static struct function_range   special_functions[] = {
+   { 0, 0 }
+};
+
+static bool is_reliable_function(unsigned long pc)
+{
+   static bool inited = false;
+   struct function_range *func;
+
+   if (!inited) {
+   static char sym[KSYM_NAME_LEN];
+   unsigned long size, offset;
+
+   for (func = special_functions; func->start; func++) {
+   if (kallsyms_lookup(func->start, &size, &offset,
+   NULL, sym)) {
+   func->start -= offset;
+   func->end = func->start + size;
+   } else {
+   /*
+* This is just a label. So, we only need to
+* consider that particular location, and size
+* is the size of one AArch64 instruction.
+*/
+   func->end = func->start + 4;
+   }
+   }
+   inited = true;
+   }
+
+   for (func = special_functions; func->start; func++) {
+   if (pc >= func->start && pc < func->end)
+   return false;
+   }
+   return true;
+}
+
+/*
+ * Check for the presence of features and conditions that render the stack
+ * trace unreliable.
+ *
+ * Once all such cases have been addressed, this function can aid live
+ * patching (and this comment can be removed).
+ */
+static void check_reliability(struct stackframe *frame)
+{
+   /*
+* If the stack trace has already been marked unreliable, just return.
+*/
+   if (!frame->reliable)
+   return;
+
+   /*
+* First, make sure that the return address is a proper kernel text
+* address. A NULL or invalid return address probably means there's
+* some generated code which __kernel_text_address() doesn't know
+* about. Mark the stack trace as not reliable.
+*/
+   if (!__kernel_text_address(frame->pc)) {
+   frame->reliable = false;
+   return;
+   }
+
+   /*
+* Check the reliability of the return PC's function.
+*/
+   if (!is_reliable_function(frame->pc))
+   frame->reliable = false;
+}
+
 /*
  * AArch64 PCS assigns the frame pointer to x29.
  *
@@ -108,6 +186,8 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame)
 
frame->pc = ptrauth_strip_insn_pac(frame->pc);
 
+   check_reliability(frame);
+
return 0;
 }
 NOKPROBE_SYMBOL(unwind_frame);
-- 
2.25.1



[RFC PATCH v1 0/4] arm64: Implement stack trace reliability checks

2021-03-30 Thread madvenka
From: "Madhavan T. Venkataraman" 

There are a number of places in kernel code where the stack trace is not
reliable. Enhance the unwinder to check for those cases and mark the
stack trace as unreliable. Once all of the checks are in place, the unwinder
can be used for livepatching.

Except for the return address check, all the other checks involve checking
the return PC of every frame against certain kernel functions. To do this,
implement some infrastructure code:

- Define a special_functions[] array and populate the array with
  the special functions

- Using kallsyms_lookup(), lookup the symbol table entries for the
  functions and record their address ranges

- Define an is_reliable_function(pc) to match a return PC against
  the special functions.

The unwinder calls is_reliable_function(pc) for every return PC and marks
the stack trace as reliable or unreliable accordingly.

Return address check


Check the return PC of every stack frame to make sure that it is a valid
kernel text address (and not some generated code, for example).

Detect EL1 exception frame
==

EL1 exceptions can happen on any instruction including instructions in
the frame pointer prolog or epilog. Depending on where exactly they happen,
they could render the stack trace unreliable.

Add all of the EL1 exception handlers to special_functions[].

- el1_sync()
- el1_irq()
- el1_error()
- el1_sync_invalid()
- el1_irq_invalid()
- el1_fiq_invalid()
- el1_error_invalid()

Interrupts are EL1 exceptions. When a task is preempted, the preempt
interrupt EL1 frame will show on the stack and the stack trace is
considered unreliable. This is correct behavior as preemption can
happen anywhere.

Breakpoints are EL1 exceptions and can happen anywhere. Stack traces
taken from within the breakpoint handler are, therefore, unreliable.
This includes KProbe code that gets called from the breakpoint handler.

Mark Rutland wanted me to send the EL1 checks in a separate patch series
because the exception handling code is being reorganized. But the
infrastructure code is common to the EL1 detection and other cases listed
below. I was not entirely sure how to neatly split the patches.

Besides, all this patch does is include the EL1 exception handlers in
special_functions[]. When the names change because of the code reorg,
this array can simply be edited. So, in the interest of getting review
comments on this EL1 related work, I have included it in this patch
series.

Hope this is ok.

Detect ftrace frame
===

When FTRACE executes at the beginning of a traced function, it creates two
frames and calls the tracer function:

- One frame for the traced function

- One frame for the caller of the traced function

That gives a sensible stack trace while executing in the tracer function.
When FTRACE returns to the traced function, the frames are popped and
everything is back to normal.

However, in cases like live patch, the tracer function redirects execution
to a different function. When FTRACE returns, control will go to that target
function. A stack trace taken in the tracer function will not show the target
function. The target function is the real function that we want to track.
So, the stack trace is unreliable.

To detect FTRACE in a stack trace, add the following to special_functions[]:

- ftrace_graph_call()
- ftrace_graph_caller()

Please see the diff for a comment that explains why ftrace_graph_call()
must be checked.

Also, the Function Graph Tracer modifies the return address of a traced
function to a return trampoline (return_to_handler()) to gather tracing
data on function return. Stack traces taken from the traced function and
functions it calls will not show the original caller of the traced function.
The unwinder handles this case by getting the original caller from FTRACE.

However, stack traces taken from the trampoline itself and functions it calls
are unreliable as the original return address may not be available in
that context. This is because the trampoline calls FTRACE to gather trace
data as well as to obtain the actual return address and FTRACE discards the
record of the original return address along the way.

Add return_to_handler() to special_functions[].
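
To make the infrastructure concrete, here is a rough sketch (not part of
any posted patch) of how special_functions[] could be populated with the
functions listed above. The extern declarations are for illustration only
(the real symbols live in assembly and ftrace code and may need different
declarations or CONFIG guards); the end addresses are left at 0 and are
filled in later via kallsyms_lookup() as described.

extern void el1_sync(void);
extern void el1_irq(void);
extern void el1_error(void);
extern void el1_sync_invalid(void);
extern void el1_irq_invalid(void);
extern void el1_fiq_invalid(void);
extern void el1_error_invalid(void);
extern void ftrace_graph_call(void);
extern void ftrace_graph_caller(void);
extern void return_to_handler(void);

static struct function_range special_functions[] = {
        /* EL1 exception handlers */
        { (unsigned long)el1_sync,              0 },
        { (unsigned long)el1_irq,               0 },
        { (unsigned long)el1_error,             0 },
        { (unsigned long)el1_sync_invalid,      0 },
        { (unsigned long)el1_irq_invalid,       0 },
        { (unsigned long)el1_fiq_invalid,       0 },
        { (unsigned long)el1_error_invalid,     0 },
        /* ftrace */
        { (unsigned long)ftrace_graph_call,     0 },
        { (unsigned long)ftrace_graph_caller,   0 },
        { (unsigned long)return_to_handler,     0 },
        { 0, 0 }        /* sentinel */
};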

Check for kretprobe
===

For functions with a kretprobe set up, probe code executes on entry
to the function and replaces the return address in the stack frame with a
kretprobe trampoline. Whenever the function returns, control is
transferred to the trampoline. The trampoline eventually returns to the
original return address.

A stack trace taken while executing in the function (or in functions that
get called from the function) will not show the original return address.
Similarly, a stack trace taken while executing in the trampoline itself
(and functions that get 

[RFC PATCH v1 0/1] arm64: Implement stack trace termination record

2021-03-24 Thread madvenka
From: "Madhavan T. Venkataraman" 

The unwinder needs to be able to reliably tell when it has reached the end
of a stack trace. One way to do this is to have the last stack frame at a
fixed offset from the base of the task stack. When the unwinder reaches
that offset, it knows it is done.

All tasks have a pt_regs structure right after the task stack in the stack
page. The pt_regs structure contains a stackframe field. Make this stackframe
field the last frame in the task stack so all stack traces end at a fixed
stack offset.

For kernel tasks, this is simple to understand. For user tasks, there is
some extra detail. User tasks get created via fork() et al. Once they return
from fork, they enter the kernel only on an EL0 exception. In arm64,
system calls are also EL0 exceptions.

The EL0 exception handler uses the task pt_regs mentioned above to save
register state and call different exception functions. All stack traces
from EL0 exception code must end at the pt_regs. So, make pt_regs->stackframe
the last frame in the EL0 exception stack.

To summarize, task_pt_regs(task)->stackframe will always be the stack
termination record.
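
As a rough illustration, the corresponding unwinder check could look
something like the following sketch (the actual change is in the patch
itself and may differ in detail):

        /* Terminate the walk at the task's final frame record. */
        if (frame->fp == (unsigned long)task_pt_regs(tsk)->stackframe)
                return -ENOENT; /* successful termination */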

Sample stack traces
===

These stack traces were taken using a test driver called callfd from
certain locations.

Primary CPU's idle task
===

[0.022932]   arch_stack_walk+0x0/0xd0
[0.022944]   callfd_stack+0x30/0x60
[0.022955]   rest_init+0xd8/0xf8
[0.022968]   arch_call_rest_init+0x18/0x24
[0.022984]   start_kernel+0x5b8/0x5f4
[0.022993]   ret_from_fork+0x0/0x18

Secondary CPU's idle task
=

[0.023043]   arch_stack_walk+0x0/0xd0
[0.023052]   callfd_stack+0x30/0x60
[0.023061]   secondary_start_kernel+0x188/0x1e0
[0.023071]   ret_from_fork+0x0/0x18

Sample kernel thread


[   12.000679]   arch_stack_walk+0x0/0xd0
[   12.007616]   callfd_stack+0x30/0x60
[   12.014347]   kernel_init+0x84/0x148
[   12.021026]   ret_from_fork+0x10/0x18

kernel_clone() system call
==

Showing EL0 exception:

[  364.152827]   arch_stack_walk+0x0/0xd0
[  364.152833]   callfd_stack+0x30/0x60
[  364.152839]   kernel_clone+0x57c/0x590
[  364.152846]   __do_sys_clone+0x58/0x88
[  364.152851]   __arm64_sys_clone+0x28/0x38
[  364.152856]   el0_svc_common.constprop.0+0x70/0x1a8
[  364.152863]   do_el0_svc+0x2c/0x98
[  364.152868]   el0_svc+0x2c/0x70
[  364.152873]   el0_sync_handler+0xb0/0xb8
[  364.152879]   el0_sync+0x178/0x180

Timer interrupt
===

Showing EL1 exception (Interrupt happened on a secondary CPU):

[  364.195456]   arch_stack_walk+0x0/0xd0
[  364.195467]   callfd_stack+0x30/0x60
[  364.195475]   callfd_callback+0x2c/0x38
[  364.195482]   call_timer_fn+0x38/0x180
[  364.195489]   run_timer_softirq+0x43c/0x6b8
[  364.195495]   __do_softirq+0x138/0x37c
[  364.195501]   irq_exit+0xc0/0xe8
[  364.195512]   __handle_domain_irq+0x70/0xc8
[  364.195521]   gic_handle_irq+0xd4/0x2f4
[  364.195527]   el1_irq+0xc0/0x180
[  364.195533]   arch_cpu_idle+0x18/0x40
[  364.195540]   default_idle_call+0x44/0x170
[  364.195548]   do_idle+0x224/0x278
[  364.195567]   cpu_startup_entry+0x2c/0x98
[  364.195573]   secondary_start_kernel+0x198/0x1e0
[  364.195581]   ret_from_fork+0x0/0x18
---
Changelog:

v1
- Set up task_pt_regs(current)->stackframe as the last frame
  when a new task is initialized in copy_thread().

- Create pt_regs for the idle tasks and set up pt_regs->stackframe
  as the last frame for the idle tasks.

- Set up task_pt_regs(current)->stackframe as the last frame in
  the EL0 exception handler so the EL0 exception stack trace ends
  there.

- Terminate the stack trace successfully in unwind_frame() when
  the FP reaches task_pt_regs(current)->stackframe.

- The stack traces (above) in the kernel will terminate at the
  correct place. Debuggers may show an extra record 0x0 at the end
  for pt_regs->stackframe. That said, I did not see that extra frame
  when I did stack traces using gdb.

Madhavan T. Venkataraman (1):
  arm64: Implement stack trace termination record

 arch/arm64/kernel/entry.S  |  8 +---
 arch/arm64/kernel/head.S   | 28 
 arch/arm64/kernel/process.c|  5 +
 arch/arm64/kernel/stacktrace.c |  8 
 4 files changed, 38 insertions(+), 11 deletions(-)


base-commit: 0d02ec6b3136c73c09e7859f0d0e4e2c4c07b49b
-- 
2.25.1



[RFC PATCH v1 1/1] arm64: Implement stack trace termination record

2021-03-24 Thread madvenka
From: "Madhavan T. Venkataraman" 

The unwinder needs to be able to reliably tell when it has reached the end
of a stack trace. One way to do this is to have the last stack frame at a
fixed offset from the base of the task stack. When the unwinder reaches
that offset, it knows it is done.

Kernel Tasks


All tasks except the idle task have a pt_regs structure right after the
task stack. This is called the task pt_regs. The pt_regs structure has a
special stackframe field. Make this stackframe field the last frame in the
task stack. This needs to be done in copy_thread() which initializes a new
task's pt_regs and initial CPU context.

For the idle task, there is no task pt_regs. For our purpose, we need one.
So, create a pt_regs just like other kernel tasks and make
pt_regs->stackframe the last frame in the idle task stack. This needs to be
done at two places:

- On the primary CPU, the boot task runs. It calls start_kernel()
  and eventually becomes the idle task for the primary CPU. Just
  before start_kernel() is called, set up the last frame.

- On each secondary CPU, a startup task runs that calls
  secondary_start_kernel() and eventually becomes the idle task
  on the secondary CPU. Just before secondary_start_kernel() is
  called, set up the last frame.

User Tasks
==

User tasks are initially set up like kernel tasks when they are created.
Then, they return to userland after fork via ret_from_fork(). After that,
they enter the kernel only on an EL0 exception. (In arm64, system calls are
also EL0 exceptions). The EL0 exception handler stores state in the task
pt_regs and calls different functions based on the type of exception. The
stack trace for an EL0 exception must end at the task pt_regs. So, make
task pt_regs->stackframe the last frame in the EL0 exception stack.

In summary, task pt_regs->stackframe is where a successful stack trace ends.

Stack trace termination
===

In the unwinder, terminate the stack trace successfully when
task_pt_regs(task)->stackframe is reached. For stack traces in the kernel,
this will correctly terminate the stack trace at the right place.

However, debuggers terminate the stack trace when FP == 0. In the
pt_regs->stackframe, the PC is 0 as well. So, stack traces taken in the
debugger may print an extra record 0x0 at the end. While this is not
pretty, this does not do any harm. This is a small price to pay for
having reliable stack trace termination in the kernel.

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/kernel/entry.S  |  8 +---
 arch/arm64/kernel/head.S   | 28 
 arch/arm64/kernel/process.c|  5 +
 arch/arm64/kernel/stacktrace.c |  8 
 4 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index a31a0a713c85..e2dc2e998934 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -261,16 +261,18 @@ alternative_else_nop_endif
stp lr, x21, [sp, #S_LR]
 
/*
-* For exceptions from EL0, terminate the callchain here.
+* For exceptions from EL0, terminate the callchain here at
+* task_pt_regs(current)->stackframe.
+*
 * For exceptions from EL1, create a synthetic frame record so the
 * interrupted code shows up in the backtrace.
 */
.if \el == 0
-   mov x29, xzr
+   stp xzr, xzr, [sp, #S_STACKFRAME]
.else
stp x29, x22, [sp, #S_STACKFRAME]
-   add x29, sp, #S_STACKFRAME
.endif
+   add x29, sp, #S_STACKFRAME
 
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
 alternative_if_not ARM64_HAS_PAN
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 840bda1869e9..b8003fb9cfa5 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -393,6 +393,28 @@ SYM_FUNC_START_LOCAL(__create_page_tables)
ret x28
 SYM_FUNC_END(__create_page_tables)
 
+   /*
+* The boot task becomes the idle task for the primary CPU. The
+* CPU startup task on each secondary CPU becomes the idle task
+* for the secondary CPU.
+*
+* The idle task does not require pt_regs. But create a dummy
+* pt_regs so that task_pt_regs(idle_task)->stackframe can be
+* set up to be the last frame on the idle task stack just like
+* all the other kernel tasks. This helps the unwinder to
+* terminate the stack trace at a well-known stack offset.
+*
+* Also, set up the last return PC to be ret_from_fork() just
+* like all the other kernel tasks so that the stack trace of
+* all kernel tasks ends with the same function.
+*/
+   .macro setup_last_frame
+   sub sp, sp, #PT_REGS_SIZE
+   stp xzr, xzr, [sp, #S_STACKFRAME]
+   add x29, sp, #S_STACKFRAME
+   ldr x30, 

[RFC PATCH v2 7/8] arm64: Detect kretprobed functions in stack trace

2021-03-15 Thread madvenka
From: "Madhavan T. Venkataraman" 

When a kretprobe is active for a function, the function's return address
in its stack frame is modified to point to the kretprobe trampoline. When
the function returns, the frame is popped and control is transferred
to the trampoline. The trampoline eventually returns to the original return
address.

If a stack walk is done within the function (or any functions that get
called from there), the stack trace will only show the trampoline and
not the original caller. Detect this and mark the stack trace as unreliable.

Also, if the trampoline and the functions it calls do a stack trace,
that stack trace will also have the same problem. Detect this as well.

This is done by looking up the symbol table entry for the trampoline
and checking if the return PC in a frame falls anywhere in the
trampoline function.

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/kernel/stacktrace.c | 43 ++
 1 file changed, 43 insertions(+)

diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 358aae3906d7..752b77f11c61 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -18,6 +18,26 @@
 #include 
 #include 
 
+#ifdef CONFIG_KRETPROBES
+static bool kretprobe_detected(struct stackframe *frame)
+{
+   static char kretprobe_name[KSYM_NAME_LEN];
+   static unsigned long kretprobe_pc, kretprobe_end_pc;
+   unsigned long pc, offset, size;
+
+   if (!kretprobe_pc) {
+   pc = (unsigned long) kretprobe_trampoline;
+   if (!kallsyms_lookup(pc, &size, &offset, NULL, kretprobe_name))
+   return false;
+
+   kretprobe_pc = pc - offset;
+   kretprobe_end_pc = kretprobe_pc + size;
+   }
+
+   return frame->pc >= kretprobe_pc && frame->pc < kretprobe_end_pc;
+}
+#endif
+
 static void check_if_reliable(unsigned long fp, struct stackframe *frame,
  struct stack_info *info)
 {
@@ -111,6 +131,29 @@ static void check_if_reliable(unsigned long fp, struct 
stackframe *frame,
frame->reliable = false;
return;
}
+
+#ifdef CONFIG_KRETPROBES
+   /*
+* The return address of a function that has an active kretprobe
+* is modified in the stack frame to point to a trampoline. So,
+* the original return address is not available on the stack.
+*
+* A stack trace taken while executing the function (and its
+* descendants) will not show the original caller. So, mark the
+* stack trace as unreliable if the trampoline shows up in the
+* stack trace. (Obtaining the original return address from
+* task->kretprobe_instances seems problematic and not worth the
+* effort).
+*
+* The stack trace taken while inside the trampoline and functions
+* called by the trampoline have the same problem as above. This
+* is also covered by kretprobe_detected() using a range check.
+*/
+   if (kretprobe_detected(frame)) {
+   frame->reliable = false;
+   return;
+   }
+#endif
 }
 
 /*
-- 
2.25.1



[RFC PATCH v2 8/8] arm64: Implement arch_stack_walk_reliable()

2021-03-15 Thread madvenka
From: "Madhavan T. Venkataraman" 

unwind_frame() already sets the reliable flag in the stack frame during
a stack walk to indicate whether the stack trace is reliable or not.

Implement arch_stack_walk_reliable() like arch_stack_walk() but abort
the stack walk as soon as the reliable flag is set to false for any
reason.

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/Kconfig |  1 +
 arch/arm64/kernel/stacktrace.c | 35 ++
 2 files changed, 36 insertions(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 1f212b47a48a..954f60c35b26 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -167,6 +167,7 @@ config ARM64
if $(cc-option,-fpatchable-function-entry=2)
select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY \
if DYNAMIC_FTRACE_WITH_REGS
+   select HAVE_RELIABLE_STACKTRACE
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_FAST_GUP
select HAVE_FTRACE_MCOUNT_RECORD
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 752b77f11c61..5d15c111f3aa 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -361,4 +361,39 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, 
void *cookie,
walk_stackframe(task, &frame, consume_entry, cookie);
 }
 
+/*
+ * Walk the stack like arch_stack_walk() but stop the walk as soon as
+ * some unreliability is detected in the stack.
+ */
+int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
+ void *cookie, struct task_struct *task)
+{
+   struct stackframe frame;
+   int ret = 0;
+
+   if (task == current) {
+   start_backtrace(&frame,
+   (unsigned long)__builtin_frame_address(0),
+   (unsigned long)arch_stack_walk_reliable);
+   } else {
+   /*
+* The task must not be running anywhere for the duration of
+* arch_stack_walk_reliable(). The caller must guarantee
+* this.
+*/
+   start_backtrace(&frame, thread_saved_fp(task),
+   thread_saved_pc(task));
+   }
+
+   while (!ret) {
+   if (!frame.reliable)
+   return -EINVAL;
+   if (!consume_entry(cookie, frame.pc))
+   return -EINVAL;
+   ret = unwind_frame(task, &frame);
+   }
+
+   return ret == -ENOENT ? 0 : -EINVAL;
+}
+
 #endif
-- 
2.25.1



[RFC PATCH v2 6/8] arm64: Check the return PC of every stack frame

2021-03-15 Thread madvenka
From: "Madhavan T. Venkataraman" 

If a function encountered in a stack trace is not a valid kernel text
address, the stack trace is considered unreliable. Mark the stack trace
as not reliable.

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/kernel/stacktrace.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 594806a0c225..358aae3906d7 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -101,6 +101,16 @@ static void check_if_reliable(unsigned long fp, struct 
stackframe *frame,
}
}
 #endif
+
+   /*
+* A NULL or invalid return address probably means there's some
+* generated code which __kernel_text_address() doesn't know about.
+* Mark the stack trace as not reliable.
+*/
+   if (!__kernel_text_address(frame->pc)) {
+   frame->reliable = false;
+   return;
+   }
 }
 
 /*
-- 
2.25.1



[RFC PATCH v2 4/8] arm64: Detect an EL1 exception frame and mark a stack trace unreliable

2021-03-15 Thread madvenka
From: "Madhavan T. Venkataraman" 

EL1 exceptions can happen on any instruction including instructions in
the frame pointer prolog or epilog. Depending on where exactly they happen,
they could render the stack trace unreliable.

If an EL1 exception frame is found on the stack, mark the stack trace as
unreliable.

Now, the EL1 exception frame is not at any well-known offset on the stack.
It can be anywhere on the stack. In order to properly detect an EL1
exception frame the following checks must be done:

- The frame type must be EL1_FRAME.

- When the register state is saved in the EL1 pt_regs, the frame
  pointer x29 is saved in pt_regs->regs[29] and the return PC
  is saved in pt_regs->pc. These must match with the current
  frame.

Interrupts encountered in kernel code are also EL1 exceptions. At the end
of an interrupt, the interrupt handler checks if the current task must be
preempted for any reason. If so, it calls the preemption code which takes
the task off the CPU. A stack trace taken on the task after the preemption
will show the EL1 frame and will be considered unreliable. This is correct
behavior as preemption can happen practically at any point in code
including the frame pointer prolog and epilog.

Breakpoints encountered in kernel code are also EL1 exceptions. The probing
infrastructure uses breakpoints for executing probe code. While in the probe
code, the stack trace will show an EL1 frame and will be considered
unreliable. This is also correct behavior.

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/include/asm/stacktrace.h |  2 +
 arch/arm64/kernel/stacktrace.c  | 57 +
 2 files changed, 59 insertions(+)

diff --git a/arch/arm64/include/asm/stacktrace.h 
b/arch/arm64/include/asm/stacktrace.h
index eb29b1fe8255..684f65808394 100644
--- a/arch/arm64/include/asm/stacktrace.h
+++ b/arch/arm64/include/asm/stacktrace.h
@@ -59,6 +59,7 @@ struct stackframe {
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
int graph;
 #endif
+   bool reliable;
 };
 
 extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame);
@@ -169,6 +170,7 @@ static inline void start_backtrace(struct stackframe *frame,
bitmap_zero(frame->stacks_done, __NR_STACK_TYPES);
frame->prev_fp = 0;
frame->prev_type = STACK_TYPE_UNKNOWN;
+   frame->reliable = true;
 }
 
 #endif /* __ASM_STACKTRACE_H */
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 504cd161339d..6ae103326f7b 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -18,6 +18,58 @@
 #include 
 #include 
 
+static void check_if_reliable(unsigned long fp, struct stackframe *frame,
+ struct stack_info *info)
+{
+   struct pt_regs *regs;
+   unsigned long regs_start, regs_end;
+
+   /*
+* If the stack trace has already been marked unreliable, just
+* return.
+*/
+   if (!frame->reliable)
+   return;
+
+   /*
+* Assume that this is an intermediate marker frame inside a pt_regs
+* structure created on the stack and get the pt_regs pointer. Other
+* checks will be done below to make sure that this is a marker
+* frame.
+*/
+   regs_start = fp - offsetof(struct pt_regs, stackframe);
+   if (regs_start < info->low)
+   return;
+   regs_end = regs_start + sizeof(*regs);
+   if (regs_end > info->high)
+   return;
+   regs = (struct pt_regs *) regs_start;
+
+   /*
+* When an EL1 exception happens, a pt_regs structure is created
+* on the stack and the register state is recorded. Part of the
+* state is the FP and PC at the time of the exception.
+*
+* In addition, the FP and PC are also stored in pt_regs->stackframe
+* and pt_regs->stackframe is chained with other frames on the stack.
+* This is so that the interrupted function shows up in the stack
+* trace.
+*
+* The exception could have happened during the frame pointer
+* prolog or epilog. This could result in a missing frame in
+* the stack trace so that the caller of the interrupted
+* function does not show up in the stack trace.
+*
+* So, mark the stack trace as unreliable if an EL1 frame is
+* detected.
+*/
+   if (regs->frame_type == EL1_FRAME && regs->pc == frame->pc &&
+   regs->regs[29] == frame->fp) {
+   frame->reliable = false;
+   return;
+   }
+}
+
 /*
  * AArch64 PCS assigns the frame pointer to x29.
  *
@@ -114,6 +166,11 @@ int notrace unwind_frame(struct task_struct *tsk, struct 
stackframe *frame)
 
frame->pc = ptrauth_strip_insn_pac(frame->pc);
 
+   /*
+* Check for features that render the stack trace unreliable.
+*/
+   check_if_reliable(fp, frame, 

[RFC PATCH v2 3/8] arm64: Terminate the stack trace at TASK_FRAME and EL0_FRAME

2021-03-15 Thread madvenka
From: "Madhavan T. Venkataraman" 

Implement the following checks in the unwinder to detect the terminating
frame reliably:

- The frame must end in task_pt_regs(task)->stackframe.

- The frame type must be either TASK_FRAME or EL0_FRAME.

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/kernel/stacktrace.c | 18 --
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index ad20981dfda4..504cd161339d 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -43,16 +43,22 @@ int notrace unwind_frame(struct task_struct *tsk, struct 
stackframe *frame)
 {
unsigned long fp = frame->fp;
struct stack_info info;
+   struct pt_regs *regs;
 
-   /* Terminal record; nothing to unwind */
-   if (!fp)
-   return -ENOENT;
+   if (!tsk)
+   tsk = current;
+   regs = task_pt_regs(tsk);
 
-   if (fp & 0xf)
+   /* Terminal record, nothing to unwind */
+   if (fp == (unsigned long) regs->stackframe) {
+   if (regs->frame_type == TASK_FRAME ||
+   regs->frame_type == EL0_FRAME)
+   return -ENOENT;
return -EINVAL;
+   }
 
-   if (!tsk)
-   tsk = current;
+   if (!fp || fp & 0xf)
+   return -EINVAL;
 
if (!on_accessible_stack(tsk, fp, ))
return -EINVAL;
-- 
2.25.1



[RFC PATCH v2 2/8] arm64: Implement frame types

2021-03-15 Thread madvenka
From: "Madhavan T. Venkataraman" 

Apart from the task pt_regs, pt_regs is also created on the stack for
other cases:

- EL1 exception. A pt_regs is created on the stack to save register
  state. In addition, pt_regs->stackframe is set up for the
  interrupted kernel function so that the function shows up in the
  EL1 exception stack trace.

- When a traced function calls the ftrace infrastructure at the
  beginning of the function, ftrace creates a pt_regs on the stack
  at that point to save register state. In addition, it sets up
  pt_regs->stackframe for the traced function so that the traced
  function shows up in the stack trace taken from anywhere in the
  ftrace code after that point. When the ftrace code returns to the
  traced function, the pt_regs is removed from the stack.

To summarize, pt_regs->stackframe is used (or will be used) as a marker
frame in stack traces. To enable the unwinder to detect these frames, tag
each pt_regs->stackframe with a type. To record the type, use the unused2
field in struct pt_regs and rename it to frame_type. The types are:

TASK_FRAME
Terminating frame for a normal stack trace.
EL0_FRAME
Terminating frame for an EL0 exception.
EL1_FRAME
EL1 exception frame.
FTRACE_FRAME
FTRACE frame.

These frame types will be used by the unwinder later to validate frames.

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/include/asm/ptrace.h | 15 +--
 arch/arm64/kernel/asm-offsets.c |  1 +
 arch/arm64/kernel/entry.S   |  4 
 arch/arm64/kernel/head.S|  2 ++
 arch/arm64/kernel/process.c |  1 +
 5 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index e58bca832dff..a75211ce009a 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -117,6 +117,17 @@
  */
 #define NO_SYSCALL (-1)
 
+/*
+ * pt_regs->stackframe is a marker frame that is used in different
+ * situations. These are the different frame types. Use distinctive
+ * patterns instead of small integers (0, 1, 2, 3, ...) so that they
+ * are less likely to occur on the stack by accident.
+ */
+#define TASK_FRAME 0xDEADBEE0  /* Task stack termination frame */
+#define EL0_FRAME  0xDEADBEE1  /* EL0 exception frame */
+#define EL1_FRAME  0xDEADBEE2  /* EL1 exception frame */
+#define FTRACE_FRAME   0xDEADBEE3  /* FTrace frame */
+
 #ifndef __ASSEMBLY__
 #include 
 #include 
@@ -187,11 +198,11 @@ struct pt_regs {
};
u64 orig_x0;
 #ifdef __AARCH64EB__
-   u32 unused2;
+   u32 frame_type;
s32 syscallno;
 #else
s32 syscallno;
-   u32 unused2;
+   u32 frame_type;
 #endif
u64 sdei_ttbr1;
/* Only valid when ARM64_HAS_IRQ_PRIO_MASKING is enabled. */
diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
index a36e2fc330d4..43f97dbc7dfc 100644
--- a/arch/arm64/kernel/asm-offsets.c
+++ b/arch/arm64/kernel/asm-offsets.c
@@ -75,6 +75,7 @@ int main(void)
   DEFINE(S_SDEI_TTBR1, offsetof(struct pt_regs, sdei_ttbr1));
   DEFINE(S_PMR_SAVE,   offsetof(struct pt_regs, pmr_save));
   DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe));
+  DEFINE(S_FRAME_TYPE, offsetof(struct pt_regs, frame_type));
   DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs));
   BLANK();
 #ifdef CONFIG_COMPAT
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index e2dc2e998934..ecc3507d9cdd 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -269,8 +269,12 @@ alternative_else_nop_endif
 */
.if \el == 0
stp xzr, xzr, [sp, #S_STACKFRAME]
+   ldr w17, =EL0_FRAME
+   str w17, [sp, #S_FRAME_TYPE]
.else
stp x29, x22, [sp, #S_STACKFRAME]
+   ldr w17, =EL1_FRAME
+   str w17, [sp, #S_FRAME_TYPE]
.endif
add x29, sp, #S_STACKFRAME
 
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 2769b20934d4..d2ee78f8f97f 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -410,6 +410,8 @@ SYM_FUNC_END(__create_page_tables)
 */
.macro setup_last_frame
sub sp, sp, #PT_REGS_SIZE
+   ldr w17, =TASK_FRAME
+   str w17, [sp, #S_FRAME_TYPE]
stp xzr, xzr, [sp, #S_STACKFRAME]
add x29, sp, #S_STACKFRAME
ldr x30, =ret_from_fork
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 7ffa689e8b60..5c152fd60503 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -442,6 +442,7 @@ int copy_thread(unsigned long clone_flags, unsigned long 
stack_start,
 * as the last frame for the new task.
 */
p->thread.cpu_context.fp = (unsigned long)childregs->stackframe;
+   

[RFC PATCH v2 5/8] arm64: Detect an FTRACE frame and mark a stack trace unreliable

2021-03-15 Thread madvenka
From: "Madhavan T. Venkataraman" 

When CONFIG_DYNAMIC_FTRACE_WITH_REGS is enabled and tracing is activated
for a function, the ftrace infrastructure is called for the function at
the very beginning. Ftrace creates two frames:

- One for the traced function

- One for the caller of the traced function

That gives a reliable stack trace while executing in the ftrace
infrastructure code. When ftrace returns to the traced function, the frames
are popped and everything is back to normal.

However, in cases like live patch, execution is redirected to a different
function when ftrace returns. A stack trace taken while still in the ftrace
infrastructure code will not show the target function. The target function
is the real function that we want to track.

So, if an FTRACE frame is detected on the stack, just mark the stack trace
as unreliable.

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/kernel/entry-ftrace.S |  2 ++
 arch/arm64/kernel/stacktrace.c   | 33 
 2 files changed, 35 insertions(+)

diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S
index b3e4f9a088b1..1ec8c5180fc0 100644
--- a/arch/arm64/kernel/entry-ftrace.S
+++ b/arch/arm64/kernel/entry-ftrace.S
@@ -74,6 +74,8 @@
/* Create our frame record within pt_regs. */
stp x29, x30, [sp, #S_STACKFRAME]
add x29, sp, #S_STACKFRAME
+   ldr w17, =FTRACE_FRAME
+   str w17, [sp, #S_FRAME_TYPE]
.endm
 
 SYM_CODE_START(ftrace_regs_caller)
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 6ae103326f7b..594806a0c225 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -23,6 +23,7 @@ static void check_if_reliable(unsigned long fp, struct 
stackframe *frame,
 {
struct pt_regs *regs;
unsigned long regs_start, regs_end;
+   unsigned long caller_fp;
 
/*
 * If the stack trace has already been marked unreliable, just
@@ -68,6 +69,38 @@ static void check_if_reliable(unsigned long fp, struct 
stackframe *frame,
frame->reliable = false;
return;
}
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+   /*
+* When tracing is active for a function, the ftrace code is called
+* from the function even before the frame pointer prolog and
+* epilog. ftrace creates a pt_regs structure on the stack to save
+* register state.
+*
+* In addition, ftrace sets up two stack frames and chains them
+* with other frames on the stack. One frame is pt_regs->stackframe
+* that is for the traced function. The other frame is set up right
+* after the pt_regs structure and it is for the caller of the
+* traced function. This is done to ensure a proper stack trace.
+*
+* If the ftrace code returns to the traced function, then all is
+* fine. But if it transfers control to a different function (like
+* in livepatch), then a stack walk performed while still in the
+* ftrace code will not find the target function.
+*
+* So, mark the stack trace as unreliable if an ftrace frame is
+* detected.
+*/
+   if (regs->frame_type == FTRACE_FRAME && frame->fp == regs_end &&
+   frame->fp < info->high) {
+   /* Check the traced function's caller's frame. */
+   caller_fp = READ_ONCE_NOCHECK(*(unsigned long *)(frame->fp));
+   if (caller_fp == regs->regs[29]) {
+   frame->reliable = false;
+   return;
+   }
+   }
+#endif
 }
 
 /*
-- 
2.25.1



[RFC PATCH v2 0/8] arm64: Implement reliable stack trace

2021-03-15 Thread madvenka
From: "Madhavan T. Venkataraman" 

I have made an attempt to implement reliable stack trace for arm64 so
it can be used for livepatch. Below is the list of changes. I have
documented my understanding of the issues and solutions below as well
as in the patch descriptions and the code. Please let me know if my
understanding is incorrect or incomplete anywhere.

Stack termination record


The unwinder needs to be able to reliably tell when it has reached the end
of a stack trace. One way to do this is to have the last stack frame at a
fixed offset from the base of the task stack. When the unwinder reaches
that offset, it knows it is done.

All tasks have a pt_regs structure right after the task stack in the stack
page. The pt_regs structure contains a stackframe field. Make this stackframe
field the last frame in the task stack so all stack traces end at a fixed
stack offset.

For kernel tasks, this is simple to understand. For user tasks, there is
some extra detail. User tasks get created via fork() et al. Once they return
from fork, they enter the kernel only on an EL0 exception. In arm64,
system calls are also EL0 exceptions.

The EL0 exception handler uses the task pt_regs mentioned above to save
register state and call different exception functions. All stack traces
from EL0 exception code must end at the pt_regs. So, make pt_regs->stackframe
the last frame in the EL0 exception stack.

To summarize, task_pt_regs(task)->stackframe will always be the stack
termination record.

Implement frame types
=

Apart from the task pt_regs, pt_regs is also created on the stack for two
other cases:

EL1 exceptions:
When the kernel encounters an exception (more on this below),
it is called an EL1 exception. A pt_regs is created on the
stack at that point to save register state. In addition,
pt_regs->stackframe is set up for the interrupted kernel function
so that the function shows up in the EL1 exception stack trace.

Ftrace:
When CONFIG_DYNAMIC_FTRACE_WITH_REGS is on, the ftrace infrastructure
is called at the beginning of a traced function, ftrace creates a
pt_regs on the stack at that point to save register state. In addition,
it sets up pt_regs->stackframe for the traced function so that the
traced function shows up in the stack trace taken from anywhere in
the ftrace code after that point. When the ftrace code returns to the
traced function, the pt_regs is removed from the stack.

To summarize, pt_regs->stackframe is used as a marker frame in stack traces.
To enable the unwinder to detect these frames, tag each pt_regs->stackframe
with a type. To record the type, use the unused2 field in struct pt_regs and
rename it to frame_type. The types are:

TASK_FRAME
Terminating frame for a normal stack trace.
EL0_FRAME
Terminating frame for an EL0 exception.
EL1_FRAME
EL1 exception frame.
FTRACE_FRAME
FTRACE frame.

These frame types will be used by the unwinder later to validate frames.

Proper termination of the stack trace
=

In the unwinder, check the following for properly terminating the stack
trace:

- Check every frame to see if it is task_pt_regs(stack)->stackframe.
  If it is, terminate the stack trace successfully.

- For additional validation, make sure that the frame_type is either
  TASK_FRAME or EL0_FRAME.

Detect EL1 frame


The kernel runs at Exception Level 1. If an exception happens while
executing in the kernel, it is an EL1 exception. This includes interrupts
which are asynchronous exceptions in arm64.

EL1 exceptions can happen on any instruction including instructions in
the frame pointer prolog or epilog. Depending on where exactly they happen,
they could render the stack trace unreliable.

If an EL1 exception frame is found on the stack, mark the stack trace as
unreliable.

Now, the EL1 exception frame is not at any well-known offset on the stack.
It can be anywhere on the stack. In order to properly detect an EL1
exception frame, some checks must be done. See the patch description and
the code for more detail.

There are two special cases to be aware of:

- At the end of an interrupt, the code checks if the current task
  must be preempted for any reason. If so, it calls the preemption
  code which takes the task off the CPU. A stack trace taken on
  the task after the preemption will show the EL1 frame and will be
  considered unreliable. Preemption can happen practically at any
  point in code including the frame pointer prolog and epilog.

- Breakpoints encountered in kernel code are also EL1 exceptions.
  The probing infrastructure uses breakpoints for executing
  probe code. While in the probe code, the stack trace will show
  an EL1 frame and will be 

[RFC PATCH v2 1/8] arm64: Implement stack trace termination record

2021-03-15 Thread madvenka
From: "Madhavan T. Venkataraman" 

The unwinder needs to be able to reliably tell when it has reached the end
of a stack trace. One way to do this is to have the last stack frame at a
fixed offset from the base of the task stack. When the unwinder reaches
that offset, it knows it is done.

Kernel Tasks


All tasks except the idle task have a pt_regs structure right after the
task stack. This is called the task pt_regs. The pt_regs structure has a
special stackframe field. Make this stackframe field the last frame in the
task stack. This needs to be done in copy_thread() which initializes a new
task's pt_regs and initial CPU context.

For the idle task, there is no task pt_regs. For our purpose, we need one.
So, create a pt_regs just like other kernel tasks and make
pt_regs->stackframe the last frame in the idle task stack. This needs to be
done at two places:

- On the primary CPU, the boot task runs. It calls start_kernel()
  and eventually becomes the idle task for the primary CPU. Just
  before start_kernel() is called, set up the last frame.

- On each secondary CPU, a startup task runs that calls
  secondary_start_kernel() and eventually becomes the idle task
  on the secondary CPU. Just before secondary_start_kernel() is
  called, set up the last frame.

User Tasks
==

User tasks are initially set up like kernel tasks when they are created.
Then, they return to userland after fork via ret_from_fork(). After that,
they enter the kernel only on an EL0 exception. (In arm64, system calls are
also EL0 exceptions). The EL0 exception handler stores state in the task
pt_regs and calls different functions based on the type of exception. The
stack trace for an EL0 exception must end at the task pt_regs. So, make
task pt_regs->stackframe the last frame in the EL0 exception stack.

In summary, task pt_regs->stackframe is where a successful stack trace ends.

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/kernel/entry.S   |  8 +---
 arch/arm64/kernel/head.S| 28 
 arch/arm64/kernel/process.c |  5 +
 3 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index a31a0a713c85..e2dc2e998934 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -261,16 +261,18 @@ alternative_else_nop_endif
stp lr, x21, [sp, #S_LR]
 
/*
-* For exceptions from EL0, terminate the callchain here.
+* For exceptions from EL0, terminate the callchain here at
+* task_pt_regs(current)->stackframe.
+*
 * For exceptions from EL1, create a synthetic frame record so the
 * interrupted code shows up in the backtrace.
 */
.if \el == 0
-   mov x29, xzr
+   stp xzr, xzr, [sp, #S_STACKFRAME]
.else
stp x29, x22, [sp, #S_STACKFRAME]
-   add x29, sp, #S_STACKFRAME
.endif
+   add x29, sp, #S_STACKFRAME
 
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
 alternative_if_not ARM64_HAS_PAN
diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S
index 66b0e0b66e31..2769b20934d4 100644
--- a/arch/arm64/kernel/head.S
+++ b/arch/arm64/kernel/head.S
@@ -393,6 +393,28 @@ SYM_FUNC_START_LOCAL(__create_page_tables)
ret x28
 SYM_FUNC_END(__create_page_tables)
 
+   /*
+* The boot task becomes the idle task for the primary CPU. The
+* CPU startup task on each secondary CPU becomes the idle task
+* for the secondary CPU.
+*
+* The idle task does not require pt_regs. But create a dummy
+* pt_regs so that task_pt_regs(idle_task)->stackframe can be
+* set up to be the last frame on the idle task stack just like
+* all the other kernel tasks. This helps the unwinder to
+* terminate the stack trace at a well-known stack offset.
+*
+* Also, set up the last return PC to be ret_from_fork() just
+* like all the other kernel tasks so that the stack trace of
+* all kernel tasks ends with the same function.
+*/
+   .macro setup_last_frame
+   sub sp, sp, #PT_REGS_SIZE
+   stp xzr, xzr, [sp, #S_STACKFRAME]
+   add x29, sp, #S_STACKFRAME
+   ldr x30, =ret_from_fork
+   .endm
+
 /*
  * The following fragment of code is executed with the MMU enabled.
  *
@@ -447,8 +469,7 @@ SYM_FUNC_START_LOCAL(__primary_switched)
 #endif
bl  switch_to_vhe   // Prefer VHE if possible
add sp, sp, #16
-   mov x29, #0
-   mov x30, #0
+   setup_last_frame
b   start_kernel
 SYM_FUNC_END(__primary_switched)
 
@@ -606,8 +627,7 @@ SYM_FUNC_START_LOCAL(__secondary_switched)
cbz x2, __secondary_too_slow
msr sp_el0, x2
scs_load x2, x3
-   mov x29, #0
-   mov x30, #0
+   

[RFC PATCH v1 1/1] arm64: Unwinder enhancements for reliable stack trace

2021-02-23 Thread madvenka
From: "Madhavan T. Venkataraman" 

Unwinder changes


Termination
===

Currently, the unwinder terminates when both the FP (frame pointer)
and the PC (return address) of a frame are 0. But a frame could get
corrupted and zeroed. There needs to be a better check.

The following special terminating frame and function have been
defined for this purpose:

const u64 arm64_last_frame[2] __attribute__ ((aligned (16)));

void arm64_last_func(void)
{
}

So, set the FP to arm64_last_frame and the PC to arm64_last_func in
the bottom most frame.
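
A minimal sketch of the matching unwinder check (illustration only; the
actual patch below may differ in detail):

        /* The bottom-most frame record now holds the two sentinels. */
        if (frame->fp == (unsigned long)arm64_last_frame &&
            frame->pc == (unsigned long)arm64_last_func)
                return -ENOENT; /* proper stack trace termination */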

Exception/Interrupt detection
=

An EL1 exception renders the stack trace unreliable as it can happen
anywhere including the frame pointer prolog and epilog. The
unwinder needs to be able to detect the exception on the stack.

Currently, the EL1 exception handler sets up pt_regs on the stack
and chains pt_regs->stackframe with the other frames on the stack.
But, the unwinder does not know where this exception frame is in
the stack trace.

Set the LSB of the exception frame FP to allow the unwinder to
detect the exception frame. When the unwinder detects the frame,
it needs to make sure that it is really an exception frame and
not the result of any stack corruption.

It can do this if the FP and PC are also recorded elsewhere in the
pt_regs for comparison. Currently, the FP is also stored in
regs->regs[29]. The PC is stored in regs->pc. However, regs->pc can
be changed by lower level functions.

Create a new field, pt_regs->orig_pc, and record the return address
PC there. With this, the unwinder can validate the exception frame
and set a flag so that the caller of the unwinder can know when
an exception frame is encountered.
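
A sketch of how the unwinder can use this (illustration only; apart from
regs->regs[29] and regs->orig_pc, the names below are made up for the
example):

        /* An FP with its LSB set points at pt_regs->stackframe of an EL1 frame. */
        if (fp & 0x1) {
                struct pt_regs *regs;
                unsigned long next_fp, next_pc;

                fp &= ~0x1UL;
                regs = (struct pt_regs *)(fp - offsetof(struct pt_regs, stackframe));
                next_fp = READ_ONCE(*(unsigned long *)fp);
                next_pc = READ_ONCE(*(unsigned long *)(fp + 8));

                /* Validate against the copies saved in pt_regs. */
                if (next_fp == regs->regs[29] && next_pc == regs->orig_pc)
                        frame->exception = true;        /* hypothetical flag */
        }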

Unwinder return value
=

Currently, the unwinder returns -EINVAL for stack trace termination
as well as stack trace error. Return -ENOENT for stack trace
termination and -EINVAL for error to disambiguate. This idea has
been borrowed from Mark Brown.

Reliable stack trace function
=

Implement arch_stack_walk_reliable(). This function walks the stack like
the existing stack trace functions with a couple of additional checks:

Return address check


For each frame, check the return address to see if it is a
proper kernel text address. If not, return -EINVAL.

Exception frame check
-

Check each frame to see if it is an EL1 exception frame. If it is,
return -EINVAL.

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/include/asm/processor.h  |   2 +
 arch/arm64/include/asm/ptrace.h |   7 ++
 arch/arm64/include/asm/stacktrace.h |   5 ++
 arch/arm64/kernel/asm-offsets.c |   1 +
 arch/arm64/kernel/entry.S   |  14 +++-
 arch/arm64/kernel/head.S|   8 +--
 arch/arm64/kernel/process.c |  12 
 arch/arm64/kernel/stacktrace.c  | 103 +---
 8 files changed, 137 insertions(+), 15 deletions(-)

diff --git a/arch/arm64/include/asm/processor.h 
b/arch/arm64/include/asm/processor.h
index ca2cd75d3286..d268c74d262e 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -195,6 +195,8 @@ static inline void start_thread_common(struct pt_regs 
*regs, unsigned long pc)
memset(regs, 0, sizeof(*regs));
forget_syscall(regs);
regs->pc = pc;
+   regs->stackframe[0] = (u64) arm64_last_frame;
+   regs->stackframe[1] = (u64) arm64_last_func;
 
if (system_uses_irq_prio_masking())
regs->pmr_save = GIC_PRIO_IRQON;
diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index e58bca832dff..a15750a9f6e5 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -201,8 +201,15 @@ struct pt_regs {
/* Only valid for some EL1 exceptions. */
u64 lockdep_hardirqs;
u64 exit_rcu;
+
+   /* Only valid for EL1 exceptions. */
+   u64 orig_pc;
+   u64 unused1;
 };
 
+extern const u64 arm64_last_frame[2];
+extern void arm64_last_func(void);
+
 static inline bool in_syscall(struct pt_regs const *regs)
 {
return regs->syscallno != NO_SYSCALL;
diff --git a/arch/arm64/include/asm/stacktrace.h 
b/arch/arm64/include/asm/stacktrace.h
index eb29b1fe8255..9760ceddbd78 100644
--- a/arch/arm64/include/asm/stacktrace.h
+++ b/arch/arm64/include/asm/stacktrace.h
@@ -49,6 +49,9 @@ struct stack_info {
  *
  * @graph:   When FUNCTION_GRAPH_TRACER is selected, holds the index of a
  *   replacement lr value in the ftrace graph stack.
+ 

[RFC PATCH v1 0/1] arm64: Unwinder enhancements for reliable stack trace

2021-02-23 Thread madvenka
From: "Madhavan T. Venkataraman" 

I have made an attempt to add some enhancements to the stack trace code
so it is a few steps closer to what is required for livepatch.

Unwinder changes


Termination
===

Currently, the unwinder terminates when both the FP (frame pointer)
and the PC (return address) of a frame are 0. But a frame could get
corrupted and zeroed. There needs to be a better check.

The following special terminating frame and function have been
defined for this purpose:

const u64 arm64_last_frame[2] __attribute__ ((aligned (16)));

void arm64_last_func(void)
{
}

In this patch, the FP is set to arm64_last_frame and the PC is set
to arm64_last_func in the bottom most frame.

Exception/Interrupt detection
=

An EL1 exception renders the stack trace unreliable as it can happen
anywhere including the frame pointer prolog and epilog. The
unwinder needs to be able to detect the exception on the stack.

Currently, the EL1 exception handler sets up pt_regs on the stack.
pt_regs contains a stack frame field that can hold an FP and a PC.
The exception handler chains this stack frame field along with other
frames on the stack. In other words, the EL1 handler creates a
synthetic exception frame. Currently, the unwinder does not know
where this exception frame is in the stack trace.

In this patch, the LSB of the exception frame FP is set. This is
similar to what is done on X86. When the unwinder detects the frame
with the LSB set, it needs to make sure that it is really an
exception frame and not the result of any stack corruption.

It can do this if the FP and PC are also recorded elsewhere in the
pt_regs for comparison. Currently, the FP is also stored in
regs->regs[29]. The PC is stored in regs->pc. However, regs->pc can
be changed by lower level functions. So, the PC needs to be stored
somewhere else as well.

This patch defines a new field, pt_regs->orig_pc, and records the
PC there. With this, the unwinder can validate the exception frame
and set a flag so that the caller of the unwinder can know when
an exception frame is encountered.

Unwinder return value
=

Currently, the unwinder returns -EINVAL for stack trace termination
as well as stack trace error. In this patch, the unwinder returns
-ENOENT for stack trace termination and -EINVAL for error. This idea
has been plagiarized from Mark Brown.

Reliable stack trace function
=

arch_stack_walk_reliable() is implemented in this patch. It walks the
stack like the existing stack trace functions with a couple of additional
checks:

Return address check


For each frame, the return address is checked to see if it is
a proper kernel text address. If not, the stack walk fails.

Exception frame check
-

Each frame is checked to see if it is an EL1 exception frame.
If it is, the stack walk fails.

Signed-off-by: Madhavan T. Venkataraman 

Madhavan T. Venkataraman (1):
  arm64: Unwinder enhancements for reliable stack trace

 arch/arm64/include/asm/processor.h  |   2 +
 arch/arm64/include/asm/ptrace.h |   7 ++
 arch/arm64/include/asm/stacktrace.h |   5 ++
 arch/arm64/kernel/asm-offsets.c |   1 +
 arch/arm64/kernel/entry.S   |  14 +++-
 arch/arm64/kernel/head.S|   8 +--
 arch/arm64/kernel/process.c |  12 
 arch/arm64/kernel/stacktrace.c  | 103 +---
 8 files changed, 137 insertions(+), 15 deletions(-)


base-commit: e0756cfc7d7cd08c98a53b6009c091a3f6a50be6
-- 
2.25.1



[PATCH v2 4/4] [RFC] arm/trampfd: Provide support for the trampoline file descriptor

2020-09-22 Thread madvenka
From: "Madhavan T. Venkataraman" 

- Define architecture specific register names
- Architecture specific functions for:
- system call init
- code descriptor check
- data descriptor check
- Fill a page with a trampoline table,

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm/include/uapi/asm/ptrace.h |  21 +
 arch/arm/kernel/Makefile   |   1 +
 arch/arm/kernel/trampfd.c  | 124 +
 arch/arm/tools/syscall.tbl |   1 +
 4 files changed, 147 insertions(+)
 create mode 100644 arch/arm/kernel/trampfd.c

diff --git a/arch/arm/include/uapi/asm/ptrace.h 
b/arch/arm/include/uapi/asm/ptrace.h
index e61c65b4018d..598047768f9b 100644
--- a/arch/arm/include/uapi/asm/ptrace.h
+++ b/arch/arm/include/uapi/asm/ptrace.h
@@ -151,6 +151,27 @@ struct pt_regs {
 #define ARM_r0 uregs[0]
#define ARM_ORIG_r0    uregs[17]
 
+/*
+ * These register names are to be used by 32-bit applications.
+ */
+enum reg_32_name {
+   arm_min,
+   arm_r0 = arm_min,
+   arm_r1,
+   arm_r2,
+   arm_r3,
+   arm_r4,
+   arm_r5,
+   arm_r6,
+   arm_r7,
+   arm_r8,
+   arm_r9,
+   arm_r10,
+   arm_r11,
+   arm_r12,
+   arm_max,
+};
+
 /*
  * The size of the user-visible VFP state as seen by PTRACE_GET/SETVFPREGS
  * and core dumps.
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index 89e5d864e923..652c54c2f19a 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -105,5 +105,6 @@ obj-$(CONFIG_SMP)   += psci_smp.o
 endif
 
 obj-$(CONFIG_HAVE_ARM_SMCCC)   += smccc-call.o
+obj-$(CONFIG_TRAMPFD)  += trampfd.o
 
 extra-y := $(head-y) vmlinux.lds
diff --git a/arch/arm/kernel/trampfd.c b/arch/arm/kernel/trampfd.c
new file mode 100644
index ..45146ed489e8
--- /dev/null
+++ b/arch/arm/kernel/trampfd.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Trampoline FD - ARM support.
+ *
+ * Author: Madhavan T. Venkataraman (madve...@linux.microsoft.com)
+ *
+ * Copyright (c) 2020, Microsoft Corporation.
+ */
+
+#include 
+#include 
+
+#define TRAMPFD_CODE_SIZE  28
+
+/*
+ * trampfd syscall.
+ */
+void trampfd_arch(struct trampfd_info *info)
+{
+   info->code_size = TRAMPFD_CODE_SIZE;
+   info->ntrampolines = PAGE_SIZE / info->code_size;
+   info->code_offset = TRAMPFD_CODE_PGOFF << PAGE_SHIFT;
+   info->reserved = 0;
+}
+
+/*
+ * trampfd code descriptor check.
+ */
+int trampfd_code_arch(struct trampfd_code *code)
+{
+   int ntrampolines;
+   int min, max;
+
+   min = arm_min;
+   max = arm_max;
+   ntrampolines = PAGE_SIZE / TRAMPFD_CODE_SIZE;
+
+   if (code->reg < min || code->reg >= max)
+   return -EINVAL;
+
+   if (!code->ntrampolines || code->ntrampolines > ntrampolines)
+   return -EINVAL;
+   return 0;
+}
+
+/*
+ * trampfd data descriptor check.
+ */
+int trampfd_data_arch(struct trampfd_data *data)
+{
+   int min, max;
+
+   min = arm_min;
+   max = arm_max;
+
+   if (data->reg < min || data->reg >= max)
+   return -EINVAL;
+   return 0;
+}
+
+#define MOVW(ins, reg, imm32)  \
+{  \
+   u16 *_imm16 = (u16 *) &(imm32); /* little endian */ \
+   int _hw, _opcode;   \
+   \
+   for (_hw = 0; _hw < 2; _hw++) { \
+   /* movw or movt */  \
+   _opcode = _hw ? 0xe340 : 0xe300;\
+   *ins++ = _opcode | (_imm16[_hw] >> 12) << 16 |  \
+(reg) << 12 | (_imm16[_hw] & 0xFFF);   \
+   }   \
+}
+
+#define LDR(ins, reg)  \
+{  \
+   *ins++ = 0xe590 | (reg) << 16 | (reg) << 12;\
+}
+
+#define BX(ins, reg)   \
+{  \
+   *ins++ = 0xe12fff10 | (reg);\
+}
+
+void trampfd_code_fill(struct trampfd *trampfd, char *addr)
+{
+   char*eaddr = addr + PAGE_SIZE;
+   int creg = trampfd->code_reg - arm_min;
+   int dreg = trampfd->data_reg - arm_min;
+   u32 *code = trampfd->code;
+   u32 *data = trampfd->data;
+   u32 *instruction = (u32 *) addr;
+   int i;
+
+   for (i = 0; i < trampfd->ntrampolines; i++, code++, 

[PATCH v2 2/4] [RFC] x86/trampfd: Provide support for the trampoline file descriptor

2020-09-22 Thread madvenka
From: "Madhavan T. Venkataraman" 

- Define architecture specific register names
- Architecture specific functions for:
- system call init
- code descriptor check
- data descriptor check
- Fill a page with a trampoline table for:
- 32-bit user process
- 64-bit user process

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/x86/entry/syscalls/syscall_32.tbl |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 arch/x86/include/uapi/asm/ptrace.h |  38 
 arch/x86/kernel/Makefile   |   1 +
 arch/x86/kernel/trampfd.c  | 238 +
 5 files changed, 279 insertions(+)
 create mode 100644 arch/x86/kernel/trampfd.c

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl 
b/arch/x86/entry/syscalls/syscall_32.tbl
index d8f8a1a69ed1..d4f17806c9ab 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -443,3 +443,4 @@
437 i386    openat2                 sys_openat2
438 i386    pidfd_getfd             sys_pidfd_getfd
439 i386    faccessat2              sys_faccessat2
+440    i386    trampfd                 sys_trampfd
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
b/arch/x86/entry/syscalls/syscall_64.tbl
index 78847b32e137..91b37bc4b6f0 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -360,6 +360,7 @@
437 common  openat2                 sys_openat2
438 common  pidfd_getfd             sys_pidfd_getfd
439 common  faccessat2              sys_faccessat2
+440    common  trampfd                 sys_trampfd
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/include/uapi/asm/ptrace.h 
b/arch/x86/include/uapi/asm/ptrace.h
index 85165c0edafc..b4be362929b3 100644
--- a/arch/x86/include/uapi/asm/ptrace.h
+++ b/arch/x86/include/uapi/asm/ptrace.h
@@ -9,6 +9,44 @@
 
 #ifndef __ASSEMBLY__
 
+/*
+ * These register names are to be used by 32-bit applications.
+ */
+enum reg_32_name {
+   x32_min = 0,
+   x32_eax = x32_min,
+   x32_ebx,
+   x32_ecx,
+   x32_edx,
+   x32_esi,
+   x32_edi,
+   x32_ebp,
+   x32_max,
+};
+
+/*
+ * These register names are to be used by 64-bit applications.
+ */
+enum reg_64_name {
+   x64_min = x32_max,
+   x64_rax = x64_min,
+   x64_rbx,
+   x64_rcx,
+   x64_rdx,
+   x64_rsi,
+   x64_rdi,
+   x64_rbp,
+   x64_r8,
+   x64_r9,
+   x64_r10,
+   x64_r11,
+   x64_r12,
+   x64_r13,
+   x64_r14,
+   x64_r15,
+   x64_max,
+};
+
 #ifdef __i386__
 /* this struct defines the way the registers are stored on the
stack during a system call. */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index e77261db2391..feb7f4f311fd 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -157,3 +157,4 @@ ifeq ($(CONFIG_X86_64),y)
 endif
 
 obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT)   += ima_arch.o
+obj-$(CONFIG_TRAMPFD)  += trampfd.o
diff --git a/arch/x86/kernel/trampfd.c b/arch/x86/kernel/trampfd.c
new file mode 100644
index ..7b812c200d01
--- /dev/null
+++ b/arch/x86/kernel/trampfd.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Trampoline FD - X86 support.
+ *
+ * Author: Madhavan T. Venkataraman (madve...@linux.microsoft.com)
+ *
+ * Copyright (c) 2020, Microsoft Corporation.
+ */
+
+#include 
+#include 
+
+#define TRAMPFD_CODE_32_SIZE   24
+#define TRAMPFD_CODE_64_SIZE   40
+
+static inline bool is_compat(void)
+{
+   return (IS_ENABLED(CONFIG_X86_32) ||
+   (IS_ENABLED(CONFIG_COMPAT) && test_thread_flag(TIF_ADDR32)));
+}
+
+/*
+ * trampfd syscall.
+ */
+void trampfd_arch(struct trampfd_info *info)
+{
+   if (is_compat())
+   info->code_size = TRAMPFD_CODE_32_SIZE;
+   else
+   info->code_size = TRAMPFD_CODE_64_SIZE;
+   info->ntrampolines = PAGE_SIZE / info->code_size;
+   info->code_offset = TRAMPFD_CODE_PGOFF << PAGE_SHIFT;
+   info->reserved = 0;
+}
+
+/*
+ * trampfd code descriptor check.
+ */
+int trampfd_code_arch(struct trampfd_code *code)
+{
+   int ntrampolines;
+   int min, max;
+
+   if (is_compat()) {
+   min = x32_min;
+   max = x32_max;
+   ntrampolines = PAGE_SIZE / TRAMPFD_CODE_32_SIZE;
+   } else {
+   min = x64_min;
+   max = x64_max;
+   ntrampolines = PAGE_SIZE / TRAMPFD_CODE_64_SIZE;
+   }
+
+   if (code->reg < min || code->reg >= max)
+   return -EINVAL;
+
+   if (!code->ntrampolines || code->ntrampolines > ntrampolines)
+   return -EINVAL;
+   return 0;
+}
+
+/*
+ * trampfd data descriptor check.
+ */
+int trampfd_data_arch(struct trampfd_data *data)
+{
+   int min, max;

[PATCH v2 1/4] [RFC] fs/trampfd: Implement the trampoline file descriptor API

2020-09-22 Thread madvenka
From: "Madhavan T. Venkataraman" 

Introduction


Dynamic code is used in many different user applications. Dynamic code is
often generated at runtime. Dynamic code can also just be a pre-defined
sequence of machine instructions in a data buffer. Examples of dynamic
code are trampolines, JIT code, DBT code, etc.

Dynamic code is placed either in a data page or in a stack page. In order
to execute dynamic code, the page it resides in needs to be mapped with
execute permissions. Writable pages with execute permissions provide an
attack surface for hackers. Attackers can use this to inject malicious
code, modify existing code or do other harm.

To mitigate this, LSMs such as SELinux implement W^X. That is, they may not
allow pages to have both write and execute permissions. This prevents
dynamic code from executing and blocks applications that use it. To allow
genuine applications to run, exceptions have to be made for them (by setting
execmem, etc) which opens the door to security issues.

The W^X implementation today is not complete. There exist many user level
tricks that can be used to load and execute dynamic code. E.g.,

- Load the code into a file and map the file with R-X.

- Load the code in an RW- page. Change the permissions to R--. Then,
  change the permissions to R-X.

- Load the code in an RW- page. Remap the page with R-X to get a separate
  mapping to the same underlying physical page.

IMO, these are all security holes as an attacker can exploit them to inject
his own code.

In the future, these holes will definitely be closed. For instance, LSMs
(such as the IPE proposal [1]) may only allow code in properly signed object
files to be mapped with execute permissions. This will do two things:

- user level tricks using anonymous pages will fail as anonymous
  pages have no file identity

- loading the code in a temporary file and mapping it with R-X
  will fail as the temporary file would not have a signature

We need a way to execute such code without making security exceptions.
Trampolines are a good example of dynamic code. A couple of examples
of trampolines are given below. My first use case for this RFC is
libffi.

Solution


The solution is to convert dynamic code to static code and place it in a
source file. The binary generated from the source can be signed. The kernel
can use signature verification to authenticate the binary and allow the code
to be mapped and executed.

The problem is that the static code has to be able to find the data that it
needs when it executes. For functions, the ABI defines the way to pass
parameters. But, for arbitrary dynamic code, there isn't a standard ABI
compliant way to pass data to the code for most architectures. Each instance
of dynamic code defines its own way. For instance, co-location of code and
data and PC-relative data referencing are used in cases where the ISA
supports it.

We need one standard way that would work for all architectures and ABIs.

The solution has two parts:

1. The maintainer of the code writes the static code assuming that the data
   needed by the code is already pointed to by a designated register.

2. The kernel supplies a small universal trampoline that does the following:

- Load the address of the data in a designated register
- Load the address of the static code in a designated register
- Jump to the static code

User code would use a kernel supplied API to create and map the trampoline.
The address values would be baked into the code so that no special ISA
features are needed.

To conserve memory, the kernel will pack as many trampolines as possible in
a page and provide a trampoline table to user code. The table itself is
managed by the user.

Kernel API
==

A kernel API based on anonymous file descriptors is defined to create
trampolines. The following sections describe the API.

Create trampfd
==

This feature introduces a new trampfd system call.

struct trampfd_info info;
int trampfd;

trampfd = syscall(440, &info);

The kernel creates a trampoline file object and returns the following items
in info:

ntrampolines
The number of trampolines that can be created with one trampfd. The
user may create fewer trampolines if he wishes.

code_size
The size of each trampoline.

code_offset
The file offset to be used in mmap() to map the trampoline code.
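
As an illustration, creating a trampfd and reading this information back
might look like the sketch below. The struct layout and field types are
assumptions (only the field names, the returned items and syscall number
440 come from this description). With 4 KB pages and the 40-byte 64-bit
trampoline defined in the x86 patch (2/4), ntrampolines would come back
as 102.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Hypothetical layout; field names follow the list above. */
struct trampfd_info {
        unsigned int  ntrampolines;   /* trampolines per trampfd */
        unsigned int  code_size;      /* bytes per trampoline */
        unsigned long code_offset;    /* mmap() offset of the code page */
        unsigned long reserved;
};

int main(void)
{
        struct trampfd_info info;
        int trampfd = syscall(440, &info);

        if (trampfd < 0) {
                perror("trampfd");
                return 1;
        }
        printf("%u trampolines of %u bytes each\n",
               info.ntrampolines, info.code_size);
        return 0;
}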

Initialize trampfd
==

A trampfd is initialized in this manner:

struct trampfd_code code;
struct trampfd_data data;

/*
 * Code descriptor.
 */
code.ntrampolines = number of desired trampolines;
code.reg = code register name;
code.table = array of code addresses

/*
 * Data descriptor.
 */
data.reg = data register name;
data.table = array of data addresses
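
Continuing the sketch, a helper that pushes the two descriptors into the
trampfd and maps the single page of generated trampolines might look as
follows. This is a guess at the calling sequence based on the summary in
the cover letter (write the descriptors, then mmap() at info.code_offset);
the mmap() flags in particular are assumptions.

#include <err.h>
#include <unistd.h>
#include <sys/mman.h>

/* Hypothetical helper; code/data are the descriptors filled in above. */
static void *trampfd_setup(int trampfd, struct trampfd_code *code,
                           struct trampfd_data *data, long code_offset)
{
        void *table;

        if (write(trampfd, code, sizeof(*code)) < 0)
                err(1, "trampfd code descriptor");
        if (write(trampfd, data, sizeof(*data)) < 0)
                err(1, "trampfd data descriptor");

        /* Each mmap() maps a single base page of trampolines. */
        table = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ | PROT_EXEC,
                     MAP_PRIVATE, trampfd, code_offset);
        if (table == MAP_FAILED)
                err(1, "trampfd mmap");
        return table;
}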


[PATCH v2 0/4] [RFC] Implement Trampoline File Descriptor

2020-09-22 Thread madvenka
can be signed. The
kernel can use signature verification to authenticate the binary and
allow the code to be mapped and executed.

The problem is that the static code has to be able to find the data that it
needs when it executes. For functions, the ABI defines the way to pass
parameters. But, for arbitrary dynamic code, there isn't a standard ABI
compliant way to pass data to the code for most architectures. Each instance
of dynamic code defines its own way. For instance, co-location of code and
data and PC-relative data referencing are used in cases where the ISA
supports it.

We need one standard way that would work for all architectures and ABIs.

The solution proposed here is:

1. Write the static code assuming that the data needed by the code is already
   pointed to by a designated register.

2. Get the kernel to supply a small universal trampoline that does the
   following:

- Load the address of the data in a designated register
- Load the address of the static code in a designated register
- Jump to the static code

User code would use a kernel supplied API to create and map the trampoline.
The address values would be baked into the code so that no special ISA
features are needed.

To conserve memory, the kernel will pack as many trampolines as possible in
a page and provide a trampoline table to user code. The table itself is
managed by the user.

Trampoline File Descriptor (trampfd)
====================================

I am proposing a kernel API using anonymous file descriptors that can be
used to create the trampolines. The API is described in patch 1/4 of this
patchset. I provide a summary here:

- Create a trampoline file object

- Write a code descriptor into the trampoline file and specify:

- the number of trampolines desired
- the name of the code register
- user pointer to a table of code addresses, one address
  per trampoline

- Write a data descriptor into the trampoline file and specify:

- the name of the data register
- user pointer to a table of data addresses, one address
  per trampoline

- mmap() the trampoline file. The kernel generates a table of
  trampolines in a page and returns the trampoline table address

- munmap() a trampoline file mapping

- Close the trampoline file

Each mmap() will only map a single base page. Large pages are not supported.

A trampoline file can only be mapped once in an address space.

Trampoline file mappings cannot be shared across address spaces. So,
sending the trampoline file descriptor over a unix domain socket and
mapping it in another process will not work.

It is recommended that the code descriptor and the code table be placed
in the .rodata section so an attacker cannot modify them.

Trampoline use and reuse


The code for trampoline X in the trampoline table is:

load    &code_table[X], code_reg
load    (code_reg), code_reg
load    &data_table[X], data_reg
load    (data_reg), data_reg
jump    code_reg

The addresses &code_table[X] and &data_table[X] are baked into the
trampoline code. So, PC-relative data references are not needed. The user
can modify code_table[X] and data_table[X] dynamically.

For instance, within libffi, the same trampoline X can be used for different
closures at different times by setting:

data_table[X] = closure;
code_table[X] = ABI handling code;
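
As an illustration of how a caller might then use entry X (the cast and
the entry arithmetic are assumptions; "table" is the address returned by
mmap() of the trampfd, "code_size" is the per-trampoline size from
trampfd_info, and "abi_handler" stands in for the ABI handling code):

        /* Update the tables, then call trampoline X. */
        typedef void (*tramp_fn)(void);
        tramp_fn fn;

        data_table[X] = (unsigned long)closure;      /* ends up in data_reg */
        code_table[X] = (unsigned long)abi_handler;  /* ends up in code_reg */

        fn = (tramp_fn)((char *)table + X * code_size);
        fn();   /* loads data_reg/code_reg from the tables, jumps to handler */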

Advantages of the Trampoline File Descriptor approach
======================================================

- Using this support from the kernel, dynamic code can be converted to
  static code with a little effort so applications and libraries can move to
  a more secure model. In the simplest cases such as libffi, dynamic code can
  even be eliminated.

- This initial work is targeted towards X86 and ARM. But it can be supported
  easily on all architectures. We don't need any special ISA features such
  as PC-relative data referencing.

- The only code generation needed is for this small, universal trampoline.

- The kernel does not have to deal with any ABI issues in the generation of
  this trampoline.

- The kernel provides a trampoline table to conserve memory.

- An SELinux setting called "exectramp" can be implemented along the
  lines of "execmem", "execstack" and "execheap" to selectively allow the
  use of trampolines on a per application basis.

- In version 1, a trip to the kernel was required to execute the trampoline.
  In version 2, that is not required. So, there are no performance
  concerns in this approach.

libffi
==

I have implemented my solution for libffi and provided the changes for
X86 and ARM, 32-bit and 64-bit. Here is the reference patch:

http://linux.microsoft.com/~madvenka/libffi/libffi.v2.txt

If the trampfd patchset gets accepted, I will send the lib

[PATCH v2 3/4] [RFC] arm64/trampfd: Provide support for the trampoline file descriptor

2020-09-22 Thread madvenka
From: "Madhavan T. Venkataraman" 

- Define architecture specific register names
- Architecture specific functions for:
- system call init
- code descriptor check
- data descriptor check
- Fill a page with a trampoline table for:
- 32-bit user process
- 64-bit user process

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/include/asm/unistd.h  |   2 +-
 arch/arm64/include/asm/unistd32.h|   2 +
 arch/arm64/include/uapi/asm/ptrace.h |  59 +++
 arch/arm64/kernel/Makefile   |   2 +
 arch/arm64/kernel/trampfd.c  | 244 +++
 5 files changed, 308 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/kernel/trampfd.c

diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 3b859596840d..b3b2019f8d16 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -38,7 +38,7 @@
 #define __ARM_NR_compat_set_tls(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls   440
+#define __NR_compat_syscalls   441
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h 
b/arch/arm64/include/asm/unistd32.h
index 6d95d0c8bf2f..c0493c5322d9 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -885,6 +885,8 @@ __SYSCALL(__NR_openat2, sys_openat2)
 __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
 #define __NR_faccessat2 439
 __SYSCALL(__NR_faccessat2, sys_faccessat2)
+#define __NR_trampfd 440
+__SYSCALL(__NR_trampfd, sys_trampfd)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/arm64/include/uapi/asm/ptrace.h 
b/arch/arm64/include/uapi/asm/ptrace.h
index 42cbe34d95ce..2778789c1cbe 100644
--- a/arch/arm64/include/uapi/asm/ptrace.h
+++ b/arch/arm64/include/uapi/asm/ptrace.h
@@ -88,6 +88,65 @@ struct user_pt_regs {
__u64   pstate;
 };
 
+/*
+ * These register names are to be used by 32-bit applications.
+ */
+enum reg_32_name {
+   arm_min,
+   arm_r0 = arm_min,
+   arm_r1,
+   arm_r2,
+   arm_r3,
+   arm_r4,
+   arm_r5,
+   arm_r6,
+   arm_r7,
+   arm_r8,
+   arm_r9,
+   arm_r10,
+   arm_r11,
+   arm_r12,
+   arm_max,
+};
+
+/*
+ * These register names are to be used by 64-bit applications.
+ */
+enum reg_64_name {
+   arm64_min = arm_max,
+   arm64_r0 = arm64_min,
+   arm64_r1,
+   arm64_r2,
+   arm64_r3,
+   arm64_r4,
+   arm64_r5,
+   arm64_r6,
+   arm64_r7,
+   arm64_r8,
+   arm64_r9,
+   arm64_r10,
+   arm64_r11,
+   arm64_r12,
+   arm64_r13,
+   arm64_r14,
+   arm64_r15,
+   arm64_r16,
+   arm64_r17,
+   arm64_r18,
+   arm64_r19,
+   arm64_r20,
+   arm64_r21,
+   arm64_r22,
+   arm64_r23,
+   arm64_r24,
+   arm64_r25,
+   arm64_r26,
+   arm64_r27,
+   arm64_r28,
+   arm64_r29,
+   arm64_max,
+};
+
 struct user_fpsimd_state {
__uint128_t vregs[32];
__u32   fpsr;
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index a561cbb91d4d..18d373fb1208 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -71,3 +71,5 @@ extra-y   += $(head-y) 
vmlinux.lds
 ifeq ($(CONFIG_DEBUG_EFI),y)
 AFLAGS_head.o += -DVMLINUX_PATH="\"$(realpath $(objtree)/vmlinux)\""
 endif
+
+obj-$(CONFIG_TRAMPFD)  += trampfd.o
diff --git a/arch/arm64/kernel/trampfd.c b/arch/arm64/kernel/trampfd.c
new file mode 100644
index ..3b40ebb12907
--- /dev/null
+++ b/arch/arm64/kernel/trampfd.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Trampoline FD - ARM64 support.
+ *
+ * Author: Madhavan T. Venkataraman (madve...@linux.microsoft.com)
+ *
+ * Copyright (c) 2020, Microsoft Corporation.
+ */
+
+#include 
+#include 
+#include 
+
+#define TRAMPFD_CODE_32_SIZE   28
+#define TRAMPFD_CODE_64_SIZE   48
+
+static inline bool is_compat(void)
+{
+   return is_compat_thread(task_thread_info(current));
+}
+
+/*
+ * trampfd syscall.
+ */
+void trampfd_arch(struct trampfd_info *info)
+{
+   if (is_compat())
+   info->code_size = TRAMPFD_CODE_32_SIZE;
+   else
+   info->code_size = TRAMPFD_CODE_64_SIZE;
+   info->ntrampolines = PAGE_SIZE / info->code_size;
+   info->code_offset = TRAMPFD_CODE_PGOFF << PAGE_SHIFT;
+   info->reserved = 0;
+}
+
+/*
+ * trampfd code descriptor check.
+ */
+int trampfd_code_arch(struct trampfd_code *code)
+{
+   int ntrampolines;
+   int min, max;
+
+   if (is_compat()) {
+   min = arm_min;
+   max = arm_max;
+   ntrampolines = PAGE_SIZE / TRAMPFD_CODE_32_SIZE;
+   

[PATCH v2 1/4] [RFC] fs/trampfd: Implement the trampoline file descriptor API

2020-09-16 Thread madvenka
From: "Madhavan T. Venkataraman" 

Introduction


Dynamic code is used in many different user applications. Dynamic code is
often generated at runtime. Dynamic code can also just be a pre-defined
sequence of machine instructions in a data buffer. Examples of dynamic
code are trampolines, JIT code, DBT code, etc.

Dynamic code is placed either in a data page or in a stack page. In order
to execute dynamic code, the page it resides in needs to be mapped with
execute permissions. Writable pages with execute permissions provide an
attack surface for hackers. Attackers can use this to inject malicious
code, modify existing code or do other harm.

To mitigate this, LSMs such as SELinux implement W^X. That is, they may not
allow pages to have both write and execute permissions. This prevents
dynamic code from executing and blocks applications that use it. To allow
genuine applications to run, exceptions have to be made for them (by setting
execmem, etc) which opens the door to security issues.

The W^X implementation today is not complete. There exist many user level
tricks that can be used to load and execute dynamic code. E.g.,

- Load the code into a file and map the file with R-X.

- Load the code in an RW- page. Change the permissions to R--. Then,
  change the permissions to R-X.

- Load the code in an RW- page. Remap the page with R-X to get a separate
  mapping to the same underlying physical page.

IMO, these are all security holes as an attacker can exploit them to inject
his own code.

In the future, these holes will definitely be closed. For instance, LSMs
(such as the IPE proposal [1]) may only allow code in properly signed object
files to be mapped with execute permissions. This will do two things:

- user level tricks using anonymous pages will fail as anonymous
  pages have no file identity

- loading the code in a temporary file and mapping it with R-X
  will fail as the temporary file would not have a signature

We need a way to execute such code without making security exceptions.
Trampolines are a good example of dynamic code. A couple of examples
of trampolines are given below. My first use case for this RFC is
libffi.

Solution


The solution is to convert dynamic code to static code and place it in a
source file. The binary generated from the source can be signed. The kernel
can use signature verification to authenticate the binary and allow the code
to be mapped and executed.

The problem is that the static code has to be able to find the data that it
needs when it executes. For functions, the ABI defines the way to pass
parameters. But, for arbitrary dynamic code, there isn't a standard ABI
compliant way to pass data to the code for most architectures. Each instance
of dynamic code defines its own way. For instance, co-location of code and
data and PC-relative data referencing are used in cases where the ISA
supports it.

We need one standard way that would work for all architectures and ABIs.

The solution has two parts:

1. The maintainer of the code writes the static code assuming that the data
   needed by the code is already pointed to by a designated register.

2. The kernel supplies a small universal trampoline that does the following:

- Load the address of the data in a designated register
- Load the address of the static code in a designated register
- Jump to the static code

User code would use a kernel supplied API to create and map the trampoline.
The address values would be baked into the code so that no special ISA
features are needed.

To conserve memory, the kernel will pack as many trampolines as possible in
a page and provide a trampoline table to user code. The table itself is
managed by the user.

Kernel API
==

A kernel API based on anonymous file descriptors is defined to create
trampolines. The following sections describe the API.

Create trampfd
==

This feature introduces a new trampfd system call.

struct trampfd_info info;
int trampfd;

trampfd = syscall(440, &info);

The kernel creates a trampoline file object and returns the following items
in info:

ntrampolines
The number of trampolines that can be created with one trampfd. The
user may create fewer trampolines if he wishes.

code_size
The size of each trampoline.

code_offset
The file offset to be used in mmap() to map the trampoline code.

Initialize trampfd
==

A trampfd is initialized in this manner:

struct trampfd_code code;
struct trampfd_data data;

/*
 * Code descriptor.
 */
code.ntrampolines = number of desired trampolines;
code.reg = code register name;
code.table = array of code addresses

/*
 * Data descriptor.
 */
data.reg = data register name;
data.table = array of data addresses


[PATCH v2 3/4] [RFC] arm64/trampfd: Provide support for the trampoline file descriptor

2020-09-16 Thread madvenka
From: "Madhavan T. Venkataraman" 

- Define architecture specific register names
- Architecture specific functions for:
- system call init
- code descriptor check
- data descriptor check
- Fill a page with a trampoline table for:
- 32-bit user process
- 64-bit user process

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/include/asm/unistd.h  |   2 +-
 arch/arm64/include/asm/unistd32.h|   2 +
 arch/arm64/include/uapi/asm/ptrace.h |  59 +++
 arch/arm64/kernel/Makefile   |   2 +
 arch/arm64/kernel/trampfd.c  | 244 +++
 5 files changed, 308 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/kernel/trampfd.c

diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 3b859596840d..b3b2019f8d16 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -38,7 +38,7 @@
 #define __ARM_NR_compat_set_tls(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls   440
+#define __NR_compat_syscalls   441
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h 
b/arch/arm64/include/asm/unistd32.h
index 6d95d0c8bf2f..c0493c5322d9 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -885,6 +885,8 @@ __SYSCALL(__NR_openat2, sys_openat2)
 __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
 #define __NR_faccessat2 439
 __SYSCALL(__NR_faccessat2, sys_faccessat2)
+#define __NR_trampfd 440
+__SYSCALL(__NR_trampfd, sys_trampfd)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/arm64/include/uapi/asm/ptrace.h 
b/arch/arm64/include/uapi/asm/ptrace.h
index 42cbe34d95ce..2778789c1cbe 100644
--- a/arch/arm64/include/uapi/asm/ptrace.h
+++ b/arch/arm64/include/uapi/asm/ptrace.h
@@ -88,6 +88,65 @@ struct user_pt_regs {
__u64   pstate;
 };
 
+/*
+ * These register names are to be used by 32-bit applications.
+ */
+enum reg_32_name {
+   arm_min,
+   arm_r0 = arm_min,
+   arm_r1,
+   arm_r2,
+   arm_r3,
+   arm_r4,
+   arm_r5,
+   arm_r6,
+   arm_r7,
+   arm_r8,
+   arm_r9,
+   arm_r10,
+   arm_r11,
+   arm_r12,
+   arm_max,
+};
+
+/*
+ * These register names are to be used by 64-bit applications.
+ */
+enum reg_64_name {
+   arm64_min = arm_max,
+   arm64_r0 = arm64_min,
+   arm64_r1,
+   arm64_r2,
+   arm64_r3,
+   arm64_r4,
+   arm64_r5,
+   arm64_r6,
+   arm64_r7,
+   arm64_r8,
+   arm64_r9,
+   arm64_r10,
+   arm64_r11,
+   arm64_r12,
+   arm64_r13,
+   arm64_r14,
+   arm64_r15,
+   arm64_r16,
+   arm64_r17,
+   arm64_r18,
+   arm64_r19,
+   arm64_r20,
+   arm64_r21,
+   arm64_r22,
+   arm64_r23,
+   arm64_r24,
+   arm64_r25,
+   arm64_r26,
+   arm64_r27,
+   arm64_r28,
+   arm64_r29,
+   arm64_max,
+};
+
 struct user_fpsimd_state {
__uint128_t vregs[32];
__u32   fpsr;
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index a561cbb91d4d..18d373fb1208 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -71,3 +71,5 @@ extra-y   += $(head-y) 
vmlinux.lds
 ifeq ($(CONFIG_DEBUG_EFI),y)
 AFLAGS_head.o += -DVMLINUX_PATH="\"$(realpath $(objtree)/vmlinux)\""
 endif
+
+obj-$(CONFIG_TRAMPFD)  += trampfd.o
diff --git a/arch/arm64/kernel/trampfd.c b/arch/arm64/kernel/trampfd.c
new file mode 100644
index ..3b40ebb12907
--- /dev/null
+++ b/arch/arm64/kernel/trampfd.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Trampoline FD - ARM64 support.
+ *
+ * Author: Madhavan T. Venkataraman (madve...@linux.microsoft.com)
+ *
+ * Copyright (c) 2020, Microsoft Corporation.
+ */
+
+#include 
+#include 
+#include 
+
+#define TRAMPFD_CODE_32_SIZE   28
+#define TRAMPFD_CODE_64_SIZE   48
+
+static inline bool is_compat(void)
+{
+   return is_compat_thread(task_thread_info(current));
+}
+
+/*
+ * trampfd syscall.
+ */
+void trampfd_arch(struct trampfd_info *info)
+{
+   if (is_compat())
+   info->code_size = TRAMPFD_CODE_32_SIZE;
+   else
+   info->code_size = TRAMPFD_CODE_64_SIZE;
+   info->ntrampolines = PAGE_SIZE / info->code_size;
+   info->code_offset = TRAMPFD_CODE_PGOFF << PAGE_SHIFT;
+   info->reserved = 0;
+}
+
+/*
+ * trampfd code descriptor check.
+ */
+int trampfd_code_arch(struct trampfd_code *code)
+{
+   int ntrampolines;
+   int min, max;
+
+   if (is_compat()) {
+   min = arm_min;
+   max = arm_max;
+   ntrampolines = PAGE_SIZE / TRAMPFD_CODE_32_SIZE;
+   

[PATCH v2 0/4] [RFC] Implement Trampoline File Descriptor

2020-09-16 Thread madvenka
can be signed. The
kernel can use signature verification to authenticate the binary and
allow the code to be mapped and executed.

The problem is that the static code has to be able to find the data that it
needs when it executes. For functions, the ABI defines the way to pass
parameters. But, for arbitrary dynamic code, there isn't a standard ABI
compliant way to pass data to the code for most architectures. Each instance
of dynamic code defines its own way. For instance, co-location of code and
data and PC-relative data referencing are used in cases where the ISA
supports it.

We need one standard way that would work for all architectures and ABIs.

The solution proposed here is:

1. Write the static code assuming that the data needed by the code is already
   pointed to by a designated register.

2. Get the kernel to supply a small universal trampoline that does the
   following:

- Load the address of the data in a designated register
- Load the address of the static code in a designated register
- Jump to the static code

User code would use a kernel supplied API to create and map the trampoline.
The address values would be baked into the code so that no special ISA
features are needed.

To conserve memory, the kernel will pack as many trampolines as possible in
a page and provide a trampoline table to user code. The table itself is
managed by the user.

Trampoline File Descriptor (trampfd)
====================================

I am proposing a kernel API using anonymous file descriptors that can be
used to create the trampolines. The API is described in patch 1/4 of this
patchset. I provide a summary here:

- Create a trampoline file object

- Write a code descriptor into the trampoline file and specify:

- the number of trampolines desired
- the name of the code register
- user pointer to a table of code addresses, one address
  per trampoline

- Write a data descriptor into the trampoline file and specify:

- the name of the data register
- user pointer to a table of data addresses, one address
  per trampoline

- mmap() the trampoline file. The kernel generates a table of
  trampolines in a page and returns the trampoline table address

- munmap() a trampoline file mapping

- Close the trampoline file

Each mmap() will only map a single base page. Large pages are not supported.

A trampoline file can only be mapped once in an address space.

Trampoline file mappings cannot be shared across address spaces. So,
sending the trampoline file descriptor over a unix domain socket and
mapping it in another process will not work.

It is recommended that the code descriptor and the code table be placed
in the .rodata section so an attacker cannot modify them.

Trampoline use and reuse


The code for trampoline X in the trampoline table is:

load    &code_table[X], code_reg
load    (code_reg), code_reg
load    &data_table[X], data_reg
load    (data_reg), data_reg
jump    code_reg

The addresses &code_table[X] and &data_table[X] are baked into the
trampoline code. So, PC-relative data references are not needed. The user
can modify code_table[X] and data_table[X] dynamically.

For instance, within libffi, the same trampoline X can be used for different
closures at different times by setting:

data_table[X] = closure;
code_table[X] = ABI handling code;

Advantages of the Trampoline File Descriptor approach
======================================================

- Using this support from the kernel, dynamic code can be converted to
  static code with a little effort so applications and libraries can move to
  a more secure model. In the simplest cases such as libffi, dynamic code can
  even be eliminated.

- This initial work is targeted towards X86 and ARM. But it can be supported
  easily on all architectures. We don't need any special ISA features such
  as PC-relative data referencing.

- The only code generation needed is for this small, universal trampoline.

- The kernel does not have to deal with any ABI issues in the generation of
  this trampoline.

- The kernel provides a trampoline table to conserve memory.

- An SELinux setting called "exectramp" can be implemented along the
  lines of "execmem", "execstack" and "execheap" to selectively allow the
  use of trampolines on a per application basis.

- In version 1, a trip to the kernel was required to execute the trampoline.
  In version 2, that is not required. So, there are no performance
  concerns in this approach.

libffi
==

I have implemented my solution for libffi and provided the changes for
X86 and ARM, 32-bit and 64-bit. Here is the reference patch:

http://linux.microsoft.com/~madvenka/libffi/libffi.v2.txt

If the trampfd patchset gets accepted, I will send the lib

[PATCH v2 2/4] [RFC] x86/trampfd: Provide support for the trampoline file descriptor

2020-09-16 Thread madvenka
From: "Madhavan T. Venkataraman" 

- Define architecture specific register names
- Architecture specific functions for:
- system call init
- code descriptor check
- data descriptor check
- Fill a page with a trampoline table for:
- 32-bit user process
- 64-bit user process

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/x86/entry/syscalls/syscall_32.tbl |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 arch/x86/include/uapi/asm/ptrace.h |  38 
 arch/x86/kernel/Makefile   |   1 +
 arch/x86/kernel/trampfd.c  | 238 +
 5 files changed, 279 insertions(+)
 create mode 100644 arch/x86/kernel/trampfd.c

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl 
b/arch/x86/entry/syscalls/syscall_32.tbl
index d8f8a1a69ed1..d4f17806c9ab 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -443,3 +443,4 @@
 437i386openat2 sys_openat2
 438i386pidfd_getfd sys_pidfd_getfd
 439i386faccessat2  sys_faccessat2
+440i386trampfd sys_trampfd
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
b/arch/x86/entry/syscalls/syscall_64.tbl
index 78847b32e137..91b37bc4b6f0 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -360,6 +360,7 @@
 437common  openat2 sys_openat2
 438common  pidfd_getfd sys_pidfd_getfd
 439common  faccessat2  sys_faccessat2
+440common  trampfd sys_trampfd
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/include/uapi/asm/ptrace.h 
b/arch/x86/include/uapi/asm/ptrace.h
index 85165c0edafc..b4be362929b3 100644
--- a/arch/x86/include/uapi/asm/ptrace.h
+++ b/arch/x86/include/uapi/asm/ptrace.h
@@ -9,6 +9,44 @@
 
 #ifndef __ASSEMBLY__
 
+/*
+ * These register names are to be used by 32-bit applications.
+ */
+enum reg_32_name {
+   x32_min = 0,
+   x32_eax = x32_min,
+   x32_ebx,
+   x32_ecx,
+   x32_edx,
+   x32_esi,
+   x32_edi,
+   x32_ebp,
+   x32_max,
+};
+
+/*
+ * These register names are to be used by 64-bit applications.
+ */
+enum reg_64_name {
+   x64_min = x32_max,
+   x64_rax = x64_min,
+   x64_rbx,
+   x64_rcx,
+   x64_rdx,
+   x64_rsi,
+   x64_rdi,
+   x64_rbp,
+   x64_r8,
+   x64_r9,
+   x64_r10,
+   x64_r11,
+   x64_r12,
+   x64_r13,
+   x64_r14,
+   x64_r15,
+   x64_max,
+};
+
 #ifdef __i386__
 /* this struct defines the way the registers are stored on the
stack during a system call. */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index e77261db2391..feb7f4f311fd 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -157,3 +157,4 @@ ifeq ($(CONFIG_X86_64),y)
 endif
 
 obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT)   += ima_arch.o
+obj-$(CONFIG_TRAMPFD)  += trampfd.o
diff --git a/arch/x86/kernel/trampfd.c b/arch/x86/kernel/trampfd.c
new file mode 100644
index ..7b812c200d01
--- /dev/null
+++ b/arch/x86/kernel/trampfd.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Trampoline FD - X86 support.
+ *
+ * Author: Madhavan T. Venkataraman (madve...@linux.microsoft.com)
+ *
+ * Copyright (c) 2020, Microsoft Corporation.
+ */
+
+#include 
+#include 
+
+#define TRAMPFD_CODE_32_SIZE   24
+#define TRAMPFD_CODE_64_SIZE   40
+
+static inline bool is_compat(void)
+{
+   return (IS_ENABLED(CONFIG_X86_32) ||
+   (IS_ENABLED(CONFIG_COMPAT) && test_thread_flag(TIF_ADDR32)));
+}
+
+/*
+ * trampfd syscall.
+ */
+void trampfd_arch(struct trampfd_info *info)
+{
+   if (is_compat())
+   info->code_size = TRAMPFD_CODE_32_SIZE;
+   else
+   info->code_size = TRAMPFD_CODE_64_SIZE;
+   info->ntrampolines = PAGE_SIZE / info->code_size;
+   info->code_offset = TRAMPFD_CODE_PGOFF << PAGE_SHIFT;
+   info->reserved = 0;
+}
+
+/*
+ * trampfd code descriptor check.
+ */
+int trampfd_code_arch(struct trampfd_code *code)
+{
+   int ntrampolines;
+   int min, max;
+
+   if (is_compat()) {
+   min = x32_min;
+   max = x32_max;
+   ntrampolines = PAGE_SIZE / TRAMPFD_CODE_32_SIZE;
+   } else {
+   min = x64_min;
+   max = x64_max;
+   ntrampolines = PAGE_SIZE / TRAMPFD_CODE_64_SIZE;
+   }
+
+   if (code->reg < min || code->reg >= max)
+   return -EINVAL;
+
+   if (!code->ntrampolines || code->ntrampolines > ntrampolines)
+   return -EINVAL;
+   return 0;
+}
+
+/*
+ * trampfd data descriptor check.
+ */
+int trampfd_data_arch(struct trampfd_data *data)
+{
+   int min, max;

[PATCH v2 4/4] [RFC] arm/trampfd: Provide support for the trampoline file descriptor

2020-09-16 Thread madvenka
From: "Madhavan T. Venkataraman" 

- Define architecture specific register names
- Architecture specific functions for:
- system call init
- code descriptor check
- data descriptor check
- Fill a page with a trampoline table,

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm/include/uapi/asm/ptrace.h |  21 +
 arch/arm/kernel/Makefile   |   1 +
 arch/arm/kernel/trampfd.c  | 124 +
 arch/arm/tools/syscall.tbl |   1 +
 4 files changed, 147 insertions(+)
 create mode 100644 arch/arm/kernel/trampfd.c

diff --git a/arch/arm/include/uapi/asm/ptrace.h 
b/arch/arm/include/uapi/asm/ptrace.h
index e61c65b4018d..598047768f9b 100644
--- a/arch/arm/include/uapi/asm/ptrace.h
+++ b/arch/arm/include/uapi/asm/ptrace.h
@@ -151,6 +151,27 @@ struct pt_regs {
 #define ARM_r0 uregs[0]
 #define ARM_ORIG_r0uregs[17]
 
+/*
+ * These register names are to be used by 32-bit applications.
+ */
+enum reg_32_name {
+   arm_min,
+   arm_r0 = arm_min,
+   arm_r1,
+   arm_r2,
+   arm_r3,
+   arm_r4,
+   arm_r5,
+   arm_r6,
+   arm_r7,
+   arm_r8,
+   arm_r9,
+   arm_r10,
+   arm_r11,
+   arm_r12,
+   arm_max,
+};
+
 /*
  * The size of the user-visible VFP state as seen by PTRACE_GET/SETVFPREGS
  * and core dumps.
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index 89e5d864e923..652c54c2f19a 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -105,5 +105,6 @@ obj-$(CONFIG_SMP)   += psci_smp.o
 endif
 
 obj-$(CONFIG_HAVE_ARM_SMCCC)   += smccc-call.o
+obj-$(CONFIG_TRAMPFD)  += trampfd.o
 
 extra-y := $(head-y) vmlinux.lds
diff --git a/arch/arm/kernel/trampfd.c b/arch/arm/kernel/trampfd.c
new file mode 100644
index ..45146ed489e8
--- /dev/null
+++ b/arch/arm/kernel/trampfd.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Trampoline FD - ARM support.
+ *
+ * Author: Madhavan T. Venkataraman (madve...@linux.microsoft.com)
+ *
+ * Copyright (c) 2020, Microsoft Corporation.
+ */
+
+#include 
+#include 
+
+#define TRAMPFD_CODE_SIZE  28
+
+/*
+ * trampfd syscall.
+ */
+void trampfd_arch(struct trampfd_info *info)
+{
+   info->code_size = TRAMPFD_CODE_SIZE;
+   info->ntrampolines = PAGE_SIZE / info->code_size;
+   info->code_offset = TRAMPFD_CODE_PGOFF << PAGE_SHIFT;
+   info->reserved = 0;
+}
+
+/*
+ * trampfd code descriptor check.
+ */
+int trampfd_code_arch(struct trampfd_code *code)
+{
+   int ntrampolines;
+   int min, max;
+
+   min = arm_min;
+   max = arm_max;
+   ntrampolines = PAGE_SIZE / TRAMPFD_CODE_SIZE;
+
+   if (code->reg < min || code->reg >= max)
+   return -EINVAL;
+
+   if (!code->ntrampolines || code->ntrampolines > ntrampolines)
+   return -EINVAL;
+   return 0;
+}
+
+/*
+ * trampfd data descriptor check.
+ */
+int trampfd_data_arch(struct trampfd_data *data)
+{
+   int min, max;
+
+   min = arm_min;
+   max = arm_max;
+
+   if (data->reg < min || data->reg >= max)
+   return -EINVAL;
+   return 0;
+}
+
+#define MOVW(ins, reg, imm32)  \
+{  \
+   u16 *_imm16 = (u16 *) &(imm32); /* little endian */ \
+   int _hw, _opcode;   \
+   \
+   for (_hw = 0; _hw < 2; _hw++) { \
+   /* movw or movt */  \
+   _opcode = _hw ? 0xe3400000 : 0xe3000000;\
+   *ins++ = _opcode | (_imm16[_hw] >> 12) << 16 |  \
+(reg) << 12 | (_imm16[_hw] & 0xFFF);   \
+   }   \
+}
+
+#define LDR(ins, reg)  \
+{  \
+   *ins++ = 0xe5900000 | (reg) << 16 | (reg) << 12;\
+}
+
+#define BX(ins, reg)   \
+{  \
+   *ins++ = 0xe12fff10 | (reg);\
+}
+
+void trampfd_code_fill(struct trampfd *trampfd, char *addr)
+{
+   char*eaddr = addr + PAGE_SIZE;
+   int creg = trampfd->code_reg - arm_min;
+   int dreg = trampfd->data_reg - arm_min;
+   u32 *code = trampfd->code;
+   u32 *data = trampfd->data;
+   u32 *instruction = (u32 *) addr;
+   int i;
+
+   for (i = 0; i < trampfd->ntrampolines; i++, code++, 
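
Piecing these macros together with the cover letter's description, each
generated 32-bit trampoline presumably looks like the listing below (an
inference for illustration only; the fill loop above is truncated in this
archive):

        movw/movt  creg, #&code_table[X]    2 instructions (MOVW)
        ldr        creg, [creg]             1 instruction  (LDR)
        movw/movt  dreg, #&data_table[X]    2 instructions (MOVW)
        ldr        dreg, [dreg]             1 instruction  (LDR)
        bx         creg                     1 instruction  (BX)

That is 7 instructions * 4 bytes = 28 bytes, which matches
TRAMPFD_CODE_SIZE above.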

[PATCH v1 4/4] [RFC] arm/trampfd: Provide support for the trampoline file descriptor

2020-07-28 Thread madvenka
From: "Madhavan T. Venkataraman" 

Implement 32-bit ARM support for the trampoline file descriptor.

- Define architecture specific register names
- Handle the trampoline invocation page fault
- Setup the user register context on trampoline invocation
- Setup the user stack context on trampoline invocation

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm/include/uapi/asm/ptrace.h |  20 +++
 arch/arm/kernel/Makefile   |   1 +
 arch/arm/kernel/trampfd.c  | 214 +
 arch/arm/mm/fault.c|  12 +-
 arch/arm/tools/syscall.tbl |   1 +
 5 files changed, 246 insertions(+), 2 deletions(-)
 create mode 100644 arch/arm/kernel/trampfd.c

diff --git a/arch/arm/include/uapi/asm/ptrace.h 
b/arch/arm/include/uapi/asm/ptrace.h
index e61c65b4018d..47b1c5e2f32c 100644
--- a/arch/arm/include/uapi/asm/ptrace.h
+++ b/arch/arm/include/uapi/asm/ptrace.h
@@ -151,6 +151,26 @@ struct pt_regs {
 #define ARM_r0 uregs[0]
 #define ARM_ORIG_r0uregs[17]
 
+/*
+ * These register names are to be used by 32-bit applications.
+ */
+enum reg_32_name {
+   arm_r0,
+   arm_r1,
+   arm_r2,
+   arm_r3,
+   arm_r4,
+   arm_r5,
+   arm_r6,
+   arm_r7,
+   arm_r8,
+   arm_r9,
+   arm_r10,
+   arm_ip,
+   arm_pc,
+   arm_max,
+};
+
 /*
  * The size of the user-visible VFP state as seen by PTRACE_GET/SETVFPREGS
  * and core dumps.
diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index 89e5d864e923..652c54c2f19a 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -105,5 +105,6 @@ obj-$(CONFIG_SMP)   += psci_smp.o
 endif
 
 obj-$(CONFIG_HAVE_ARM_SMCCC)   += smccc-call.o
+obj-$(CONFIG_TRAMPFD)  += trampfd.o
 
 extra-y := $(head-y) vmlinux.lds
diff --git a/arch/arm/kernel/trampfd.c b/arch/arm/kernel/trampfd.c
new file mode 100644
index ..50fc5706e85b
--- /dev/null
+++ b/arch/arm/kernel/trampfd.c
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Trampoline File Descriptor - ARM support.
+ *
+ * Author: Madhavan T. Venkataraman (madve...@linux.microsoft.com)
+ *
+ * Copyright (c) 2020, Microsoft Corporation.
+ */
+
+#include 
+#include 
+#include 
+
+/*  Register Context  
*/
+
+static void set_reg(long *uregs, u32 name, u64 value)
+{
+   switch (name) {
+   case arm_r0:
+   case arm_r1:
+   case arm_r2:
+   case arm_r3:
+   case arm_r4:
+   case arm_r5:
+   case arm_r6:
+   case arm_r7:
+   case arm_r8:
+   case arm_r9:
+   case arm_r10:
+   uregs[name] = (__u64)value;
+   break;
+   case arm_ip:
+   ARM_ip = (__u64)value;
+   break;
+   case arm_pc:
+   ARM_pc = (__u64)value;
+   break;
+   default:
+   WARN(1, "%s: Illegal register name %d\n", __func__, name);
+   break;
+   }
+}
+
+static void set_regs(long *uregs, struct trampfd_regs *tregs)
+{
+   struct trampfd_reg  *reg = tregs->regs;
+   struct trampfd_reg  *reg_end = reg + tregs->nregs;
+
+   for (; reg < reg_end; reg++)
+   set_reg(uregs, reg->name, reg->value);
+}
+
+/*
+ * Check if the register names are valid. Check if the user PC has been set.
+ */
+bool trampfd_valid_regs(struct trampfd_regs *tregs)
+{
+   struct trampfd_reg  *reg = tregs->regs;
+   struct trampfd_reg  *reg_end = reg + tregs->nregs;
+   boolpc_set = false;
+
+   for (; reg < reg_end; reg++) {
+   if (reg->name >= arm_max || reg->reserved)
+   return false;
+   if (reg->name == arm_pc && reg->value)
+   pc_set = true;
+   }
+   return pc_set;
+}
+EXPORT_SYMBOL_GPL(trampfd_valid_regs);
+
+/*
+ * Check if the PC specified in a register context is allowed.
+ */
+bool trampfd_allowed_pc(struct trampfd *trampfd, struct trampfd_regs *tregs)
+{
+   struct trampfd_reg  *reg = tregs->regs;
+   struct trampfd_reg  *reg_end = reg + tregs->nregs;
+   struct trampfd_values   *allowed_pcs = trampfd->allowed_pcs;
+   u64 *allowed_values, pc_value = 0;
+   u32 nvalues, pc_name;
+   int i;
+
+   if (!allowed_pcs)
+   return true;
+
+   pc_name = arm_pc;
+
+   /*
+* Find the PC register and its value. If the PC register has been
+* specified multiple times, only the last one counts.
+*/
+   for (; reg < reg_end; reg++) {
+   if (reg->name == pc_name)
+   pc_value = reg->value;
+   }
+
+   allowed_values = allowed_pcs->values;
+   nvalues = allowed_pcs->nvalues;
+
+   for (i = 0; i < nvalues; i++) {
+   if (pc_value == allowed_values[i])
+   

[PATCH v1 1/4] [RFC] fs/trampfd: Implement the trampoline file descriptor API

2020-07-28 Thread madvenka
From: "Madhavan T. Venkataraman" 

There are many applications that use trampoline code. Trampoline code is
usually placed in a data page or a stack page. In order to execute a
trampoline, the page that contains the trampoline needs to have execute
permissions.

Writable pages with execute permissions provide an attack surface for
hackers. To mitigate this, LSMs such as SELinux may prevent a page from
having both write and execute permissions.

An application may attempt to circumvent this by writing the trampoline
code into a temporary file and mapping the file into its process
address space with just execute permissions. This presents the same
opportunity to hackers as before. LSMs that implement cryptographic
verification of files can prevent such temporary files from being mapped.

Such security mitigations prevent genuine trampoline code from running
as well.

Typically, trampolines simply load some values in some registers and/or
push some values on the stack and jump to a target PC. For such simple
trampolines, an application could request the kernel to do that work
instead of executing trampoline code to do that work. trampfd allows
applications to do exactly this.

Such applications can then run without having to relax security
settings for them. For instance, libffi trampolines can easily be
replaced by trampfd. libffi is used by a variety of applications.

trampfd_create() system call


A new system call is introduced to create a trampoline. The system call
number for this is 440. The system call is invoked like this:

int trampfd;

trampfd = syscall(440, type, data);

typeTrampoline type.
dataTrampoline type-specific data.

Types of trampolines


Different types of trampolines can be defined based on the desired
functionality. In this initial work, the following type is defined:

TRAMPFD_USER

This implements the simple trampoline type I referred to earlier.
The type-specific structure for TRAMPFD_USER is struct trampfd_user.
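
A minimal sketch of the create call for this type (passing the
type-specific structure by address and leaving it zeroed are assumptions
made purely for illustration):

        struct trampfd_user user = { 0 };
        int trampfd;

        trampfd = syscall(440, TRAMPFD_USER, &user);
        if (trampfd < 0)
                perror("trampfd_create");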

Trampoline contexts
---

A trampoline can have one or more contexts associated with it. Contexts
are of two kinds:

- Contexts that can be specified by the user. These can be added,
  retrieved and removed by user code.

- Contexts that are specified by the kernel. This can only be
  added by the kernel. But these can be read by the user.

In this initial work, I define the following contexts:

User specifiable:

Register Context


Contains register name-value pairs. When a trampoline is invoked,
the specified values are loaded in the specified registers. This
includes the value of the PC register. The kernel specifies the
subset of registers that can be specified.

Stack Context
-

Contains data to push on the user stack when a trampoline is
invoked.

Allowed PCs
---

This specifies a list of PCs that the trampoline is allowed to
jump to. This prevents a hacker from modifying the trampoline's
target PC.

Kernel specified:

Mapping parameters
--

Used to map a trampoline into an address space. Mapping parameters
are determined by the kernel based on the trampoline type and
type-specific information.

Other contexts can be defined in the future.
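
For example, a register context for an ARM trampoline might be built like
this. The field names follow how set_regs()/trampfd_valid_regs() walk the
structures in the ARM patch (4/4); "closure" and "abi_handler" are
placeholders.

        /* Hypothetical register context: r0 <- closure, pc <- ABI handler. */
        struct trampfd_reg reg[2];

        reg[0].name     = arm_r0;
        reg[0].value    = (unsigned long)closure;
        reg[0].reserved = 0;

        reg[1].name     = arm_pc;
        reg[1].value    = (unsigned long)abi_handler;
        reg[1].reserved = 0;

        /* These entries then form a struct trampfd_regs with nregs = 2,
         * written to the trampfd at TRAMPFD_REGS_OFFSET (see below). */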

How to set and read contexts


A symbolic file offset is associated with each context type.

TRAMPFD_MAP_OFFSET
TRAMPFD_REGS_OFFSET
TRAMPFD_STACK_OFFSET
TRAMPFD_PCS_OFFSET

A structure is defined for each context type as well:

struct trampfd_map
struct trampfd_regs
struct trampfd_stack
struct trampfd_pcs

To set/retrieve a context, seek to the corresponding offset and
write()/read() the corresponding structure. As a convenience, pread()
and pwrite() can be used so it can be done in one call instead of two.
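
For instance, setting the register context sketched earlier and reading
back the kernel-specified mapping parameters might look like this (the
variable rctx is assumed to be a struct trampfd_regs holding those
entries; sizes and exact semantics are assumptions):

        struct trampfd_map map;

        /* Set the register context at its symbolic file offset. */
        if (pwrite(trampfd, &rctx, sizeof(rctx), TRAMPFD_REGS_OFFSET) < 0)
                perror("trampfd regs context");

        /* Read back the kernel-specified mapping parameters. */
        if (pread(trampfd, &map, sizeof(map), TRAMPFD_MAP_OFFSET) < 0)
                perror("trampfd map parameters");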

Invoking a trampoline
-

Map the file descriptor into process address space using mmap(). The
kernel returns an address to invoke the trampoline with. The protection
for the mapping is set to PROT_NONE.

Execute the trampoline in one of two ways depending upon what the target
PC points to:

   - Branch to the trampoline address.

   - Use the trampoline address as a function pointer and call it.
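
For illustration, the function-pointer form could look like the sketch
below. The mmap() arguments are assumptions; in particular, the text above
leaves open whether PROT_NONE is requested by the caller or imposed by the
kernel.

        void *addr;
        void (*tramp)(void);

        addr = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_NONE,
                    MAP_PRIVATE, trampfd, 0);
        if (addr == MAP_FAILED) {
                perror("trampfd mmap");
                return;
        }

        tramp = (void (*)(void))addr;
        tramp();        /* traps into the kernel, which applies the contexts
                         * and resumes at the target PC */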

Because the user process does not have execute permissions on the
trampoline address, it traps into the kernel. The kernel recognizes
it as a trampoline invocation and performs the action indicated by the
trampoline's type and context. In the case of TRAMPFD_USER, the
kernel loads the user registers with the values specified in the
register context, pushes the values specified in the stack context on
the user stack and sets the user PC to point to the PC register value
in 

[PATCH v1 2/4] [RFC] x86/trampfd: Provide support for the trampoline file descriptor

2020-07-28 Thread madvenka
From: "Madhavan T. Venkataraman" 

Implement 32-bit and 64-bit X86 support for the trampoline file descriptor.

- Define architecture specific register names
- Handle the trampoline invocation page fault
- Setup the user register context on trampoline invocation
- Setup the user stack context on trampoline invocation

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/x86/entry/syscalls/syscall_32.tbl |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 arch/x86/include/uapi/asm/ptrace.h |  38 +++
 arch/x86/kernel/Makefile   |   2 +
 arch/x86/kernel/trampfd.c  | 313 +
 arch/x86/mm/fault.c|  11 +
 6 files changed, 366 insertions(+)
 create mode 100644 arch/x86/kernel/trampfd.c

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl 
b/arch/x86/entry/syscalls/syscall_32.tbl
index d8f8a1a69ed1..77eb50414591 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -443,3 +443,4 @@
 437i386openat2 sys_openat2
 438i386pidfd_getfd sys_pidfd_getfd
 439i386faccessat2  sys_faccessat2
+440i386trampfd_create  sys_trampfd_create
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
b/arch/x86/entry/syscalls/syscall_64.tbl
index 78847b32e137..9d962de1d21f 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -360,6 +360,7 @@
 437common  openat2 sys_openat2
 438common  pidfd_getfd sys_pidfd_getfd
 439common  faccessat2  sys_faccessat2
+440common  trampfd_create  sys_trampfd_create
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/x86/include/uapi/asm/ptrace.h 
b/arch/x86/include/uapi/asm/ptrace.h
index 85165c0edafc..b031598f857e 100644
--- a/arch/x86/include/uapi/asm/ptrace.h
+++ b/arch/x86/include/uapi/asm/ptrace.h
@@ -9,6 +9,44 @@
 
 #ifndef __ASSEMBLY__
 
+/*
+ * These register names are to be used by 32-bit applications.
+ */
+enum reg_32_name {
+   x32_eax,
+   x32_ebx,
+   x32_ecx,
+   x32_edx,
+   x32_esi,
+   x32_edi,
+   x32_ebp,
+   x32_eip,
+   x32_max,
+};
+
+/*
+ * These register names are to be used by 64-bit applications.
+ */
+enum reg_64_name {
+   x64_rax = x32_max,
+   x64_rbx,
+   x64_rcx,
+   x64_rdx,
+   x64_rsi,
+   x64_rdi,
+   x64_rbp,
+   x64_r8,
+   x64_r9,
+   x64_r10,
+   x64_r11,
+   x64_r12,
+   x64_r13,
+   x64_r14,
+   x64_r15,
+   x64_rip,
+   x64_max,
+};
+
 #ifdef __i386__
 /* this struct defines the way the registers are stored on the
stack during a system call. */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index e77261db2391..5d968ac4c7d9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -157,3 +157,5 @@ ifeq ($(CONFIG_X86_64),y)
 endif
 
 obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT)   += ima_arch.o
+
+obj-$(CONFIG_TRAMPFD)  += trampfd.o
diff --git a/arch/x86/kernel/trampfd.c b/arch/x86/kernel/trampfd.c
new file mode 100644
index ..f6b5507134d2
--- /dev/null
+++ b/arch/x86/kernel/trampfd.c
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Trampoline File Descriptor - X86 support.
+ *
+ * Author: Madhavan T. Venkataraman (madve...@linux.microsoft.com)
+ *
+ * Copyright (c) 2020, Microsoft Corporation.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+/*  Register Context  
*/
+
+static inline bool is_compat(void)
+{
+   return (IS_ENABLED(CONFIG_X86_32) ||
+   (IS_ENABLED(CONFIG_COMPAT) && test_thread_flag(TIF_ADDR32)));
+}
+
+static void set_reg_32(struct pt_regs *pt_regs, u32 name, u64 value)
+{
+   switch (name) {
+   case x32_eax:
+   pt_regs->ax = (unsigned long)value;
+   break;
+   case x32_ebx:
+   pt_regs->bx = (unsigned long)value;
+   break;
+   case x32_ecx:
+   pt_regs->cx = (unsigned long)value;
+   break;
+   case x32_edx:
+   pt_regs->dx = (unsigned long)value;
+   break;
+   case x32_esi:
+   pt_regs->si = (unsigned long)value;
+   break;
+   case x32_edi:
+   pt_regs->di = (unsigned long)value;
+   break;
+   case x32_ebp:
+   pt_regs->bp = (unsigned long)value;
+   break;
+   case x32_eip:
+   pt_regs->ip = (unsigned long)value;
+   break;
+   default:
+   WARN(1, "%s: Illegal register name %d\n", __func__, name);
+   break;
+   }
+}
+
+#ifdef __i386__
+
+static void set_reg_64(struct pt_regs *pt_regs, u32 name, u64 value)
+{
+}
+
+#else
+
+static void set_reg_64(struct 

[PATCH v1 0/4] [RFC] Implement Trampoline File Descriptor

2020-07-28 Thread madvenka
rnel gets control. The kernel recognizes that this is a trampoline
invocation. It sets up the user registers based on the specified
register context, and/or pushes values on the user stack based on the
specified stack context, and sets the user PC to the requested target
PC. When the kernel returns, execution continues at the target PC.
So, the kernel does the work of the trampoline on behalf of the
application.

In this case, the attack surface is the context buffer. A hacker may
attack an application with a vulnerability and may be able to modify the
context buffer. So, when the register or stack context is set for
a trampoline, the values may have been tampered with. From an attack
surface perspective, this is similar to Trampoline Emulation. But
with trampfd, user code can retrieve a trampoline's context from the
kernel and add defensive checks to see if the context has been
tampered with.

As for the target PC, trampfd implements a measure called the
"Allowed PCs" context (see Advantages) to prevent a hacker from making
the target PC point to arbitrary locations. So, the attack surface is
narrower than Trampoline Emulation.

Advantages of the Trampoline File Descriptor approach
-

- trampfd is customizable. The user can specify any combination of
  allowed register name-value pairs in the register context and the kernel
  will set it up accordingly. This allows different user trampolines to be
  converted to use trampfd.

- trampfd allows a stack context to be set up so that trampolines that
  need to push values on the user stack can do that.

- The initial work is targeted for X86 and ARM. But the implementation
  leverages small portions of existing signal delivery code. Specifically,
  it uses pt_regs for setting up user registers and copy_to_user()
  to push values on the stack. So, this can be very easily ported to other
  architectures.

- trampfd provides a basic framework. In the future, new trampoline types
  can be implemented, new contexts can be defined, and additional rules
  can be implemented for security purposes.

- For instance, trampfd defines an "Allowed PCs" context in this initial
  work. As an example, libffi can create a read-only array of all ABI
  handlers for an architecture at build time. This array can be used to
  set the list of allowed PCs for a trampoline. This will mean that a hacker
  cannot hack the PC part of the register context and make it point to
  arbitrary locations.

- An SELinux setting called "exectramp" can be implemented along the
  lines of "execmem", "execstack" and "execheap" to selectively allow the
  use of trampolines on a per application basis.

- User code can add defensive checks in the code before invoking a
  trampoline to make sure that a hacker has not modified the context data.
  It can do this by getting the trampoline context from the kernel and
  double checking it.

- In the future, if the kernel can be enhanced to use a safe code
  generation component, that code can be placed in the trampoline mapping
  pages. Then, the trampoline invocation does not have to incur a trip
  into the kernel.

- Also, if the kernel can be enhanced to use a safe code generation
  component, other forms of dynamic code such as JIT code can be
  addressed by the trampfd framework.

- Trampolines can be shared across processes which can give rise to
  interesting uses in the future.

- Trampfd can be used for other purposes to extend the kernel's
  functionality.

libffi
--

I have implemented my solution for libffi and provided the changes for
X86 and ARM, 32-bit and 64-bit. Here is the reference patch:

http://linux.microsoft.com/~madvenka/libffi/libffi.txt

If the trampfd patchset gets accepted, I will send the libffi changes
to the maintainers for a review. BTW, I have also successfully executed
the libffi self tests.

Work that is pending


- I am working on implementing an SELinux setting called "exectramp"
  similar to "execmem" to allow the use of trampfd on a per application
  basis.

- I have a comprehensive test program to test the kernel API. I am
  working on adding it to selftests.

References
--

[1] https://microsoft.github.io/ipe/
---
Madhavan T. Venkataraman (4):
  fs/trampfd: Implement the trampoline file descriptor API
  x86/trampfd: Support for the trampoline file descriptor
  arm64/trampfd: Support for the trampoline file descriptor
  arm/trampfd: Support for the trampoline file descriptor

 arch/arm/include/uapi/asm/ptrace.h |  20 ++
 arch/arm/kernel/Makefile   |   1 +
 arch/arm/kernel/trampfd.c  | 214 +
 arch/arm/mm/fault.c|  12 +-
 arch/arm/tools/syscall.tbl |   1 +
 arch/arm64/include/asm/ptrace.h|   9 +
 arch/arm64/include/asm/unistd.h|   2 +-
 arch/arm64/include/asm/unistd32.h  |   2 

[PATCH v1 3/4] [RFC] arm64/trampfd: Provide support for the trampoline file descriptor

2020-07-28 Thread madvenka
From: "Madhavan T. Venkataraman" 

Implement 64-bit ARM support for the trampoline file descriptor.

- Define architecture specific register names
- Handle the trampoline invocation page fault
- Setup the user register context on trampoline invocation
- Setup the user stack context on trampoline invocation

Signed-off-by: Madhavan T. Venkataraman 
---
 arch/arm64/include/asm/ptrace.h  |   9 +
 arch/arm64/include/asm/unistd.h  |   2 +-
 arch/arm64/include/asm/unistd32.h|   2 +
 arch/arm64/include/uapi/asm/ptrace.h |  57 ++
 arch/arm64/kernel/Makefile   |   2 +
 arch/arm64/kernel/trampfd.c  | 278 +++
 arch/arm64/mm/fault.c|  15 +-
 7 files changed, 361 insertions(+), 4 deletions(-)
 create mode 100644 arch/arm64/kernel/trampfd.c

diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index 953b6a1ce549..dad6cdbd59c6 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -232,6 +232,15 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs)
return regs->sp;
 }
 
+static inline void user_stack_pointer_set(struct pt_regs *regs,
+ unsigned long val)
+{
+   if (compat_user_mode(regs))
+   regs->compat_sp = val;
+   else
+   regs->sp = val;
+}
+
 extern int regs_query_register_offset(const char *name);
 extern unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
   unsigned int n);
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 3b859596840d..b3b2019f8d16 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -38,7 +38,7 @@
 #define __ARM_NR_compat_set_tls(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls   440
+#define __NR_compat_syscalls   441
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index 6d95d0c8bf2f..821ddcaf9683 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -885,6 +885,8 @@ __SYSCALL(__NR_openat2, sys_openat2)
 __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
 #define __NR_faccessat2 439
 __SYSCALL(__NR_faccessat2, sys_faccessat2)
+#define __NR_trampfd_create 440
+__SYSCALL(__NR_trampfd_create, sys_trampfd_create)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h
index 42cbe34d95ce..f4d1974dd795 100644
--- a/arch/arm64/include/uapi/asm/ptrace.h
+++ b/arch/arm64/include/uapi/asm/ptrace.h
@@ -88,6 +88,63 @@ struct user_pt_regs {
__u64   pstate;
 };
 
+/*
+ * These register names are to be used by 32-bit applications.
+ */
+enum reg_32_name {
+   arm_r0,
+   arm_r1,
+   arm_r2,
+   arm_r3,
+   arm_r4,
+   arm_r5,
+   arm_r6,
+   arm_r7,
+   arm_r8,
+   arm_r9,
+   arm_r10,
+   arm_ip,
+   arm_pc,
+   arm_max,
+};
+
+/*
+ * These register names are to be used by 64-bit applications.
+ */
+enum reg_64_name {
+   arm64_r0 = arm_max,
+   arm64_r1,
+   arm64_r2,
+   arm64_r3,
+   arm64_r4,
+   arm64_r5,
+   arm64_r6,
+   arm64_r7,
+   arm64_r8,
+   arm64_r9,
+   arm64_r10,
+   arm64_r11,
+   arm64_r12,
+   arm64_r13,
+   arm64_r14,
+   arm64_r15,
+   arm64_r16,
+   arm64_r17,
+   arm64_r18,
+   arm64_r19,
+   arm64_r20,
+   arm64_r21,
+   arm64_r22,
+   arm64_r23,
+   arm64_r24,
+   arm64_r25,
+   arm64_r26,
+   arm64_r27,
+   arm64_r28,
+   arm64_pc,
+   arm64_max,
+};
+
 struct user_fpsimd_state {
__uint128_t vregs[32];
__u32   fpsr;
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index a561cbb91d4d..18d373fb1208 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -71,3 +71,5 @@ extra-y   += $(head-y) vmlinux.lds
 ifeq ($(CONFIG_DEBUG_EFI),y)
 AFLAGS_head.o += -DVMLINUX_PATH="\"$(realpath $(objtree)/vmlinux)\""
 endif
+
+obj-$(CONFIG_TRAMPFD)  += trampfd.o
diff --git a/arch/arm64/kernel/trampfd.c b/arch/arm64/kernel/trampfd.c
new file mode 100644
index ..d79e749e0c30
--- /dev/null
+++ b/arch/arm64/kernel/trampfd.c
@@ -0,0 +1,278 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Trampoline File Descriptor - ARM64 support.
+ *
+ * Author: Madhavan T. Venkataraman (madve...@linux.microsoft.com)
+ *
+ * Copyright (c) 2020, Microsoft Corporation.
+ */
+
+#include 
+#include 
+#include 
+
+/*  Register Context  */
+
+static