[PATCH] perf svghelper: fix memory leak in svg_build_topology_map
Fix leak of memory pointed to by t.sib_thr and t.sib_core in svg_build_topology_map in the non-error path. Signed-off-by: Li Bin --- tools/perf/util/svghelper.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/svghelper.c b/tools/perf/util/svghelper.c index 96f941e..d50955f 100644 --- a/tools/perf/util/svghelper.c +++ b/tools/perf/util/svghelper.c @@ -754,6 +754,7 @@ int svg_build_topology_map(struct perf_env *env) int i, nr_cpus; struct topology t; char *sib_core, *sib_thr; + int ret = -1; nr_cpus = min(env->nr_cpus_online, MAX_NR_CPUS); @@ -798,12 +799,11 @@ int svg_build_topology_map(struct perf_env *env) topology_map[i] = -1; scan_core_topology(topology_map, &t, nr_cpus); - - return 0; + ret = 0; exit: zfree(&t.sib_core); zfree(&t.sib_thr); - return -1; + return ret; } -- 1.7.12.4
[PATCH] sched: fix typo in error message
Signed-off-by: Li Bin --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 64cc564..cf15c1c 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1618,7 +1618,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve if (!cpumask_subset(sched_domain_span(child), sched_domain_span(sd))) { - pr_err("BUG: arch topology borken\n"); + pr_err("BUG: arch topology broken\n"); #ifdef CONFIG_SCHED_DEBUG pr_err(" the %s domain not a subset of the %s domain\n", child->name, sd->name); -- 1.7.12.4
[PATCH v2] prctl: fix compat handling for prctl
The member auxv in prctl_mm_map structure which be shared with userspace is pointer type, but the kernel supporting COMPAT didn't handle it. This patch fix the compat handling for prctl syscall. Signed-off-by: Li Bin --- kernel/sys.c | 42 ++ 1 file changed, 42 insertions(+) diff --git a/kernel/sys.c b/kernel/sys.c index ad69218..d4259938 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1969,6 +1969,26 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) } #ifdef CONFIG_CHECKPOINT_RESTORE + +#ifdef CONFIG_COMPAT +struct compat_prctl_mm_map { + __u64 start_code; /* code section bounds */ + __u64 end_code; + __u64 start_data; /* data section bounds */ + __u64 end_data; + __u64 start_brk; /* heap for brk() syscall */ + __u64 brk; + __u64 start_stack;/* stack starts at */ + __u64 arg_start; /* command line arguments bounds */ + __u64 arg_end; + __u64 env_start; /* environment variables bounds */ + __u64 env_end; + compat_uptr_t auxv; /* auxiliary vector */ + __u32 auxv_size; /* vector size */ + __u32 exe_fd; /* /proc/$pid/exe link file */ +}; +#endif + static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) { struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; @@ -1986,6 +2006,28 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (data_size != sizeof(prctl_map)) return -EINVAL; +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) { + struct compat_prctl_mm_map prctl_map32; + if (copy_from_user(&prctl_map32, addr, sizeof(prctl_map32))) + return -EFAULT; + + prctl_map.start_code = prctl_map32.start_code; + prctl_map.end_code = prctl_map32.end_code; + prctl_map.start_data = prctl_map32.start_data; + prctl_map.end_data = prctl_map32.end_data; + prctl_map.start_brk = prctl_map32.start_brk; + prctl_map.brk = prctl_map32.brk; + prctl_map.start_stack = prctl_map32.start_stack; + prctl_map.arg_start = prctl_map32.arg_start; + prctl_map.arg_end = prctl_map32.arg_end; + prctl_map.env_start = 
prctl_map32.env_start; + prctl_map.env_end = prctl_map32.env_end; + prctl_map.auxv = compat_ptr(prctl_map32.auxv); + prctl_map.auxv_size = prctl_map32.auxv_size; + prctl_map.exe_fd = prctl_map32.exe_fd; + } else +#endif if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) return -EFAULT; -- 1.7.12.4
[PATCH] prctl: fix compat handling for prctl
The member auxv in prctl_mm_map structure which be shared with userspace is pointer type, but the kernel supporting COMPAT didn't handle it. This patch fix the compat handling for prctl syscall. Signed-off-by: Li Bin --- kernel/sys.c | 41 + 1 file changed, 41 insertions(+) diff --git a/kernel/sys.c b/kernel/sys.c index ad69218..03b9731 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1968,6 +1968,25 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) return error; } +#ifdef CONFIG_COMPAT +struct compat_prctl_mm_map { + __u64 start_code; /* code section bounds */ + __u64 end_code; + __u64 start_data; /* data section bounds */ + __u64 end_data; + __u64 start_brk; /* heap for brk() syscall */ + __u64 brk; + __u64 start_stack;/* stack starts at */ + __u64 arg_start; /* command line arguments bounds */ + __u64 arg_end; + __u64 env_start; /* environment variables bounds */ + __u64 env_end; + compat_uptr_t auxv; /* auxiliary vector */ + __u32 auxv_size; /* vector size */ + __u32 exe_fd; /* /proc/$pid/exe link file */ +}; +#endif + #ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) { @@ -1986,6 +2005,28 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (data_size != sizeof(prctl_map)) return -EINVAL; +#ifdef CONFIG_COMPAT + if (is_compat_task()) { + struct compat_prctl_mm_map prctl_map32; + if (copy_from_user(&prctl_map32, addr, sizeof(prctl_map32))) + return -EFAULT; + + prctl_map.start_code = prctl_map32.start_code; + prctl_map.end_code = prctl_map32.end_code; + prctl_map.start_data = prctl_map32.start_data; + prctl_map.end_data = prctl_map32.end_data; + prctl_map.start_brk = prctl_map32.start_brk; + prctl_map.brk = prctl_map32.brk; + prctl_map.start_stack = prctl_map32.start_stack; + prctl_map.arg_start = prctl_map32.arg_start; + prctl_map.arg_end = prctl_map32.arg_end; + prctl_map.env_start = prctl_map32.env_start; + prctl_map.env_end = 
prctl_map32.env_end; + prctl_map.auxv = compat_ptr(prctl_map32.auxv); + prctl_map.auxv_size = prctl_map32.auxv_size; + prctl_map.exe_fd = prctl_map32.exe_fd; + } else +#endif if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) return -EFAULT; -- 1.7.12.4
[PATCH v2 1/2] sched/rt.c: pick and check task if double_lock_balance() unlock the rq
From: Zhou Chengming push_rt_task() pick the first pushable task and find an eligible lowest_rq, then double_lock_balance(rq, lowest_rq). So if double_lock_balance() unlock the rq (when double_lock_balance() return 1), we have to check if this task is still on the rq. The problem is that the check conditions are not sufficient: if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || task_running(rq, task) || !rt_task(task) || !task_on_rq_queued(task))) { cpu2cpu1cpu0 push_rt_task(rq1) pick task_A on rq1 find rq0 double_lock_balance(rq1, rq0) unlock(rq1) rq1 __schedule pick task_A run task_A sleep (dequeued) lock(rq0) lock(rq1) do_above_check(task_A) task_rq(task_A) == rq1 cpus_allowed unchanged task_running == false rt_task(task_A) == true try_to_wake_up(task_A) select_cpu = cpu3 enqueue(rq3, task_A) task_A->on_rq = 1 task_on_rq_queued(task_A) above_check passed, return rq0 ... migrate task_A from rq1 to rq0 So we can't rely on these checks of task_A to make sure the task_A is still on the rq1, even though we hold the rq1->lock. This patch will repick the first pushable task to be sure the task is still on the rq. 
Signed-off-by: Zhou Chengming Signed-off-by: Li Bin Acked-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) --- kernel/sched/rt.c | 50 -- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index aad49451..ff3bfce 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1695,6 +1695,26 @@ static int find_lowest_rq(struct task_struct *task) return -1; } +static struct task_struct *pick_next_pushable_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_tasks(rq)) + return NULL; + + p = plist_first_entry(&rq->rt.pushable_tasks, + struct task_struct, pushable_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(p->nr_cpus_allowed <= 1); + + BUG_ON(!task_on_rq_queued(p)); + BUG_ON(!rt_task(p)); + + return p; +} + /* Will lock the rq it finds */ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) { @@ -1722,17 +1742,15 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) /* if the prio of this runqueue changed, try again */ if (double_lock_balance(rq, lowest_rq)) { + struct task_struct *next_task; /* * We had to unlock the run queue. In * the mean time, task could have * migrated already or had its affinity changed. -* Also make sure that it wasn't scheduled on its rq. 
*/ - if (unlikely(task_rq(task) != rq || -!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || -task_running(rq, task) || -!rt_task(task) || -!task_on_rq_queued(task))) { + next_task = pick_next_pushable_task(rq); + if (unlikely(next_task != task || +!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed))) { double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; @@ -1752,26 +1770,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) return lowest_rq; } -static struct task_struct *pick_next_pushable_task(struct rq *rq) -{ - struct task_struct *p; - - if (!has_pushable_tasks(rq)) - return NULL; - - p = plist_first_entry(&rq->rt.pushable_tasks, - struct task_struct, pushable_tasks); - - BUG_ON(rq->cpu != task_cpu(p)); - BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); - - BUG_ON(!task_on_rq_queued(p)); - BUG_ON(!rt_task(p)); - - return p; -} - /* * If the current CPU has more than one RT task, see if the non * running task can migrate over to a CPU that is running a task -- 1.7.12.4
[PATCH v2 2/2] sched/deadline.c: pick and check task if double_lock_balance() unlock the rq
push_dl_task() pick the first pushable task and find an eligible lowest_rq, then double_lock_balance(rq, lowest_rq). So if double_lock_balance() unlock the rq (when double_lock_balance() return 1), we have to check if this task is still on the rq. The problem is that the check conditions are not sufficient: if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || task_running(rq, task) || !dl_task(task) || !task_on_rq_queued(task))) { cpu2cpu1cpu0 push_dl_task(rq1) pick task_A on rq1 find rq0 double_lock_balance(rq1, rq0) unlock(rq1) rq1 __schedule pick task_A run task_A sleep (dequeued) lock(rq0) lock(rq1) do_above_check(task_A) task_rq(task_A) == rq1 cpus_allowed unchanged task_running == false dl_task(task_A) == true try_to_wake_up(task_A) select_cpu = cpu3 enqueue(rq3, task_A) task_A->on_rq = 1 task_on_rq_queued(task_A) above_check passed, return rq0 ... migrate task_A from rq1 to rq0 So we can't rely on these checks of task_A to make sure the task_A is still on the rq1, even though we hold the rq1->lock. This patch will repick the first pushable task to be sure the task is still on the rq. 
Signed-off-by: Li Bin Acked-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) --- kernel/sched/deadline.c | 55 +++-- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9df0978..8e0f6a4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1936,6 +1936,26 @@ static int find_later_rq(struct task_struct *task) return -1; } +static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_dl_tasks(rq)) + return NULL; + + p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost, +struct task_struct, pushable_dl_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(p->nr_cpus_allowed <= 1); + + BUG_ON(!task_on_rq_queued(p)); + BUG_ON(!dl_task(p)); + + return p; +} + /* Locks the rq it finds */ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) { @@ -1965,11 +1985,16 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) /* Retry if something changed. */ if (double_lock_balance(rq, later_rq)) { - if (unlikely(task_rq(task) != rq || -!cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || -task_running(rq, task) || -!dl_task(task) || -!task_on_rq_queued(task))) { + struct task_struct *next_task; + /* +* We had to unlock the run queue. In +* the mean time, task could have +* migrated already or had its affinity changed. +* Also make sure that it wasn't scheduled on its rq. 
+*/ + next_task = pick_next_pushable_dl_task(rq); + if (unlikely(next_task != task || + !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed))) { double_unlock_balance(rq, later_rq); later_rq = NULL; break; @@ -1994,26 +2019,6 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) return later_rq; } -static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) -{ - struct task_struct *p; - - if (!has_pushable_dl_tasks(rq)) - return NULL; - - p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost, -struct task_struct, pushable_dl_tasks); - - BUG_ON(rq->cpu != task_cpu(p)); - BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); - - BUG_ON(!task_on_rq_queued(p)); - BUG_ON(!dl_task(p)); - - return p; -} - /* * See if the non running -deadline tasks on this rq * can be sent to some other CPU where they can preempt -- 1.7.12.4
[PATCH v2 0/2] sched: pick and check task if double_lock_balance() unlock the rq
Changes in v2: * Only change the comment and coding style as suggested by Steve Li Bin (1): sched/deadline.c: pick and check task if double_lock_balance() unlock the rq Zhou Chengming (1): sched/rt.c: pick and check task if double_lock_balance() unlock the rq kernel/sched/deadline.c | 55 +++-- kernel/sched/rt.c | 50 +--- 2 files changed, 54 insertions(+), 51 deletions(-) -- 1.7.12.4
[PATCH 2/2] sched/deadline.c: pick and check task if double_lock_balance() unlock the rq
push_dl_task() pick the first pushable task and find an eligible later_rq, then double_lock_balance(rq, lowest_rq). So if double_lock_balance() unlock the rq (when double_lock_balance() return 1), we have to check if this task is still on the rq. The problem is that the check conditions are not sufficient: if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || task_running(rq, task) || !dl_task(task) || !task_on_rq_queued(task))) { cpu2cpu1cpu0 push_dl_task(rq1) pick task_A on rq1 find rq0 double_lock_balance(rq1, rq0) unlock(rq1) rq1 __schedule pick task_A run task_A sleep (dequeued) lock(rq0) lock(rq1) do_above_check(task_A) task_rq(task_A) == rq1 cpus_allowed unchanged task_running == false dl_task(task_A) == true try_to_wake_up(task_A) select_cpu = cpu3 enqueue(rq3, task_A) task_A->on_rq = 1 task_on_rq_queued(task_A) above_check passed, return rq0 ... migrate task_A from rq1 to rq0 So we can't rely on these checks of task_A to make sure the task_A is still on the rq1, even though we hold the rq1->lock. This patch will repick the first pushable task to be sure the task is still on the rq. 
Signed-off-by: Li Bin --- kernel/sched/deadline.c | 48 +++- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9df0978..87cd7ca 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1936,6 +1936,26 @@ static int find_later_rq(struct task_struct *task) return -1; } +static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_dl_tasks(rq)) + return NULL; + + p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost, +struct task_struct, pushable_dl_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(p->nr_cpus_allowed <= 1); + + BUG_ON(!task_on_rq_queued(p)); + BUG_ON(!dl_task(p)); + + return p; +} + /* Locks the rq it finds */ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) { @@ -1965,11 +1985,9 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) /* Retry if something changed. 
*/ if (double_lock_balance(rq, later_rq)) { - if (unlikely(task_rq(task) != rq || -!cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || -task_running(rq, task) || -!dl_task(task) || -!task_on_rq_queued(task))) { + struct task_struct *next_task = pick_next_pushable_dl_task(rq); + if (unlikely(next_task != task || + !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed))) { double_unlock_balance(rq, later_rq); later_rq = NULL; break; @@ -1994,26 +2012,6 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) return later_rq; } -static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) -{ - struct task_struct *p; - - if (!has_pushable_dl_tasks(rq)) - return NULL; - - p = rb_entry(rq->dl.pushable_dl_tasks_root.rb_leftmost, -struct task_struct, pushable_dl_tasks); - - BUG_ON(rq->cpu != task_cpu(p)); - BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); - - BUG_ON(!task_on_rq_queued(p)); - BUG_ON(!dl_task(p)); - - return p; -} - /* * See if the non running -deadline tasks on this rq * can be sent to some other CPU where they can preempt -- 1.7.12.4
[PATCH 1/2] sched/rt.c: pick and check task if double_lock_balance() unlock the rq
From: Zhou Chengming push_rt_task() pick the first pushable task and find an eligible lowest_rq, then double_lock_balance(rq, lowest_rq). So if double_lock_balance() unlock the rq (when double_lock_balance() return 1), we have to check if this task is still on the rq. The problem is that the check conditions are not sufficient: if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || task_running(rq, task) || !rt_task(task) || !task_on_rq_queued(task))) { cpu2cpu1cpu0 push_rt_task(rq1) pick task_A on rq1 find rq0 double_lock_balance(rq1, rq0) unlock(rq1) rq1 __schedule pick task_A run task_A sleep (dequeued) lock(rq0) lock(rq1) do_above_check(task_A) task_rq(task_A) == rq1 cpus_allowed unchanged task_running == false rt_task(task_A) == true try_to_wake_up(task_A) select_cpu = cpu3 enqueue(rq3, task_A) task_A->on_rq = 1 task_on_rq_queued(task_A) above_check passed, return rq0 ... migrate task_A from rq1 to rq0 So we can't rely on these checks of task_A to make sure the task_A is still on the rq1, even though we hold the rq1->lock. This patch will repick the first pushable task to be sure the task is still on the rq. 
Signed-off-by: Zhou Chengming Signed-off-by: Li Bin --- kernel/sched/rt.c | 49 +++-- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index aad49451..e51d574 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1695,6 +1695,26 @@ static int find_lowest_rq(struct task_struct *task) return -1; } +static struct task_struct *pick_next_pushable_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_tasks(rq)) + return NULL; + + p = plist_first_entry(&rq->rt.pushable_tasks, + struct task_struct, pushable_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(p->nr_cpus_allowed <= 1); + + BUG_ON(!task_on_rq_queued(p)); + BUG_ON(!rt_task(p)); + + return p; +} + /* Will lock the rq it finds */ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) { @@ -1726,13 +1746,10 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) * We had to unlock the run queue. In * the mean time, task could have * migrated already or had its affinity changed. -* Also make sure that it wasn't scheduled on its rq. 
*/ - if (unlikely(task_rq(task) != rq || -!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || -task_running(rq, task) || -!rt_task(task) || -!task_on_rq_queued(task))) { + struct task_struct *next_task = pick_next_pushable_task(rq); + if (unlikely(next_task != task || +!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed))) { double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; @@ -1752,26 +1769,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) return lowest_rq; } -static struct task_struct *pick_next_pushable_task(struct rq *rq) -{ - struct task_struct *p; - - if (!has_pushable_tasks(rq)) - return NULL; - - p = plist_first_entry(&rq->rt.pushable_tasks, - struct task_struct, pushable_tasks); - - BUG_ON(rq->cpu != task_cpu(p)); - BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); - - BUG_ON(!task_on_rq_queued(p)); - BUG_ON(!rt_task(p)); - - return p; -} - /* * If the current CPU has more than one RT task, see if the non * running task can migrate over to a CPU that is running a task -- 1.7.12.4
[PATCH 0/2] sched: pick and check task if double_lock_balance() unlock the rq
Li Bin (1): sched/deadline.c: pick and check task if double_lock_balance() unlock the rq Zhou Chengming (1): sched/rt.c: pick and check task if double_lock_balance() unlock the rq kernel/sched/deadline.c | 48 +++- kernel/sched/rt.c | 49 +++-- 2 files changed, 46 insertions(+), 51 deletions(-) -- 1.7.12.4
[PATCH v2] workqueue: Fix NULL pointer dereference
When queue_work() is used in irq (not in task context), there is a potential case that trigger NULL pointer dereference. worker_thread() |-spin_lock_irq() |-process_one_work() |-worker->current_pwq = pwq |-spin_unlock_irq() |-worker->current_func(work) |-spin_lock_irq() |-worker->current_pwq = NULL |-spin_unlock_irq() //interrupt here |-irq_handler |-__queue_work() //assuming that the wq is draining |-is_chained_work(wq) |-current_wq_worker() //Here, 'current' is the interrupted worker! |-current->current_pwq is NULL here! |-schedule() Avoid it by checking for task context in current_wq_worker(), and if not in task context, we shouldn't use the 'current' to check the condition. Reported-by: Xiaofei Tan Signed-off-by: Li Bin --- kernel/workqueue_internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 8635417..29fa81f 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -9,6 +9,7 @@ #include #include +#include struct worker_pool; @@ -59,7 +60,7 @@ struct worker { */ static inline struct worker *current_wq_worker(void) { - if (current->flags & PF_WQ_WORKER) + if (in_task() && (current->flags & PF_WQ_WORKER)) return kthread_data(current); return NULL; } -- 1.7.12.4
Re: [PATCH] workqueue: Fix NULL pointer dereference
Hi, Jiangshan on 2017/10/26 23:55, Lai Jiangshan wrote: > On Tue, Oct 24, 2017 at 9:18 AM, Li Bin wrote: > > I remember that softirq can be invoked when irq_eixt(), > and in this case the current->current_pwq is also NULL > if __queue_work() is called in the soft irq. > > So in_task() might be better than !in_irq() for the fix? > Good catch, I will fix it and resend the patch. Thanks, Li Bin >> |-schedule() >> >> >> Avoid it by checking for irq context in current_wq_worker(), and >> if in irq context, we shouldn't use the 'current' to check the >> condition. >> >> Reported-by: Xiaofei Tan >> Signed-off-by: Li Bin >> --- >> kernel/workqueue_internal.h | 3 ++- >> 1 file changed, 2 insertions(+), 1 deletion(-) >> >> diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h >> index 8635417..d81cb9b 100644 >> --- a/kernel/workqueue_internal.h >> +++ b/kernel/workqueue_internal.h >> @@ -9,6 +9,7 @@ >> >> #include >> #include >> +#include >> >> struct worker_pool; >> >> @@ -59,7 +60,7 @@ struct worker { >> */ >> static inline struct worker *current_wq_worker(void) >> { >> - if (current->flags & PF_WQ_WORKER) >> + if (!in_irq() && (current->flags & PF_WQ_WORKER)) >> return kthread_data(current); >> return NULL; >> } >> -- >> 1.7.12.4 >> > > . >
[PATCH] workqueue: Fix NULL pointer dereference
When queue_work() is used in an irq handler, there is a potential case that triggers a NULL pointer dereference. worker_thread() |-spin_lock_irq() |-process_one_work() |-worker->current_pwq = pwq |-spin_unlock_irq() |-worker->current_func(work) |-spin_lock_irq() |-worker->current_pwq = NULL |-spin_unlock_irq() //interrupt here |-irq_handler |-__queue_work() //assuming that the wq is draining |-is_chained_work(wq) |-current_wq_worker() //Here, 'current' is the interrupted worker! |-current->current_pwq is NULL here! |-schedule() Avoid it by checking for irq context in current_wq_worker(), and if in irq context, we shouldn't use the 'current' to check the condition. Reported-by: Xiaofei Tan Signed-off-by: Li Bin --- kernel/workqueue_internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 8635417..d81cb9b 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -9,6 +9,7 @@ #include #include +#include struct worker_pool; @@ -59,7 +60,7 @@ struct worker { */ static inline struct worker *current_wq_worker(void) { - if (current->flags & PF_WQ_WORKER) + if (!in_irq() && (current->flags & PF_WQ_WORKER)) return kthread_data(current); return NULL; } -- 1.7.12.4
Re: [Question] null pointer risk of kernel workqueue
on 2017/10/23 22:03, Tejun Heo wrote: >> >> And I think the following patch can solve the bug, right? >> >> diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h >> index 8635417..650680c 100644 >> --- a/kernel/workqueue_internal.h >> +++ b/kernel/workqueue_internal.h >> @@ -59,7 +59,7 @@ struct worker { >> */ >> static inline struct worker *current_wq_worker(void) >> { >> - if (current->flags & PF_WQ_WORKER) >> + if (!in_irq() && (current->flags & PF_WQ_WORKER)) >> return kthread_data(current); >> return NULL; >> } > > Yeah, that makes sense to me. Can you please resend the patch with > patch description and SOB? Ok, I will resend the patch soon. Thanks, Li Bin > > Thanks. >
Re: [Question] null pointer risk of kernel workqueue
on 2017/10/21 23:35, Tejun Heo wrote: > On Fri, Oct 20, 2017 at 02:57:18PM +0800, tanxiaofei wrote: >> Hi Tejun, >> >> Any comments about this? > > I think you're confused, or at least can't understand what you're > trying to say. Can you create a repro? > Hi Tejun, The case is as following: worker_thread() |-spin_lock_irq() |-process_one_work() |-worker->current_pwq = pwq |-spin_unlock_irq() |-worker->current_func(work) |-spin_lock_irq() |-worker->current_pwq = NULL |-spin_unlock_irq() //interrupt here |-irq_handler |-__queue_work() //assuming that the wq is draining |-if (unlikely(wq->flags & __WQ_DRAINING) &&WARN_ON_ONCE(!is_chained_work(wq))) |-is_chained_work(wq) |-current_wq_worker() // Here, 'current' is the interrupted worker! |-current->current_pwq is NULL here! |-schedule() And I think the following patch can solve the bug, right? diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 8635417..650680c 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -59,7 +59,7 @@ struct worker { */ static inline struct worker *current_wq_worker(void) { - if (current->flags & PF_WQ_WORKER) + if (!in_irq() && (current->flags & PF_WQ_WORKER)) return kthread_data(current); return NULL; } Thanks, Li Bin > Thanks. >
[tip:perf/core] perf symbols: Fix plt entry calculation for ARM and AARCH64
Commit-ID: b2f7605076d6cdd68162c42c34caadafbbe4c69f Gitweb: http://git.kernel.org/tip/b2f7605076d6cdd68162c42c34caadafbbe4c69f Author: Li Bin AuthorDate: Mon, 5 Jun 2017 08:34:09 +0800 Committer: Arnaldo Carvalho de Melo CommitDate: Tue, 29 Aug 2017 11:41:27 -0300 perf symbols: Fix plt entry calculation for ARM and AARCH64 On x86, the plt header size is as same as the plt entry size, and can be identified from shdr's sh_entsize of the plt. But we can't assume that the sh_entsize of the plt shdr is always the plt entry size in all architecture, and the plt header size may be not as same as the plt entry size in some architecure. On ARM, the plt header size is 20 bytes and the plt entry size is 12 bytes (don't consider the FOUR_WORD_PLT case) that refer to the binutils implementation. The plt section is as follows: Disassembly of section .plt: 04a0 <__cxa_finalize@plt-0x14>: 4a0: e52de004push{lr}; (str lr, [sp, #-4]!) 4a4: e59fe004ldr lr, [pc, #4]; 4b0 <_init+0x1c> 4a8: e08fe00eadd lr, pc, lr 4ac: e5bef008ldr pc, [lr, #8]! 4b0: 8424.word 0x8424 04b4 <__cxa_finalize@plt>: 4b4: e28fc600add ip, pc, #0, 12 4b8: e28cca08add ip, ip, #8, 20 ; 0x8000 4bc: e5bcf424ldr pc, [ip, #1060]!; 0x424 04c0 : 4c0: e28fc600add ip, pc, #0, 12 4c4: e28cca08add ip, ip, #8, 20 ; 0x8000 4c8: e5bcf41cldr pc, [ip, #1052]!; 0x41c On AARCH64, the plt header size is 32 bytes and the plt entry size is 16 bytes. The plt section is as follows: Disassembly of section .plt: 0560 <__cxa_finalize@plt-0x20>: 560: a9bf7bf0stp x16, x30, [sp,#-16]! 
564: 9090adrpx16, 1 <__FRAME_END__+0xf8a8> 568: f944be11ldr x17, [x16,#2424] 56c: 9125e210add x16, x16, #0x978 570: d61f0220br x17 574: d503201fnop 578: d503201fnop 57c: d503201fnop 0580 <__cxa_finalize@plt>: 580: 9090adrpx16, 1 <__FRAME_END__+0xf8a8> 584: f944c211ldr x17, [x16,#2432] 588: 91260210add x16, x16, #0x980 58c: d61f0220br x17 0590 <__gmon_start__@plt>: 590: 9090adrpx16, 1 <__FRAME_END__+0xf8a8> 594: f944c611ldr x17, [x16,#2440] 598: 91262210add x16, x16, #0x988 59c: d61f0220br x17 NOTES: In addition to ARM and AARCH64, other architectures, such as s390/alpha/mips/parisc/poperpc/sh/sparc/xtensa also need to consider this issue. Signed-off-by: Li Bin Acked-by: Namhyung Kim Cc: Alexander Shishkin Cc: Alexis Berlemont Cc: David Tolnay Cc: Hanjun Guo Cc: Hemant Kumar Cc: Masami Hiramatsu Cc: Milian Wolff Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Cc: zhangmengt...@huawei.com Link: http://lkml.kernel.org/r/1496622849-21877-1-git-send-email-huawei.li...@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/symbol-elf.c | 27 ++- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index a704790..5c39f42 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -259,7 +259,7 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss, struct map * { uint32_t nr_rel_entries, idx; GElf_Sym sym; - u64 plt_offset; + u64 plt_offset, plt_header_size, plt_entry_size; GElf_Shdr shdr_plt; struct symbol *f; GElf_Shdr shdr_rel_plt, shdr_dynsym; @@ -326,6 +326,23 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss, struct map * nr_rel_entries = shdr_rel_plt.sh_size / shdr_rel_plt.sh_entsize; plt_offset = shdr_plt.sh_offset; + switch (ehdr.e_machine) { + case EM_ARM: + plt_header_size = 20; + plt_entry_size = 12; + break; + + case EM_AARCH64: + plt_header_size = 32; + plt_entry_size = 16; + break; + + default: /* FIXME: 
s390/alpha/mips/parisc/poperpc/sh/sparc/xtensa need to be checked */ + plt_header_size = shdr_plt.sh_entsize; + plt_entry_size = shdr_plt.sh_entsize; + break; + } + plt_offset += plt_header_size; if (shdr_rel_plt.sh_type == SHT_RELA) { GElf_Rela pos_mem, *pos; @@ -335,7 +352,6 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss, struct map * const char *elf_name = NULL; char *dema
[tip:perf/core] perf probe: Fix kprobe blacklist checking condition
Commit-ID: 2c29461e273abaf149cf8220c3403e9d67dd8b61 Gitweb: http://git.kernel.org/tip/2c29461e273abaf149cf8220c3403e9d67dd8b61 Author: Li Bin AuthorDate: Tue, 29 Aug 2017 20:57:23 +0800 Committer: Arnaldo Carvalho de Melo CommitDate: Tue, 29 Aug 2017 11:14:12 -0300 perf probe: Fix kprobe blacklist checking condition The commit 9aaf5a5f479b ("perf probe: Check kprobes blacklist when adding new events"), 'perf probe' supports checking the blacklist of the fuctions which can not be probed. But the checking condition is wrong, that the end_addr of the symbol which is the start_addr of the next symbol can't be included. Committer notes: IOW make it match its kernel counterpart in kernel/kprobes.c: bool within_kprobe_blacklist(unsigned long addr) Each entry have as its end address not its end address, but the first address _outside_ that symbol, which for related functions, is the first address of the next symbol, like these from kernel/trace/trace_probe.c: 0xbd198df0-0xbd198e40 print_type_u8 0xbd198e40-0xbd198e90 print_type_u16 0xbd198e90-0xbd198ee0 print_type_u32 0xbd198ee0-0xbd198f30 print_type_u64 0xbd198f30-0xbd198f80 print_type_s8 0xbd198f80-0xbd198fd0 print_type_s16 0xbd198fd0-0xbd199020 print_type_s32 0xbd199020-0xbd199070 print_type_s64 0xbd199070-0xbd1990c0 print_type_x8 0xbd1990c0-0xbd199110 print_type_x16 0xbd199110-0xbd199160 print_type_x32 0xbd199160-0xbd1991b0 print_type_x64 But not always: 0xbd1997b0-0xbd1997c0 fetch_kernel_stack_address (kernel/trace/trace_probe.c) 0xbd1c57f0-0xbd1c58b0 __context_tracking_enter (kernel/context_tracking.c) Signed-off-by: Li Bin Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Cc: zhangmengt...@huawei.com Fixes: 9aaf5a5f479b ("perf probe: Check kprobes blacklist when adding new events") Link: http://lkml.kernel.org/r/1504011443-7269-1-git-send-email-huawei.li...@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/probe-event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index d7cd114..b7aaf9b 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -2395,7 +2395,7 @@ kprobe_blacklist__find_by_address(struct list_head *blacklist, struct kprobe_blacklist_node *node; list_for_each_entry(node, blacklist, list) { - if (node->start <= address && address <= node->end) + if (node->start <= address && address < node->end) return node; }
[PATCH] perf probe: Fix kprobe blacklist checking condition
Since commit 9aaf5a5 ("perf probe: Check kprobes blacklist when adding new events"), perf probe supports checking the blacklist of the functions which cannot be probed. But the checking condition is wrong: the end_addr of a symbol is the start_addr of the next symbol, so it must not be included in the range. Signed-off-by: Li Bin --- tools/perf/util/probe-event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index a2670e9..bf7c928 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -2373,7 +2373,7 @@ static int kprobe_blacklist__load(struct list_head *blacklist) struct kprobe_blacklist_node *node; list_for_each_entry(node, blacklist, list) { - if (node->start <= address && address <= node->end) + if (node->start <= address && address < node->end) return node; } -- 1.7.12.4
[PATCH] perf symbols: Fix plt entry calculation for ARM and AARCH64
On x86, the plt header size is as same as the plt entry size, and can be identified from shdr's sh_entsize of the plt. But we cann't assume that the sh_entsize of the plt shdr is always the plt entry size in all architecture, and the plt header size may be not as same as the plt entry size in some architecure. On ARM, the plt header size is 20 bytes and the plt entry size is 12 bytes(don't consider the FOUR_WORD_PLT case) that refer to the binutils implementation. plt section is as follows: Disassembly of section .plt: 04a0 <__cxa_finalize@plt-0x14>: 4a0: e52de004push{lr}; (str lr, [sp, #-4]!) 4a4: e59fe004ldr lr, [pc, #4]; 4b0 <_init+0x1c> 4a8: e08fe00eadd lr, pc, lr 4ac: e5bef008ldr pc, [lr, #8]! 4b0: 8424.word 0x8424 04b4 <__cxa_finalize@plt>: 4b4: e28fc600add ip, pc, #0, 12 4b8: e28cca08add ip, ip, #8, 20 ; 0x8000 4bc: e5bcf424ldr pc, [ip, #1060]!; 0x424 04c0 : 4c0: e28fc600add ip, pc, #0, 12 4c4: e28cca08add ip, ip, #8, 20 ; 0x8000 4c8: e5bcf41cldr pc, [ip, #1052]!; 0x41c On AARCH64, the plt header size is 32 bytes and the plt entry size is 16 bytes. plt section is as follows: Disassembly of section .plt: 0560 <__cxa_finalize@plt-0x20>: 560: a9bf7bf0stp x16, x30, [sp,#-16]! 564: 9090adrpx16, 1 <__FRAME_END__+0xf8a8> 568: f944be11ldr x17, [x16,#2424] 56c: 9125e210add x16, x16, #0x978 570: d61f0220br x17 574: d503201fnop 578: d503201fnop 57c: d503201fnop 0580 <__cxa_finalize@plt>: 580: 9090adrpx16, 1 <__FRAME_END__+0xf8a8> 584: f944c211ldr x17, [x16,#2432] 588: 91260210add x16, x16, #0x980 58c: d61f0220br x17 0590 <__gmon_start__@plt>: 590: 9090adrpx16, 1 <__FRAME_END__+0xf8a8> 594: f944c611ldr x17, [x16,#2440] 598: 91262210add x16, x16, #0x988 59c: d61f0220br x17 NOTES: In addition to ARM and AARCH64, other architectures, such as s390/alpha/mips/parisc/poperpc/sh/sparc/xtensa also need to consider this issue. 
Signed-off-by: Li Bin --- tools/perf/util/symbol-elf.c | 27 ++- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index e7ee47f..1bccf71 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -259,7 +259,7 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss, struct map * { uint32_t nr_rel_entries, idx; GElf_Sym sym; - u64 plt_offset; + u64 plt_offset, plt_header_size, plt_entry_size; GElf_Shdr shdr_plt; struct symbol *f; GElf_Shdr shdr_rel_plt, shdr_dynsym; @@ -326,6 +326,23 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss, struct map * nr_rel_entries = shdr_rel_plt.sh_size / shdr_rel_plt.sh_entsize; plt_offset = shdr_plt.sh_offset; + switch (ehdr.e_machine) { + case EM_ARM: + plt_header_size = 20; + plt_entry_size = 12; + break; + + case EM_AARCH64: + plt_header_size = 32; + plt_entry_size = 16; + break; + + default: /* FIXME: s390/alpha/mips/parisc/poperpc/sh/sparc/xtensa need to be checked */ + plt_header_size = shdr_plt.sh_entsize; + plt_entry_size = shdr_plt.sh_entsize; + break; + } + plt_offset += plt_header_size; if (shdr_rel_plt.sh_type == SHT_RELA) { GElf_Rela pos_mem, *pos; @@ -335,7 +352,6 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss, struct map * const char *elf_name = NULL; char *demangled = NULL; symidx = GELF_R_SYM(pos->r_info); - plt_offset += shdr_plt.sh_entsize; gelf_getsym(syms, symidx, &sym); elf_name = elf_sym__name(&sym, symstrs); @@ -346,11 +362,12 @@ int dso__synthesize_plt_symbols(struct dso *dso, struct symsrc *ss, struct map * "%s@plt", elf_name); free(demangled); - f = symbol__new(plt_offset, shdr_plt.sh_entsize, + f = symbol__new(plt_offset, plt_entry_size, STB_GLOBAL, sympltname); if (!f)
Re: [PATCH] reduce the time of finding symbols for module
Hi, on 2017/3/29 8:03, Jessica Yu wrote: > +++ Miroslav Benes [28/03/17 13:16 +0200]: >> On Tue, 28 Mar 2017, zhouchengming wrote: >> >>> On 2017/3/28 17:00, Miroslav Benes wrote: >>> > >>> > Hi, >>> > >>> > On Tue, 28 Mar 2017, Zhou Chengming wrote: >>> > >>> > > It's reported that the time of insmoding a klp.ko for one of our >>> > > out-tree modules is too long. >>> > > >>> > > ~ time sudo insmod klp.ko >>> > > real0m23.799s >>> > > user0m0.036s >>> > > sys0m21.256s >>> > >>> > Is this stable through several (>=10) runs? 23 seconds are really >>> > suspicious. Yes, there is a linear search through all the kallsyms in >>> > kallsyms_on_each_symbol(), but there are something like 70k symbols on my >>> > machine (that is, way less than 1M). 23 seconds are somewhat unexpected. >>> > >>> >>> Yes, it's stable through several runs. >>> >>> I think the big reason is that our out-tree module used a lot of static >>> local >>> variables. We can see '.rela.kpatch.dynrelas' contains many entries, so it >>> will >>> waste a lot of time if we use kallsyms_on_each_symbol() to find these >>> symbols >>> of module. >> >> Ok, it means that you have a lot of relocation records which reference >> your out-of-tree module. Then for each such entry klp_resolve_symbol() >> is called and then klp_find_object_symbol() to actually resolve it. So if >> you have 20k entries, you walk through vmlinux kallsyms table 20k times. >> It is unneeded and that is why your fix works. >> >> But if there were 20k modules loaded, the problem would still be there. >> >> I think it would be really nice to fix kallsyms :). Replace ordinary array >> and the linear search with a hash table. >> >>> Relocation section '.rela.kpatch.funcs' at offset 0x382e0 contains 3 >>> entries: >>> Offset Info Type Sym. ValueSym. 
Name + >>> Addend >>> 00330101 R_AARCH64_ABS64 value_show + 0 >>> 0020 000b0101 R_AARCH64_ABS64 >>> .kpatch.strings >>> + 8 >>> 0028 000b0101 R_AARCH64_ABS64 >>> .kpatch.strings >>> + 0 >> >> Hm, we do not have aarch64 support in upstream (yet). There is even no >> dynamic ftrace with regs yet (if I am not mistaken). > > I'm curious, how was this tested? Since there is no dynamic ftrace > with regs and no livepatch stubs (klp_arch_set_pc, etc) implemented > yet for aarch64. Also, livepatch has switched from klp_relocs/dynrelas > to .klp.rela. sections since 4.7, so I'm curious how your patch module > has a .kpatch.dynrelas section working with livepatch. > > Unrelated to this patch, if there is a working aarch64 livepatch port (and > kpatch build tool, it seems) floating out there, it would be > wonderful to push that upstream :-) Yeah, from 2014, we started to work on livepatch support on aarch64, and in May 2015, we pushed the solution to the livepatch community[1] and gcc community (mfentry feature on aarch64)[2]. And then, there were an another gcc solution from linaro [3], which proposes to implement a new option -fprolog-pad=N that generate a pad of N nops at the beginning of each function, and AFAIK, Torsten Duwe from SUSE is still discussing this method with gcc community. At this stage, we are validating the livepatch support on aarch64 based on aarch64 mfentry feature. When the community has a clear plan, we are happy to make adaptation and contribute our related work to the community, including the kpatch-build support :-) [1] livepatch: add support on arm64 https://lkml.org/lkml/2015/5/28/54 [2] [AArch64] support -mfentry feature for arm64 https://gcc.gnu.org/ml/gcc-patches/2016-03/msg00756.html [3] Kernel livepatching support in GCC https://gcc.gnu.org/ml/gcc/2015-05/msg00267.html [4] arm64: ftrace with regs for livepatch support http://lists.infradead.org/pipermail/linux-arm-kernel/2016-January/401352.html Thanks, Li Bin > > Jessica > > . >
Re: [PATCH v13 01/10] arm64: Add HAVE_REGS_AND_STACK_ACCESS_API feature
on 2016/6/3 11:26, David Long wrote: > From: "David A. Long" > > Add HAVE_REGS_AND_STACK_ACCESS_API feature for arm64, including supporting > functions and defines. > > Signed-off-by: David A. Long > --- > arch/arm64/Kconfig | 1 + > arch/arm64/include/asm/ptrace.h | 54 +- > arch/arm64/kernel/ptrace.c | 118 > > 3 files changed, 172 insertions(+), 1 deletion(-) > > diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig > index 76747d9..0f7a624 100644 > --- a/arch/arm64/Kconfig > +++ b/arch/arm64/Kconfig > @@ -85,6 +85,7 @@ config ARM64 > select HAVE_PERF_EVENTS > select HAVE_PERF_REGS > select HAVE_PERF_USER_STACK_DUMP > + select HAVE_REGS_AND_STACK_ACCESS_API > select HAVE_RCU_TABLE_FREE > select HAVE_SYSCALL_TRACEPOINTS > select IOMMU_DMA if IOMMU_SUPPORT > diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h > index a307eb6..c130f2b 100644 > --- a/arch/arm64/include/asm/ptrace.h > +++ b/arch/arm64/include/asm/ptrace.h > @@ -74,6 +74,7 @@ > #define COMPAT_PT_DATA_ADDR 0x10004 > #define COMPAT_PT_TEXT_END_ADDR 0x10008 > #ifndef __ASSEMBLY__ > +#include > > /* sizeof(struct user) for AArch32 */ > #define COMPAT_USER_SZ 296 > @@ -119,6 +120,8 @@ struct pt_regs { > u64 syscallno; > }; > > +#define MAX_REG_OFFSET offsetof(struct pt_regs, pstate) > + > #define arch_has_single_step() (1) > > #ifdef CONFIG_COMPAT > @@ -147,6 +150,55 @@ struct pt_regs { > #define user_stack_pointer(regs) \ > (!compat_user_mode(regs) ? (regs)->sp : (regs)->compat_sp) > > +extern int regs_query_register_offset(const char *name); > +extern const char *regs_query_register_name(unsigned int offset); > +extern bool regs_within_kernel_stack(struct pt_regs *regs, unsigned long > addr); > +extern unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, > +unsigned int n); > + > +/** > + * regs_get_register() - get register value from its offset > + * @regs: pt_regs from which register value is gotten > + * @offset:offset of the register. 
> + * > + * regs_get_register returns the value of a register whose offset from @regs. > + * The @offset is the offset of the register in struct pt_regs. > + * If @offset is bigger than MAX_REG_OFFSET, this returns 0. > + */ > +static inline u64 regs_get_register(struct pt_regs *regs, > + unsigned int offset) > +{ > + u64 val = 0; > + > + WARN_ON(offset & 7); > + > + offset >>= 3; Here, the offset is 3-bits right shifted. So, > + switch (offset) { > + case0 ... 30: > + val = regs->regs[offset]; > + break; > + case offsetof(struct pt_regs, sp): here should be shifted too, as case offsetof(struct pt_regs, sp) >> 3: > + val = regs->sp; > + break; > + case offsetof(struct pt_regs, pc): and here, > + val = regs->pc; > + break; > + case offsetof(struct pt_regs, pstate): and here. thanks, Li Bin > + val = regs->pstate; > + break; > + default: > + val = 0; > + } > + > + return val; > +} > + > +/* Valid only for Kernel mode traps. */ > +static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) > +{ > + return regs->sp; > +} > + > static inline unsigned long regs_return_value(struct pt_regs *regs) > { > return regs->regs[0]; > @@ -156,7 +208,7 @@ static inline unsigned long regs_return_value(struct > pt_regs *regs) > struct task_struct; > int valid_user_regs(struct user_pt_regs *regs, struct task_struct *task); > > -#define instruction_pointer(regs)((unsigned long)(regs)->pc) > +#define instruction_pointer(regs)((regs)->pc) > > extern unsigned long profile_pc(struct pt_regs *regs); > > diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c > index 3f6cd5c..2c88c33 100644 > --- a/arch/arm64/kernel/ptrace.c > +++ b/arch/arm64/kernel/ptrace.c > @@ -48,6 +48,124 @@ > #define CREATE_TRACE_POINTS > #include > > +struct pt_regs_offset { > + const char *name; > + int offset; > +}; > + > +#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, > r)} > +#define REG_OFFSET_END {.name = NULL, .offset = 0} > +#define GPR_OFFSET_NAME(r) \ > + 
{.name = "x" #r, .offset = offs
Re: [PATCH v12 00/10] arm64: Add kernel probes (kprobes) support
@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) instruction_pointer_set(regs, (long)jp->entry); preempt_disable(); + pause_graph_tracing(); return 1; } @@ -757,6 +758,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) show_regs(regs); BUG(); } + unpause_graph_tracing(); *regs = kcb->jprobe_saved_regs; memcpy((void *)stack_addr, kcb->jprobes_stack, MIN_STACK_SIZE(stack_addr)); Li Bin > > Thanks, > > > James > > > > > ___ > linux-arm-kernel mailing list > linux-arm-ker...@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel > > . >
Re: [PATCH v11 5/9] arm64: Kprobes with single stepping support
Hi David, on 2016/3/9 13:32, David Long wrote: > +int __kprobes arch_prepare_kprobe(struct kprobe *p) > +{ > + unsigned long probe_addr = (unsigned long)p->addr; Here should verify the addr alignment: if (probe_addr & 0x3) return -EINVAL; Thanks, Li Bin > + > + /* copy instruction */ > + p->opcode = le32_to_cpu(*p->addr); > + > + if (in_exception_text(probe_addr)) > + return -EINVAL; > + > + /* decode instruction */ > + switch (arm_kprobe_decode_insn(p->addr, &p->ainsn)) { > + case INSN_REJECTED: /* insn not supported */ > + return -EINVAL; > + > + case INSN_GOOD_NO_SLOT: /* insn need simulation */ > + return -EINVAL; > + > + case INSN_GOOD: /* instruction uses slot */ > + p->ainsn.insn = get_insn_slot(); > + if (!p->ainsn.insn) > + return -ENOMEM; > + break; > + }; > + > + /* prepare the instruction */ > + arch_prepare_ss_slot(p); > + > + return 0; > +} > +
[PATCH] kprobes: fix race condition in __unregister_kprobe_top
In the following case, it will trigger a kernel panic: 1. register a kprobe for one function 2. register a jprobe for the same function (to make it easier to reproduce, let the entry callback take a long time, such as calling mdelay) 3. trigger the function to be called, and then unregister the jprobe (before the entry callback calls jprobe_return) The reason is that in __unregister_kprobe_top (unregister_jprobe) the break_handler of the aggrprobe will be set NULL, but by then the entry callback may already have been triggered (before jprobe_return); then in jprobe_return it triggers an int3/brk exception, and in the exception handler, because the break_handler has been set NULL, it will not setup_singlestep and will return to the original instruction... To fix this bug, __unregister_kprobe_top calls synchronize_sched() before clearing the handler. Signed-off-by: Li Bin --- kernel/kprobes.c |1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index d10ab6b..5b5dd68 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1615,6 +1615,7 @@ static int __unregister_kprobe_top(struct kprobe *p) */ goto disarmed; else { + synchronize_sched(); /* If disabling probe has special handlers, update aggrprobe */ if (p->break_handler && !kprobe_gone(p)) ap->break_handler = NULL; -- 1.7.1
Re: [PATCH 2/2] arm64: Fix watchpoint recursion when single-step is wrongly triggered in irq
on 2016/4/8 13:14, Pratyush Anand wrote: > Hi Li, > > On 07/04/2016:07:34:37 PM, Li Bin wrote: >> Hi Pratyush, >> >> on 2016/4/4 13:17, Pratyush Anand wrote: >>> Hi Li, >>> >>> On 31/03/2016:08:45:05 PM, Li Bin wrote: >>>> Hi Pratyush, >>>> >>>> on 2016/3/21 18:24, Pratyush Anand wrote: >>>>> On 21/03/2016:08:37:50 AM, He Kuang wrote: >>>>>> On arm64, watchpoint handler enables single-step to bypass the next >>>>>> instruction for not recursive enter. If an irq is triggered right >>>>>> after the watchpoint, a single-step will be wrongly triggered in irq >>>>>> handler, which causes the watchpoint address not stepped over and >>>>>> system hang. >>>>> >>>>> Does patch [1] resolves this issue as well? I hope it should. Patch[1] >>>>> has still >>>>> not been sent for review. Your test result will be helpful. >>>>> >>>>> ~Pratyush >>>>> >>>>> [1] >>>>> https://github.com/pratyushanand/linux/commit/7623c8099ac22eaa00e7e0f52430f7a4bd154652 >>>> >>>> This patch did not consider that, when excetpion return, the singlestep >>>> flag >>>> should be restored, otherwise the right singlestep will not triggered. >>>> Right? >>> >>> Yes, you are right, and there are other problems as well. Will Deacon >>> pointed >>> out [1] that kernel debugging is per-cpu rather than per-task. So, I did >>> thought >>> of a per-cpu implementation by introducing a new element "flags" in struct >>> pt_regs. But even with that I see issues. For example: >>> - While executing single step instruction, we get a data abort >>> - In the kernel_entry of data abort we disable single stepping based on >>> "flags" >>> bit field >>> - While handling data abort we receive anther interrupt, so we are again in >>> kernel_entry (for el1_irq). Single stepping will be disabled again >>> (although >>> it does not matter). >>> >>> Now the issue is that, what condition should be verified in kernel_exit for >>> enabling single step again? 
In the above scenario, kernel_exit for el1_irq >>> should not enable single stepping, but how to prevent that elegantly? >> >> The condition for kernel_entry to disable the single step is that >> MDSCR_EL1.SS >> has been set. And only when the corresponding kernel_entry has disabled the >> single >> step, the kernel_exit should enable it, but the kernel_exit of single-step >> exception >> should be handled specially, that when disable single step in single-step >> exception >> handler, flag of pt_regs stored in stack should be cleard to prevent to be >> re-enabled >> by kernel_exit. > > Nice, :-) > I had latter on almost similar patch [1], but it did fail when I merged two of > the tests. > -- I inserted kprobe to an instruction in function __copy_to_user() which > could > generate data abort. > -- In parallel I also run test case which is defined here [2] > -- As soon as I did `cat /proc/version`, kernel crashed. Hi Pratyush, Firstly, I have test this case, and it does not trigger failture as you describing. But it indeed may trigger problem, and it is an another issue that if an exception triggered before single-step exception, changes the regs->pc (data abort exception will fixup_exception), the current implemetion of kprobes does not support, for example: 1. kprobes brk exception setup single-step, regs->pc points to the slot, MDSCR.SS=1, SPSR_EL1.SS=1 (Inactive state) 2. brk exception eret (Active-not-pending state) 3. execute the slot instruction and trigger data abort exception, and this case the return addr is also the slot instruction, so the SPSR_EL1.SS is set to 1 (Inactive state) 4. but in the data abort exception, fixup_exception will change the regs->pc to the fixup code 5. data abort exception eret, going into Active-not-pending state, executing fixup code without taking an exception, going into Active-pending state, triggering single-step exception. But the single-step instruction is not the target instrution, so kprobe fails. 
And so this case including copy_to/from_user should be added to kprobes blacklist. Right, or am i missing something? Thanks, Li Bin > > Although, just by comparing visually I do not see any functional difference > with > the patch you inlined below, still can you please try both of the above test > case in parallel and see how does it behave? To insert kprobe within > __copy_to_user() you need to revert "arm64: add copy_to/from_user to kprobes > blacklist". > > [1] > https://github.com/pratyushanand/linux/commit/9182afbe640ea7caf52e209eb8e5f00bdf91b0c0 > [2] http://thread.gmane.org/gmane.linux.kernel/2167918 >> >> Thanks, >> Li Bin >>
Re: [PATCH 2/2] arm64: Fix watchpoint recursion when single-step is wrongly triggered in irq
Hi Pratyush, on 2016/4/4 13:17, Pratyush Anand wrote: > Hi Li, > > On 31/03/2016:08:45:05 PM, Li Bin wrote: >> Hi Pratyush, >> >> on 2016/3/21 18:24, Pratyush Anand wrote: >>> On 21/03/2016:08:37:50 AM, He Kuang wrote: >>>> On arm64, watchpoint handler enables single-step to bypass the next >>>> instruction for not recursive enter. If an irq is triggered right >>>> after the watchpoint, a single-step will be wrongly triggered in irq >>>> handler, which causes the watchpoint address not stepped over and >>>> system hang. >>> >>> Does patch [1] resolves this issue as well? I hope it should. Patch[1] has >>> still >>> not been sent for review. Your test result will be helpful. >>> >>> ~Pratyush >>> >>> [1] >>> https://github.com/pratyushanand/linux/commit/7623c8099ac22eaa00e7e0f52430f7a4bd154652 >> >> This patch did not consider that, when excetpion return, the singlestep flag >> should be restored, otherwise the right singlestep will not triggered. >> Right? > > Yes, you are right, and there are other problems as well. Will Deacon pointed > out [1] that kernel debugging is per-cpu rather than per-task. So, I did > thought > of a per-cpu implementation by introducing a new element "flags" in struct > pt_regs. But even with that I see issues. For example: > - While executing single step instruction, we get a data abort > - In the kernel_entry of data abort we disable single stepping based on > "flags" > bit field > - While handling data abort we receive anther interrupt, so we are again in > kernel_entry (for el1_irq). Single stepping will be disabled again (although > it does not matter). > > Now the issue is that, what condition should be verified in kernel_exit for > enabling single step again? In the above scenario, kernel_exit for el1_irq > should not enable single stepping, but how to prevent that elegantly? The condition for kernel_entry to disable the single step is that MDSCR_EL1.SS has been set. 
And only when the corresponding kernel_entry has disabled the single step, the kernel_exit should enable it, but the kernel_exit of single-step exception should be handled specially, that when disable single step in single-step exception handler, flag of pt_regs stored in stack should be cleard to prevent to be re-enabled by kernel_exit. Thanks, Li Bin --- arch/arm64/include/asm/assembler.h | 11 +++ arch/arm64/include/asm/debug-monitors.h |2 +- arch/arm64/include/asm/ptrace.h |2 ++ arch/arm64/kernel/asm-offsets.c |1 + arch/arm64/kernel/debug-monitors.c |3 ++- arch/arm64/kernel/entry.S | 18 ++ arch/arm64/kernel/hw_breakpoint.c |2 +- arch/arm64/kernel/kgdb.c|2 +- 8 files changed, 37 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 70f7b9e..56a4335 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -60,6 +60,17 @@ msr daifclr, #8 .endm + .macro disable_step, orig + bic \orig, \orig, #1 + msr mdscr_el1, \orig + .endm + + .macro enable_step, orig + disable_dbg + orr \orig, \orig, #1 + msr mdscr_el1, \orig + .endm + .macro disable_step_tsk, flgs, tmp tbz \flgs, #TIF_SINGLESTEP, 9990f mrs \tmp, mdscr_el1 diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index 2fcb9b7..50f114c 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -115,7 +115,7 @@ void user_rewind_single_step(struct task_struct *task); void user_fastforward_single_step(struct task_struct *task); void kernel_enable_single_step(struct pt_regs *regs); -void kernel_disable_single_step(void); +void kernel_disable_single_step(struct pt_regs *regs); int kernel_active_single_step(void); #ifdef CONFIG_HAVE_HW_BREAKPOINT diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index a307eb6..da00cd8 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -113,6 
+113,8 @@ struct pt_regs { u64 sp; u64 pc; u64 pstate; + u64 flags; + u64 padding; }; }; u64 orig_x0; diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 3ae6b31..7936ba9 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -56,6 +56,7 @@ int main(void) DEFINE(S_COMPAT_SP, offsetof(struct p
Re: [PATCH 2/2] arm64: Fix watchpoint recursion when single-step is wrongly triggered in irq
Hi Pratyush, on 2016/3/21 18:24, Pratyush Anand wrote: > On 21/03/2016:08:37:50 AM, He Kuang wrote: >> On arm64, watchpoint handler enables single-step to bypass the next >> instruction for not recursive enter. If an irq is triggered right >> after the watchpoint, a single-step will be wrongly triggered in irq >> handler, which causes the watchpoint address not stepped over and >> system hang. > > Does patch [1] resolves this issue as well? I hope it should. Patch[1] has > still > not been sent for review. Your test result will be helpful. > > ~Pratyush > > [1] > https://github.com/pratyushanand/linux/commit/7623c8099ac22eaa00e7e0f52430f7a4bd154652 This patch did not consider that, when the exception returns, the singlestep flag should be restored; otherwise the right singlestep will not be triggered. Right? Thanks, Li Bin > > ___ > linux-arm-kernel mailing list > linux-arm-ker...@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-arm-kernel > > . >
[PATCH] kernel/Makefile: remove the useless CFLAGS_REMOVE_cgroup-debug.o
The file cgroup-debug.c was removed by commit fe6934354f8e (cgroups: move the cgroup debug subsys into cgroup.c to access internal state). This left the now-useless line CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE) behind in kernel/Makefile. Signed-off-by: Li Bin --- kernel/Makefile |3 +-- 1 files changed, 1 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile b/kernel/Makefile index 53abf00..baa55e5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -14,8 +14,7 @@ obj-y = fork.o exec_domain.o panic.o \ obj-$(CONFIG_MULTIUSER) += groups.o ifdef CONFIG_FUNCTION_TRACER -# Do not trace debug files and internal ftrace files -CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE) +# Do not trace internal ftrace files CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE) endif -- 1.7.1
Re: [PATCH 5/5] x86: ftrace: fix the comments for ftrace_modify_code_direct
on 2015/12/6 6:52, Steven Rostedt wrote: > On Sat, 5 Dec 2015 18:12:57 +0100 (CET) > Thomas Gleixner wrote: > >> On Fri, 4 Dec 2015, Li Bin wrote: >>> --- a/arch/x86/kernel/ftrace.c >>> +++ b/arch/x86/kernel/ftrace.c >>> @@ -106,13 +106,12 @@ ftrace_modify_code_direct(unsigned long ip, unsigned >>> const char *old_code, >>> unsigned char replaced[MCOUNT_INSN_SIZE]; >>> >>> /* >>> -* Note: Due to modules and __init, code can >>> -* disappear and change, we need to protect against faulting >>> -* as well as code changing. We do this by using the >>> -* probe_kernel_* functions. >>> -* >>> -* No real locking needed, this code is run through >>> -* kstop_machine, or before SMP starts. >>> +* Note: >>> +* We are paranoid about modifying text, as if a bug were to happen, it >>> +* could cause us to read or write to someplace that could cause harm. >>> +* Carefully read and modify the code with aarch64_insn_*() which uses >> aarch64_insn_() is related to x86 in which way? >> >> > The original comment is incorrect and we discussed this with the > arm64 code, and said the other archs need the comment updated as > well. But it seems that Li Bin just cut and pasted the arm64 patch for > the other archs, or at least with x86 (haven't looked at the others > yet). This needs to be fixed. Sorry for my mistake, I will modify it. Thanks, Li Bin > -- Steve > > . > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 5/5] x86: ftrace: fix the comments for ftrace_modify_code_direct
There is no need to worry about module and __init text disappearing case, because that ftrace has a module notifier that is called when a module is being unloaded and before the text goes away and this code grabs the ftrace_lock mutex and removes the module functions from the ftrace list, such that it will no longer do any modifications to that module's text, the update to make functions be traced or not is done under the ftrace_lock mutex as well. And by now, __init section codes should not been modified by ftrace, because it is black listed in recordmcount.c and ignored by ftrace. Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: x...@kernel.org Suggested-by: Steven Rostedt Signed-off-by: Li Bin --- arch/x86/kernel/ftrace.c | 12 +--- 1 files changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 311bcf3..db0f6b1 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -106,13 +106,11 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code, unsigned char replaced[MCOUNT_INSN_SIZE]; /* -* Note: Due to modules and __init, code can -* disappear and change, we need to protect against faulting -* as well as code changing. We do this by using the -* probe_kernel_* functions. -* -* No real locking needed, this code is run through -* kstop_machine, or before SMP starts. +* Note: +* We are paranoid about modifying text, as if a bug was to happen, it +* could cause us to read or write to someplace that could cause harm. +* Carefully read and modify the code with probe_kernel_*(), and make +* sure what we read is what we expected it to be before modifying it. */ /* read the text we want to modify */ -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 3/5] powerpc: ftrace: fix the comments for ftrace_modify_code
There is no need to worry about module and __init text disappearing case, because that ftrace has a module notifier that is called when a module is being unloaded and before the text goes away and this code grabs the ftrace_lock mutex and removes the module functions from the ftrace list, such that it will no longer do any modifications to that module's text, the update to make functions be traced or not is done under the ftrace_lock mutex as well. And by now, __init section codes should not been modified by ftrace, because it is black listed in recordmcount.c and ignored by ftrace. Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: linuxppc-...@lists.ozlabs.org Suggested-by: Steven Rostedt Signed-off-by: Li Bin --- arch/powerpc/kernel/ftrace.c | 12 +--- 1 files changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index 44d4d8e..83e3c88 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -47,13 +47,11 @@ ftrace_modify_code(unsigned long ip, unsigned int old, unsigned int new) unsigned int replaced; /* -* Note: Due to modules and __init, code can -* disappear and change, we need to protect against faulting -* as well as code changing. We do this by using the -* probe_kernel_* functions. -* -* No real locking needed, this code is run through -* kstop_machine, or before SMP starts. +* Note: +* We are paranoid about modifying text, as if a bug was to happen, it +* could cause us to read or write to someplace that could cause harm. +* Carefully read and modify the code with probe_kernel_*(), and make +* sure what we read is what we expected it to be before modifying it. */ /* read the text we want to modify */ -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 4/5] sh: ftrace: fix the comments for ftrace_modify_code
There is no need to worry about module and __init text disappearing case, because that ftrace has a module notifier that is called when a module is being unloaded and before the text goes away and this code grabs the ftrace_lock mutex and removes the module functions from the ftrace list, such that it will no longer do any modifications to that module's text, the update to make functions be traced or not is done under the ftrace_lock mutex as well. And by now, __init section codes should not been modified by ftrace, because it is black listed in recordmcount.c and ignored by ftrace. Cc: linux...@vger.kernel.org Suggested-by: Steven Rostedt Signed-off-by: Li Bin --- arch/sh/kernel/ftrace.c | 12 +--- 1 files changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/sh/kernel/ftrace.c b/arch/sh/kernel/ftrace.c index 079d70e..38993e0 100644 --- a/arch/sh/kernel/ftrace.c +++ b/arch/sh/kernel/ftrace.c @@ -212,13 +212,11 @@ static int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char replaced[MCOUNT_INSN_SIZE]; /* -* Note: Due to modules and __init, code can -* disappear and change, we need to protect against faulting -* as well as code changing. We do this by using the -* probe_kernel_* functions. -* -* No real locking needed, this code is run through -* kstop_machine, or before SMP starts. +* Note: +* We are paranoid about modifying text, as if a bug was to happen, it +* could cause us to read or write to someplace that could cause harm. +* Carefully read and modify the code with probe_kernel_*(), and make +* sure what we read is what we expected it to be before modifying it. */ /* read the text we want to modify */ -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 1/5] ia64: ftrace: fix the comments for ftrace_modify_code
There is no need to worry about module and __init text disappearing case, because that ftrace has a module notifier that is called when a module is being unloaded and before the text goes away and this code grabs the ftrace_lock mutex and removes the module functions from the ftrace list, such that it will no longer do any modifications to that module's text, the update to make functions be traced or not is done under the ftrace_lock mutex as well. And by now, __init section codes should not been modified by ftrace, because it is black listed in recordmcount.c and ignored by ftrace. Cc: Tony Luck Cc: Fenghua Yu Cc: linux-i...@vger.kernel.org Suggested-by: Steven Rostedt Signed-off-by: Li Bin --- arch/ia64/kernel/ftrace.c | 12 +--- 1 files changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/ia64/kernel/ftrace.c b/arch/ia64/kernel/ftrace.c index 3b0c2aa..cee411e 100644 --- a/arch/ia64/kernel/ftrace.c +++ b/arch/ia64/kernel/ftrace.c @@ -97,13 +97,11 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char replaced[MCOUNT_INSN_SIZE]; /* -* Note: Due to modules and __init, code can -* disappear and change, we need to protect against faulting -* as well as code changing. We do this by using the -* probe_kernel_* functions. -* -* No real locking needed, this code is run through -* kstop_machine, or before SMP starts. +* Note: +* We are paranoid about modifying text, as if a bug was to happen, it +* could cause us to read or write to someplace that could cause harm. +* Carefully read and modify the code with probe_kernel_*(), and make +* sure what we read is what we expected it to be before modifying it. */ if (!do_check) -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 0/5] ftrace: fix ftrace misleading comments for arch using it
Fix the following similar misleading comments of ftrace for arch ia64/metag/powerpc/sh/x86: Note: Due to modules and __init, code can disappear and change, we need to protect against faulting as well as code changing. We do this by using the probe_kernel_* functions. No real locking needed, this code is run through kstop_machine, or before SMP starts. Cc: Tony Luck Cc: Fenghua Yu Cc: linux-i...@vger.kernel.org Cc: James Hogan Cc: linux-me...@vger.kernel.org Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: linuxppc-...@lists.ozlabs.org Cc: linux...@vger.kernel.org Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: x...@kernel.org Li Bin (5): ia64: ftrace: fix the comments for ftrace_modify_code metag: ftrace: fix the comments for ftrace_modify_code powerpc: ftrace: fix the comments for ftrace_modify_code sh: ftrace: fix the comments for ftrace_modify_code x86: ftrace: fix the comments for ftrace_modify_code_direct arch/ia64/kernel/ftrace.c| 12 +--- arch/metag/kernel/ftrace.c | 11 +-- arch/powerpc/kernel/ftrace.c | 12 +--- arch/sh/kernel/ftrace.c | 12 +--- arch/x86/kernel/ftrace.c | 12 +--- 5 files changed, 25 insertions(+), 34 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 2/5] metag: ftrace: fix the comments for ftrace_modify_code
There is no need to worry about module and __init text disappearing case, because that ftrace has a module notifier that is called when a module is being unloaded and before the text goes away and this code grabs the ftrace_lock mutex and removes the module functions from the ftrace list, such that it will no longer do any modifications to that module's text, the update to make functions be traced or not is done under the ftrace_lock mutex as well. And by now, __init section codes should not been modified by ftrace, because it is black listed in recordmcount.c and ignored by ftrace. Cc: James Hogan Cc: linux-me...@vger.kernel.org Suggested-by: Steven Rostedt Signed-off-by: Li Bin --- arch/metag/kernel/ftrace.c | 11 +-- 1 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/metag/kernel/ftrace.c b/arch/metag/kernel/ftrace.c index ed1d685..ac8c039 100644 --- a/arch/metag/kernel/ftrace.c +++ b/arch/metag/kernel/ftrace.c @@ -54,12 +54,11 @@ static int ftrace_modify_code(unsigned long pc, unsigned char *old_code, unsigned char replaced[MCOUNT_INSN_SIZE]; /* -* Note: Due to modules and __init, code can -* disappear and change, we need to protect against faulting -* as well as code changing. -* -* No real locking needed, this code is run through -* kstop_machine. +* Note: +* We are paranoid about modifying text, as if a bug was to happen, it +* could cause us to read or write to someplace that could cause harm. +* Carefully read and modify the code with probe_kernel_*(), and make +* sure what we read is what we expected it to be before modifying it. */ /* read the text we want to modify */ -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 5/5] x86: ftrace: fix the comments for ftrace_modify_code_direct
There is no need to worry about module and __init text disappearing case, because that ftrace has a module notifier that is called when a module is being unloaded and before the text goes away and this code grabs the ftrace_lock mutex and removes the module functions from the ftrace list, such that it will no longer do any modifications to that module's text, the update to make functions be traced or not is done under the ftrace_lock mutex as well. And by now, __init section codes should not been modified by ftrace, because it is black listed in recordmcount.c and ignored by ftrace. Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: x...@kernel.org Suggested-by: Steven Rostedt Signed-off-by: Li Bin --- arch/x86/kernel/ftrace.c | 13 ++--- 1 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 311bcf3..c2987e8 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -106,13 +106,12 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code, unsigned char replaced[MCOUNT_INSN_SIZE]; /* -* Note: Due to modules and __init, code can -* disappear and change, we need to protect against faulting -* as well as code changing. We do this by using the -* probe_kernel_* functions. -* -* No real locking needed, this code is run through -* kstop_machine, or before SMP starts. +* Note: +* We are paranoid about modifying text, as if a bug were to happen, it +* could cause us to read or write to someplace that could cause harm. +* Carefully read and modify the code with aarch64_insn_*() which uses +* probe_kernel_*(), and make sure what we read is what we expected it +* to be before modifying it. */ /* read the text we want to modify */ -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 3/5] powerpc: ftrace: fix the comments for ftrace_modify_code
There is no need to worry about module and __init text disappearing case, because that ftrace has a module notifier that is called when a module is being unloaded and before the text goes away and this code grabs the ftrace_lock mutex and removes the module functions from the ftrace list, such that it will no longer do any modifications to that module's text, the update to make functions be traced or not is done under the ftrace_lock mutex as well. And by now, __init section codes should not been modified by ftrace, because it is black listed in recordmcount.c and ignored by ftrace. Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: linuxppc-...@lists.ozlabs.org Suggested-by: Steven Rostedt Signed-off-by: Li Bin --- arch/powerpc/kernel/ftrace.c | 13 ++--- 1 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index 44d4d8e..c6452b2 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -47,13 +47,12 @@ ftrace_modify_code(unsigned long ip, unsigned int old, unsigned int new) unsigned int replaced; /* -* Note: Due to modules and __init, code can -* disappear and change, we need to protect against faulting -* as well as code changing. We do this by using the -* probe_kernel_* functions. -* -* No real locking needed, this code is run through -* kstop_machine, or before SMP starts. +* Note: +* We are paranoid about modifying text, as if a bug were to happen, it +* could cause us to read or write to someplace that could cause harm. +* Carefully read and modify the code with aarch64_insn_*() which uses +* probe_kernel_*(), and make sure what we read is what we expected it +* to be before modifying it. 
*/ /* read the text we want to modify */ -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 4/5] sh: ftrace: fix the comments for ftrace_modify_code
There is no need to worry about module and __init text disappearing case, because that ftrace has a module notifier that is called when a module is being unloaded and before the text goes away and this code grabs the ftrace_lock mutex and removes the module functions from the ftrace list, such that it will no longer do any modifications to that module's text, the update to make functions be traced or not is done under the ftrace_lock mutex as well. And by now, __init section codes should not been modified by ftrace, because it is black listed in recordmcount.c and ignored by ftrace. Cc: linux...@vger.kernel.org Suggested-by: Steven Rostedt Signed-off-by: Li Bin --- arch/sh/kernel/ftrace.c | 13 ++--- 1 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/sh/kernel/ftrace.c b/arch/sh/kernel/ftrace.c index 079d70e..b696f92 100644 --- a/arch/sh/kernel/ftrace.c +++ b/arch/sh/kernel/ftrace.c @@ -212,13 +212,12 @@ static int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char replaced[MCOUNT_INSN_SIZE]; /* -* Note: Due to modules and __init, code can -* disappear and change, we need to protect against faulting -* as well as code changing. We do this by using the -* probe_kernel_* functions. -* -* No real locking needed, this code is run through -* kstop_machine, or before SMP starts. +* Note: +* We are paranoid about modifying text, as if a bug were to happen, it +* could cause us to read or write to someplace that could cause harm. +* Carefully read and modify the code with aarch64_insn_*() which uses +* probe_kernel_*(), and make sure what we read is what we expected it +* to be before modifying it. */ /* read the text we want to modify */ -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/5] metag: ftrace: fix the comments for ftrace_modify_code
There is no need to worry about module and __init text disappearing case, because that ftrace has a module notifier that is called when a module is being unloaded and before the text goes away and this code grabs the ftrace_lock mutex and removes the module functions from the ftrace list, such that it will no longer do any modifications to that module's text, the update to make functions be traced or not is done under the ftrace_lock mutex as well. And by now, __init section codes should not been modified by ftrace, because it is black listed in recordmcount.c and ignored by ftrace. Cc: James Hogan Cc: linux-me...@vger.kernel.org Suggested-by: Steven Rostedt Signed-off-by: Li Bin --- arch/metag/kernel/ftrace.c | 12 ++-- 1 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/metag/kernel/ftrace.c b/arch/metag/kernel/ftrace.c index ed1d685..e5d71b1 100644 --- a/arch/metag/kernel/ftrace.c +++ b/arch/metag/kernel/ftrace.c @@ -54,12 +54,12 @@ static int ftrace_modify_code(unsigned long pc, unsigned char *old_code, unsigned char replaced[MCOUNT_INSN_SIZE]; /* -* Note: Due to modules and __init, code can -* disappear and change, we need to protect against faulting -* as well as code changing. -* -* No real locking needed, this code is run through -* kstop_machine. +* Note: +* We are paranoid about modifying text, as if a bug were to happen, it +* could cause us to read or write to someplace that could cause harm. +* Carefully read and modify the code with aarch64_insn_*() which uses +* probe_kernel_*(), and make sure what we read is what we expected it +* to be before modifying it. */ /* read the text we want to modify */ -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 0/5] ftrace: fix ftrace misleading comments for arch using it
Fix the following similar misleading comments of ftrace for arch ia64/metag/powerpc/sh/x86: Note: Due to modules and __init, code can disappear and change, we need to protect against faulting as well as code changing. We do this by using the probe_kernel_* functions. No real locking needed, this code is run through kstop_machine, or before SMP starts. Cc: Tony Luck Cc: Fenghua Yu Cc: linux-i...@vger.kernel.org Cc: James Hogan Cc: linux-me...@vger.kernel.org Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: linuxppc-...@lists.ozlabs.org Cc: linux...@vger.kernel.org Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: x...@kernel.org Li Bin (5): ia64: ftrace: fix the comments for ftrace_modify_code metag: ftrace: fix the comments for ftrace_modify_code powerpc: ftrace: fix the comments for ftrace_modify_code sh: ftrace: fix the comments for ftrace_modify_code x86: ftrace: fix the comments for ftrace_modify_code_direct arch/ia64/kernel/ftrace.c| 13 ++--- arch/metag/kernel/ftrace.c | 12 ++-- arch/powerpc/kernel/ftrace.c | 13 ++--- arch/sh/kernel/ftrace.c | 13 ++--- arch/x86/kernel/ftrace.c | 13 ++--- 5 files changed, 30 insertions(+), 34 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/5] ia64: ftrace: fix the comments for ftrace_modify_code
There is no need to worry about module and __init text disappearing case, because that ftrace has a module notifier that is called when a module is being unloaded and before the text goes away and this code grabs the ftrace_lock mutex and removes the module functions from the ftrace list, such that it will no longer do any modifications to that module's text, the update to make functions be traced or not is done under the ftrace_lock mutex as well. And by now, __init section codes should not been modified by ftrace, because it is black listed in recordmcount.c and ignored by ftrace. Cc: Tony Luck Cc: Fenghua Yu Cc: linux-i...@vger.kernel.org Suggested-by: Steven Rostedt Signed-off-by: Li Bin --- arch/ia64/kernel/ftrace.c | 13 ++--- 1 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/ia64/kernel/ftrace.c b/arch/ia64/kernel/ftrace.c index 3b0c2aa..a48a3f4 100644 --- a/arch/ia64/kernel/ftrace.c +++ b/arch/ia64/kernel/ftrace.c @@ -97,13 +97,12 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char replaced[MCOUNT_INSN_SIZE]; /* -* Note: Due to modules and __init, code can -* disappear and change, we need to protect against faulting -* as well as code changing. We do this by using the -* probe_kernel_* functions. -* -* No real locking needed, this code is run through -* kstop_machine, or before SMP starts. +* Note: +* We are paranoid about modifying text, as if a bug were to happen, it +* could cause us to read or write to someplace that could cause harm. +* Carefully read and modify the code with aarch64_insn_*() which uses +* probe_kernel_*(), and make sure what we read is what we expected it +* to be before modifying it. */ if (!do_check) -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 2/2] arm64: ftrace: fix the comments for ftrace_modify_code
I will also update the comment for the other arch that using the similar description, such as ia64/metag/powerpc/sh/x86. Thanks, Li Bin on 2015/12/4 10:50, Steven Rostedt wrote: > On Fri, 4 Dec 2015 10:18:39 +0800 > Li Bin wrote: > >> There is no need to worry about module text disappearing case, >> because that ftrace has a module notifier that is called when >> a module is being unloaded and before the text goes away, and this >> code grabs the ftrace_lock mutex and removes the module functions >> from the ftrace list, such that it will no longer do any >> modifications to that module's text. >> The update to make functions be traced or not is done under the >> ftrace_lock mutex as well. >> >> Signed-off-by: Li Bin >> --- >> arch/arm64/kernel/ftrace.c |5 + >> 1 files changed, 1 insertions(+), 4 deletions(-) >> >> diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c >> index 9669b33..ee91c0c 100644 >> --- a/arch/arm64/kernel/ftrace.c >> +++ b/arch/arm64/kernel/ftrace.c >> @@ -29,12 +29,9 @@ static int ftrace_modify_code(unsigned long pc, u32 old, >> u32 new, >> >> /* >> * Note: >> - * Due to modules and __init, code can disappear and change, >> + * Due to __init, code can disappear and change, > Init code should not be modified either because it is black listed in > recordmcount.c. > > I say just change the comment to be something like: > > We are paranoid about modifying text, as if a bug were to happen, it > could cause us to read or write to someplace that could cause harm. > Carefully read and modify the code with aarch64_insn_*() which uses > probe_kernel_*(), and make sure what we read is what we expected it to > be before modifying it. > > -- Steve > > >> * we need to protect against faulting as well as code changing. >> * We do this by aarch64_insn_*() which use the probe_kernel_*(). >> - * >> - * No lock is held here because all the modifications are run >> - * through stop_machine(). 
>> */ >> if (validate) { >> if (aarch64_insn_read((void *)pc, &replaced)) > > . > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 1/2] arm64: ftrace: stop using kstop_machine to enable/disable tracing
For ftrace on arm64, kstop_machine — which is hugely disruptive to a running system — is not needed to convert nops to ftrace calls or back, because the instructions to be modified (NOP, B or BL) are all safe instructions that permit "concurrent modification and execution of instructions", i.e. they can be executed by one thread of execution as they are being modified by another thread of execution without requiring explicit synchronization. Signed-off-by: Li Bin Reviewed-by: Steven Rostedt --- arch/arm64/kernel/ftrace.c |5 + 1 files changed, 5 insertions(+), 0 deletions(-) diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index c851be7..9669b33 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -93,6 +93,11 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, return ftrace_modify_code(pc, old, new, true); } +void arch_ftrace_update_code(int command) +{ + ftrace_modify_all_code(command); +} + int __init ftrace_dyn_arch_init(void) { return 0; -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 0/2] arm64: stop using kstop_machine for ftrace
v2: Based on the comments from Will and Steve, 1. Modify the commit message 2. Fix the misleading comments for ftrace_modify_code v3: Modify the comments again based on the comment from Steve. Link: https://lkml.org/lkml/2015/12/3/422 Li Bin (2): arm64: ftrace: stop using kstop_machine to enable/disable tracing arm64: ftrace: fix the comments for ftrace_modify_code arch/arm64/kernel/ftrace.c | 16 ++-- 1 files changed, 10 insertions(+), 6 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v3 2/2] arm64: ftrace: fix the comments for ftrace_modify_code
There is no need to worry about module and __init text disappearing case, because that ftrace has a module notifier that is called when a module is being unloaded and before the text goes away and this code grabs the ftrace_lock mutex and removes the module functions from the ftrace list, such that it will no longer do any modifications to that module's text, the update to make functions be traced or not is done under the ftrace_lock mutex as well. And by now, __init section codes should not been modified by ftrace, because it is black listed in recordmcount.c and ignored by ftrace. Suggested-by: Steven Rostedt Signed-off-by: Li Bin --- arch/arm64/kernel/ftrace.c | 11 +-- 1 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 9669b33..8f7005b 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -29,12 +29,11 @@ static int ftrace_modify_code(unsigned long pc, u32 old, u32 new, /* * Note: -* Due to modules and __init, code can disappear and change, -* we need to protect against faulting as well as code changing. -* We do this by aarch64_insn_*() which use the probe_kernel_*(). -* -* No lock is held here because all the modifications are run -* through stop_machine(). +* We are paranoid about modifying text, as if a bug were to happen, it +* could cause us to read or write to someplace that could cause harm. +* Carefully read and modify the code with aarch64_insn_*() which uses +* probe_kernel_*(), and make sure what we read is what we expected it +* to be before modifying it. */ if (validate) { if (aarch64_insn_read((void *)pc, &replaced)) -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 2/2] arm64: ftrace: fix the comments for ftrace_modify_code
on 2015/12/4 10:50, Steven Rostedt wrote: > On Fri, 4 Dec 2015 10:18:39 +0800 > Li Bin wrote: > >> There is no need to worry about module text disappearing case, >> because that ftrace has a module notifier that is called when >> a module is being unloaded and before the text goes away, and this >> code grabs the ftrace_lock mutex and removes the module functions >> from the ftrace list, such that it will no longer do any >> modifications to that module's text. >> The update to make functions be traced or not is done under the >> ftrace_lock mutex as well. >> >> Signed-off-by: Li Bin >> --- >> arch/arm64/kernel/ftrace.c |5 + >> 1 files changed, 1 insertions(+), 4 deletions(-) >> >> diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c >> index 9669b33..ee91c0c 100644 >> --- a/arch/arm64/kernel/ftrace.c >> +++ b/arch/arm64/kernel/ftrace.c >> @@ -29,12 +29,9 @@ static int ftrace_modify_code(unsigned long pc, u32 old, >> u32 new, >> >> /* >> * Note: >> - * Due to modules and __init, code can disappear and change, >> + * Due to __init, code can disappear and change, > Init code should not be modified either because it is black listed in > recordmcount.c. > > I say just change the comment to be something like: > > We are paranoid about modifying text, as if a bug were to happen, it > could cause us to read or write to someplace that could cause harm. > Carefully read and modify the code with aarch64_insn_*() which uses > probe_kernel_*(), and make sure what we read is what we expected it to > be before modifying it. Ok, I will modify it. Thanks, Li Bin > -- Steve > > >> * we need to protect against faulting as well as code changing. >> * We do this by aarch64_insn_*() which use the probe_kernel_*(). >> - * >> - * No lock is held here because all the modifications are run >> - * through stop_machine(). >> */ >> if (validate) { >> if (aarch64_insn_read((void *)pc, &replaced)) > > . 
> -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 0/2] arm64: stop using kstop_machine for ftrace
v2: Based on the comments from Will and Steve, 1. Modify the commit message 2. Fix the misleading comments for ftrace_modify_code Link: https://lkml.org/lkml/2015/12/3/422 Li Bin (2): arm64: ftrace: stop using kstop_machine to enable/disable tracing arm64: ftrace: fix the comments for ftrace_modify_code arch/arm64/kernel/ftrace.c | 10 ++ 1 files changed, 6 insertions(+), 4 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 1/2] arm64: ftrace: stop using kstop_machine to enable/disable tracing
For ftrace on arm64, kstop_machine — which is hugely disruptive to a running system — is not needed to convert nops to ftrace calls or back, because the instructions to be modified (NOP, B or BL) are all safe instructions that permit "concurrent modification and execution of instructions", i.e. they can be executed by one thread of execution as they are being modified by another thread of execution without requiring explicit synchronization. Signed-off-by: Li Bin --- arch/arm64/kernel/ftrace.c |5 + 1 files changed, 5 insertions(+), 0 deletions(-) diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index c851be7..9669b33 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -93,6 +93,11 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, return ftrace_modify_code(pc, old, new, true); } +void arch_ftrace_update_code(int command) +{ + ftrace_modify_all_code(command); +} + int __init ftrace_dyn_arch_init(void) { return 0; -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2 2/2] arm64: ftrace: fix the comments for ftrace_modify_code
There is no need to worry about module text disappearing case, because that ftrace has a module notifier that is called when a module is being unloaded and before the text goes away, and this code grabs the ftrace_lock mutex and removes the module functions from the ftrace list, such that it will no longer do any modifications to that module's text. The update to make functions be traced or not is done under the ftrace_lock mutex as well. Signed-off-by: Li Bin --- arch/arm64/kernel/ftrace.c |5 + 1 files changed, 1 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 9669b33..ee91c0c 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -29,12 +29,9 @@ static int ftrace_modify_code(unsigned long pc, u32 old, u32 new, /* * Note: -* Due to modules and __init, code can disappear and change, +* Due to __init, code can disappear and change, * we need to protect against faulting as well as code changing. * We do this by aarch64_insn_*() which use the probe_kernel_*(). -* -* No lock is held here because all the modifications are run -* through stop_machine(). */ if (validate) { if (aarch64_insn_read((void *)pc, &replaced)) -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2] ftrace: fix a typo in comment
s/ARCH_SUPPORT_FTARCE_OPS/ARCH_SUPPORTS_FTRACE_OPS Signed-off-by: Li Bin --- kernel/trace/ftrace.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3f743b1..0033e05 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5195,7 +5195,7 @@ out: * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS. * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved. * An architecture can pass partial regs with ftrace_ops and still - * set the ARCH_SUPPORT_FTARCE_OPS. + * set the ARCH_SUPPORTS_FTRACE_OPS. */ #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] ftrace: fix a typo in comment
s/ARCH_SUPPORT_FTARCE_OPS/ARCH_SUPPORTS_FTARCE_OPS Signed-off-by: Li Bin --- kernel/trace/ftrace.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3f743b1..eb4a881 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5195,7 +5195,7 @@ out: * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS. * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved. * An architecture can pass partial regs with ftrace_ops and still - * set the ARCH_SUPPORT_FTARCE_OPS. + * set the ARCH_SUPPORTS_FTARCE_OPS. */ #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] livepatch: fix race between enabled_store() and klp_unregister_patch()
There is a potential race as following: CPU0 | CPU1 -|--- enabled_store() | klp_unregister_patch() | |-mutex_lock(&klp_mutex); |-mutex_lock(&klp_mutex);| |-klp_free_patch(); | |-mutex_unlock(&klp_mutex); |-[process the patch's state]| |-mutex_unlock(&klp_mutex) | Fix this race condition by adding klp_is_patch_registered() check in enabled_store() after get the lock klp_mutex. Signed-off-by: Li Bin --- kernel/livepatch/core.c |5 + 1 files changed, 5 insertions(+), 0 deletions(-) diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index db545cb..50af971 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -614,6 +614,11 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, mutex_lock(&klp_mutex); + if (!klp_is_patch_registered(patch)) { + ret = -EINVAL; + goto err; + } + if (val == patch->state) { /* already in requested state */ ret = -EINVAL; -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] arm64: ftrace: stop using kstop_machine to enable/disable tracing
On arm64, kstop_machine which is hugely disruptive to a running system is not needed to convert nops to ftrace calls or back, because the modified code is a single 32-bit instruction which cannot cross cache (or page) boundaries, and the used str instruction is single-copy atomic. Cc: # 3.18+ Signed-off-by: Li Bin --- arch/arm64/kernel/ftrace.c |5 + 1 files changed, 5 insertions(+), 0 deletions(-) diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index c851be7..9669b33 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -93,6 +93,11 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, return ftrace_modify_code(pc, old, new, true); } +void arch_ftrace_update_code(int command) +{ + ftrace_modify_all_code(command); +} + int __init ftrace_dyn_arch_init(void) { return 0; -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 0/3] recordmcount: bugfix and amend
Li Bin (3): recordmcount: fix endianness handling bug for nop_mcount recordmcount: x86: assign a meaningful value to rel_type_nop recordmcount: arm64: replace the ignored mcount call into nop scripts/recordmcount.c | 26 +- scripts/recordmcount.h | 5 +++-- 2 files changed, 28 insertions(+), 3 deletions(-) -- 1.7.12.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/3] recordmcount: x86: assign a meaningful value to rel_type_nop
Although, the default value of rel_type_nop is zero, and the value of R_386_NONE/R_X86_64_NONE is zero too, but it should be assigned a meaningful value explicitly, otherwise it looks confused. Assign R_386_NONE to rel_type_nop for 386, assign R_X86_64_NONE to rel_type_nop for x86_64. Signed-off-by: Li Bin --- scripts/recordmcount.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 3d1984e..8cc020b 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -345,6 +345,7 @@ do_file(char const *const fname) break; case EM_386: reltype = R_386_32; + rel_type_nop = R_386_NONE; make_nop = make_nop_x86; ideal_nop = ideal_nop5_x86_32; mcount_adjust_32 = -1; @@ -371,6 +372,7 @@ do_file(char const *const fname) make_nop = make_nop_x86; ideal_nop = ideal_nop5_x86_64; reltype = R_X86_64_64; + rel_type_nop = R_X86_64_NONE; mcount_adjust_64 = -1; break; } /* end switch */ -- 1.7.12.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 3/3] recordmcount: arm64: replace the ignored mcount call into nop
By now, the recordmcount only records the function that in following sections: .text/.ref.text/.sched.text/.spinlock.text/.irqentry.text/ .kprobes.text/.text.unlikely For the function that not in these sections, the call mcount will be in place and not be replaced when kernel boot up. And it will bring performance overhead, such as do_mem_abort (in .exception.text section). This patch make the call mcount to nop for this case in recordmcount. Cc: # 3.18+ Signed-off-by: Li Bin --- scripts/recordmcount.c | 24 +++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 8cc020b..698768b 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -42,6 +42,7 @@ #ifndef EM_AARCH64 #define EM_AARCH64 183 +#define R_AARCH64_NONE 0 #define R_AARCH64_ABS64257 #endif @@ -160,6 +161,22 @@ static int make_nop_x86(void *map, size_t const offset) return 0; } +static unsigned char ideal_nop4_arm64[4] = {0x1f, 0x20, 0x03, 0xd5}; +static int make_nop_arm64(void *map, size_t const offset) +{ + uint32_t *ptr; + + ptr = map + offset; + /* bl <_mcount> is 0x9400 before relocation */ + if (*ptr != 0x9400) + return -1; + + /* Convert to nop */ + ulseek(fd_map, offset, SEEK_SET); + uwrite(fd_map, ideal_nop, 4); + return 0; +} + /* * Get the whole file as a programming convenience in order to avoid * malloc+lseek+read+free of many pieces. 
If successful, then mmap @@ -354,7 +371,12 @@ do_file(char const *const fname) altmcount = "__gnu_mcount_nc"; break; case EM_AARCH64: -reltype = R_AARCH64_ABS64; gpfx = '_'; break; + reltype = R_AARCH64_ABS64; + make_nop = make_nop_arm64; + rel_type_nop = R_AARCH64_NONE; + ideal_nop = ideal_nop4_arm64; + gpfx = '_'; + break; case EM_IA_64: reltype = R_IA64_IMM64; gpfx = '_'; break; case EM_METAG: reltype = R_METAG_ADDR32; altmcount = "_mcount_wrapper"; -- 1.7.12.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/3] recordmcount: fix endianness handling bug for nop_mcount
In nop_mcount, shdr->sh_offset and relp->r_offset should handle endianness properly, otherwise it will trigger Segmentation fault if the recordmcount main and file.o have different endianness. Cc: # 3.0+ Signed-off-by: Li Bin --- scripts/recordmcount.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/recordmcount.h b/scripts/recordmcount.h index 49b582a..dda9dba 100644 --- a/scripts/recordmcount.h +++ b/scripts/recordmcount.h @@ -376,8 +376,9 @@ static void nop_mcount(Elf_Shdr const *const relhdr, mcountsym = get_mcountsym(sym0, relp, str0); if (mcountsym == Elf_r_sym(relp) && !is_fake_mcount(relp)) { - if (make_nop) - ret = make_nop((void *)ehdr, shdr->sh_offset + relp->r_offset); + if (make_nop) { + ret = make_nop((void *)ehdr, _w(shdr->sh_offset) + _w(relp->r_offset)); + } if (warn_on_notrace_sect && !once) { printf("Section %s has mcount callers being ignored\n", txtname); -- 1.7.12.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] recordmcount: x86: assign a meaningful value to rel_type_nop
Although, the default value of rel_type_nop is zero, and the value of R_386_NONE/R_X86_64_NONE is zero too, but it should be assigned a meaningful value explicitly, otherwise it looks confused. Assign R_386_NONE to rel_type_nop for 386, assign R_X86_64_NONE to rel_type_nop for x86_64. Signed-off-by: Li Bin --- scripts/recordmcount.c |2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 3d1984e..8cc020b 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -345,6 +345,7 @@ do_file(char const *const fname) break; case EM_386: reltype = R_386_32; + rel_type_nop = R_386_NONE; make_nop = make_nop_x86; ideal_nop = ideal_nop5_x86_32; mcount_adjust_32 = -1; @@ -371,6 +372,7 @@ do_file(char const *const fname) make_nop = make_nop_x86; ideal_nop = ideal_nop5_x86_64; reltype = R_X86_64_64; + rel_type_nop = R_X86_64_NONE; mcount_adjust_64 = -1; break; } /* end switch */ -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] recordmcount: arm64: replace the ignored mcount call into nop
By now, the recordmcount only records the function that in following sections: .text/.ref.text/.sched.text/.spinlock.text/.irqentry.text/ .kprobes.text/.text.unlikely For the function that not in these sections, the call mcount will be in place and not be replaced when kernel boot up. And it will bring performance overhead, such as do_mem_abort (in .exception.text section). This patch make the call mcount to nop for this case in recordmcount. Signed-off-by: Li Bin --- scripts/recordmcount.c | 23 ++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 3d1984e..f697226 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -42,6 +42,7 @@ #ifndef EM_AARCH64 #define EM_AARCH64 183 +#define R_AARCH64_NONE 0 #define R_AARCH64_ABS64257 #endif @@ -160,6 +161,21 @@ static int make_nop_x86(void *map, size_t const offset) return 0; } +static unsigned char ideal_nop4_arm64[4] = {0x1f, 0x20, 0x03, 0xd5}; +static int make_nop_arm64(void *map, size_t const offset) +{ + uint32_t *ptr; + + ptr = map + offset; + if (*ptr != 0x9400) + return -1; + + /* Convert to nop */ + ulseek(fd_map, offset, SEEK_SET); + uwrite(fd_map, ideal_nop, 4); + return 0; +} + /* * Get the whole file as a programming convenience in order to avoid * malloc+lseek+read+free of many pieces. 
If successful, then mmap @@ -353,7 +369,12 @@ do_file(char const *const fname) altmcount = "__gnu_mcount_nc"; break; case EM_AARCH64: -reltype = R_AARCH64_ABS64; gpfx = '_'; break; + reltype = R_AARCH64_ABS64; + make_nop = make_nop_arm64; + rel_type_nop = R_AARCH64_NONE; + ideal_nop = ideal_nop4_arm64; + gpfx = '_'; + break; case EM_IA_64: reltype = R_IA64_IMM64; gpfx = '_'; break; case EM_METAG: reltype = R_METAG_ADDR32; altmcount = "_mcount_wrapper"; -- 1.7.12.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] arm64: ftrace: function_graph: dump real return addr in call trace
When using function graph tracer, the printed call trace will be as following that has many ftrace_graph_caller (return_to_handler - 4), which is been placed in the stack by ftrace_graph tracer to replace the real return address. [ 198.582568] Call trace: [ 198.583313] [] next_tgid+0x30/0x100 [ 198.584359] [] ftrace_graph_caller+0x6c/0x70 [ 198.585503] [] ftrace_graph_caller+0x6c/0x70 [ 198.586574] [] ftrace_graph_caller+0x6c/0x70 [ 198.587660] [] ftrace_graph_caller+0x6c/0x70 [ 198.588896] Code: aa0003f5 2a0103f4 b4000102 91004043 (885f7c60) [ 198.591092] ---[ end trace 6a346f8f20949ac8 ]--- This patch fix it, and dump the real return address in the call trace. Signed-off-by: Li Bin --- arch/arm64/kernel/traps.c | 31 +++ 1 file changed, 31 insertions(+) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index f93aae5..4a4e679 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -143,9 +143,38 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) set_fs(fs); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void print_ftrace_graph_addr(unsigned long addr, + struct task_struct *tsk, + unsigned long sp, int *graph) +{ + unsigned long ret_addr; + int index = tsk->curr_ret_stack; + + if (addr != ((unsigned long)return_to_handler - 4)) + return; + + if (!tsk->ret_stack || index < *graph) + return; + + index -= *graph; + ret_addr = tsk->ret_stack[index].ret; + + dump_backtrace_entry(ret_addr - 4, sp); + + (*graph)++; +} +#else +static inline void print_ftrace_graph_addr(unsigned long addr, + struct task_struct *tsk, + unsigned long sp, int *graph) +{} +#endif + static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; + int graph = 0; pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); @@ -177,7 +206,9 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) ret = unwind_frame(&frame); if (ret < 0) break; + dump_backtrace_entry(where, frame.sp); + 
print_ftrace_graph_addr(where, tsk, frame.sp, &graph); } } -- 1.7.12.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] arm64: ftrace: fix function_graph tracer panic
When function graph tracer is enabled, the following operation will trigger panic: mount -t debugfs nodev /sys/kernel echo next_tgid > /sys/kernel/tracing/set_ftrace_filter echo function_graph > /sys/kernel/tracing/current_tracer ls /proc/ [ cut here ] [ 198.501417] Unable to handle kernel paging request at virtual address cb88537fdc8ba316 [ 198.506126] pgd = ffc008f79000 [ 198.509363] [cb88537fdc8ba316] *pgd=488c6003, *pud=488c6003, *pmd= [ 198.517726] Internal error: Oops: 9405 [#1] SMP [ 198.518798] Modules linked in: [ 198.520582] CPU: 1 PID: 1388 Comm: ls Tainted: G [ 198.521800] Hardware name: linux,dummy-virt (DT) [ 198.522852] task: ffc0fa9e8000 ti: ffc0f9ab task.ti: ffc0f9ab [ 198.524306] PC is at next_tgid+0x30/0x100 [ 198.525205] LR is at return_to_handler+0x0/0x20 [ 198.526090] pc : [] lr : [] pstate: 6145 [ 198.527392] sp : ffc0f9ab3d40 [ 198.528084] x29: ffc0f9ab3d40 x28: ffc0f9ab [ 198.529406] x27: ffc000d6a000 x26: ffc000b786e8 [ 198.530659] x25: ffc0002a1900 x24: ffc0faf16c00 [ 198.531942] x23: ffc0f9ab3ea0 x22: 0002 [ 198.533202] x21: ffc000d85050 x20: 0002 [ 198.534446] x19: 0002 x18: [ 198.535719] x17: 0049fa08 x16: ffc000242efc [ 198.537030] x15: 007fa472b54c x14: ff00 [ 198.538347] x13: ffc0fada84a0 x12: 0001 [ 198.539634] x11: ffc0f9ab3d70 x10: ffc0f9ab3d70 [ 198.540915] x9 : ffc907c0 x8 : ffc0f9ab3d40 [ 198.542215] x7 : 002e330f08f0 x6 : 0015 [ 198.543508] x5 : 0f08 x4 : ffc0f9835ec0 [ 198.544792] x3 : cb88537fdc8ba316 x2 : cb88537fdc8ba306 [ 198.546108] x1 : 0002 x0 : ffc000d85050 [ 198.547432] [ 198.547920] Process ls (pid: 1388, stack limit = 0xffc0f9ab0020) [ 198.549170] Stack: (0xffc0f9ab3d40 to 0xffc0f9ab4000) [ 198.582568] Call trace: [ 198.583313] [] next_tgid+0x30/0x100 [ 198.584359] [] ftrace_graph_caller+0x6c/0x70 [ 198.585503] [] ftrace_graph_caller+0x6c/0x70 [ 198.586574] [] ftrace_graph_caller+0x6c/0x70 [ 198.587660] [] ftrace_graph_caller+0x6c/0x70 [ 198.588896] Code: aa0003f5 2a0103f4 b4000102 91004043 (885f7c60) [ 
198.591092] ---[ end trace 6a346f8f20949ac8 ]--- This is because when using function graph tracer, if the traced function return value is in multi regs ([0x-07]), return_to_handler may corrupt them. So in return_to_handler, the parameter regs should be protected properly. Cc: # 3.18+ Signed-off-by: Li Bin --- arch/arm64/kernel/entry-ftrace.S | 22 -- 1 files changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index 08cafc5..0f03a8f 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -178,6 +178,24 @@ ENTRY(ftrace_stub) ENDPROC(ftrace_stub) #ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* save return value regs*/ + .macro save_return_regs + sub sp, sp, #64 + stp x0, x1, [sp] + stp x2, x3, [sp, #16] + stp x4, x5, [sp, #32] + stp x6, x7, [sp, #48] + .endm + + /* restore return value regs*/ + .macro restore_return_regs + ldp x0, x1, [sp] + ldp x2, x3, [sp, #16] + ldp x4, x5, [sp, #32] + ldp x6, x7, [sp, #48] + add sp, sp, #64 + .endm + /* * void ftrace_graph_caller(void) * @@ -204,11 +222,11 @@ ENDPROC(ftrace_graph_caller) * only when CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST is enabled. */ ENTRY(return_to_handler) - str x0, [sp, #-16]! + save_return_regs mov x0, x29 // parent's fp bl ftrace_return_to_handler// addr = ftrace_return_to_hander(fp); mov x30, x0 // restore the original return address - ldr x0, [sp], #16 + restore_return_regs ret END(return_to_handler) #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] livepatch: add sysfs interface /sys/kernel/livepatch/state
On 2015/6/17 21:20, Miroslav Benes wrote: > On Wed, 17 Jun 2015, Li Bin wrote: > >> On 2015/6/17 16:13, Miroslav Benes wrote: >>> On Wed, 17 Jun 2015, Li Bin wrote: >> >>> The list of applied patches can be obtained just by 'ls >>> /sys/kernel/livepatch' and their state is in enabled attribute in each >>> respective patch (no, you cannot obtain the order in the stack). >> >> But why we cannot obtain it? I think We indeed need the stack order when we >> will disable one patch, at least, we can find out whether it is on the top of >> the stack if failed to disable one patch. > > I meant with the current means. It is correct that we do not export > information about stacking order anywhere. > > What we do in kGraft is that there is something like refcount for each > patch. When the patch is being applied the refcount of all the previous > patches is increased. Only the patch with the refcount equal to 0 can be > removed. This information is exported and gives one a clue about the > order. > It sounds good, but the information is limited that cannot show the stack order, right? (The refcount of all the disabled patch is equal to 0, if being enable one disabled patch, the stack order is also needed.) refcount Patch --- 3 patch1(enabled) 2 patch2(enabled) 1 patch3(enabled) 0 patch4(enabled) 0 patch5(disabled) 0 patch6(disabled) Unless the refcount is allowed to be less than 0, then when the patch is being disabled the refcount of all the patches is decreased, when the patch is being enabled the refcount of all patches is increased. Only the patch with the refcount equal to 0 can be disabled and only equal to -1 can be enabled, and only less or equal to 0 can be removed (that the livepatch does not support right now). 
refcount Patch --- 3 patch1(enabled) 2 patch2(enabled) 1 patch3(enabled) 0 patch4(enabled) -1 patch5(disabled) -2 patch6(disabled) Thanks, Li Bin > So if there is a need to have something like this there would certainly > be a way (or ways to be precise) how to do it. The question is if we need > it right now. > > Regards, > Miroslav > > . > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] livepatch: add sysfs interface /sys/kernel/livepatch/state
On 2015/6/17 16:13, Miroslav Benes wrote: > On Wed, 17 Jun 2015, Li Bin wrote: > >> The added sysfs interface /sys/kernel/livepatch/state is read-only, >> it shows the patches that have been applied, incluing the stack index >> and the state of each patch. >> >> $ cat /sys/kernel/livepatch/state >> IndexPatch State >> --- >> 1klp_test1 enabled >> 2klp_test2 enabled >> 3klp_test3 enabled >> --- >> >> $ echo 0 > /sys/kernel/livepatch/klp_test3/enabled >> $ cat /sys/kernel/livepatch/state >> IndexPatch State >> --- >> 1klp_test1 enabled >> 2klp_test2 enabled >> 3klp_test3 disabled >> --- >> >> Signed-off-by: Li Bin >> --- > > Hi, > > I think we should comply with sysfs policy and keep 'one value per > attribute' as mentioned in the Documentation/filesystems/sysfs.txt: > > " > Attributes should be ASCII text files, preferably with only one value > per file. It is noted that it may not be efficient to contain only one > value per file, so it is socially acceptable to express an array of > values of the same type. > > Mixing types, expressing multiple lines of data, and doing fancy > formatting of data is heavily frowned upon. Doing these things may get > you publicly humiliated and your code rewritten without notice. > " Hi Miroslav Benes, Thanks for your comments. > The list of applied patches can be obtained just by 'ls > /sys/kernel/livepatch' and their state is in enabled attribute in each > respective patch (no, you cannot obtain the order in the stack). But why we cannot obtain it? I think We indeed need the stack order when we will disable one patch, at least, we can find out whether it is on the top of the stack if failed to disable one patch. Thanks, Li Bin > > Thanks, > > Miroslav Benes > SUSE Labs > > . > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] livepatch: add sysfs interface /sys/kernel/livepatch/state
The added sysfs interface /sys/kernel/livepatch/state is read-only, it shows the patches that have been applied, including the stack index and the state of each patch. $ cat /sys/kernel/livepatch/state Index Patch State --- 1 klp_test1 enabled 2 klp_test2 enabled 3 klp_test3 enabled --- $ echo 0 > /sys/kernel/livepatch/klp_test3/enabled $ cat /sys/kernel/livepatch/state Index Patch State --- 1 klp_test1 enabled 2 klp_test2 enabled 3 klp_test3 disabled --- Signed-off-by: Li Bin --- Documentation/ABI/testing/sysfs-kernel-livepatch |9 + kernel/livepatch/core.c | 41 +- 2 files changed, 49 insertions(+), 1 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-livepatch b/Documentation/ABI/testing/sysfs-kernel-livepatch index 5bf42a8..01c64da 100644 --- a/Documentation/ABI/testing/sysfs-kernel-livepatch +++ b/Documentation/ABI/testing/sysfs-kernel-livepatch @@ -8,6 +8,15 @@ Description: The /sys/kernel/livepatch directory contains subdirectories for each loaded live patch module. +What: /sys/kernel/livepatch/state +Date: Jun 2015 +KernelVersion: 4.1.0 +Contact: live-patch...@vger.kernel.org +Description: + The state file is read-only and shows the patches that have + been applied, including the patch stack index and state of + each patch. 
+ What: /sys/kernel/livepatch/ Date: Nov 2014 KernelVersion: 3.19.0 diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 3f9f1d6..5d004f3 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -604,6 +604,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * Sysfs Interface * * /sys/kernel/livepatch + * /sys/kernel/livepatch/state * /sys/kernel/livepatch/ * /sys/kernel/livepatch//enabled * /sys/kernel/livepatch// @@ -1008,6 +1009,36 @@ static struct notifier_block klp_module_nb = { .priority = INT_MIN+1, /* called late but before ftrace notifier */ }; +static ssize_t state_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct klp_patch *patch; + unsigned long size = 0; + int index = 0; + size += snprintf(buf+size, PAGE_SIZE-size-1, "%-5s\t%-26s\t%-8s\n", "Index", "Patch", "State"); + size += snprintf(buf+size, PAGE_SIZE-size-1, "---\n"); + mutex_lock(&klp_mutex); + list_for_each_entry(patch, &klp_patches, list) { + size += snprintf(buf+size, PAGE_SIZE-size-1, "%-5d\t%-26s\t%-8s\n", + ++index, patch->mod->name, patch->state ? 
"enabled" : "disabled"); + } + mutex_unlock(&klp_mutex); + size += snprintf(buf+size, PAGE_SIZE-size-1, "---\n"); + + return size; +} +static struct kobj_attribute state_kobj_attr = __ATTR_RO(state); + +static struct attribute *klp_top_attrs[] = { + &state_kobj_attr.attr, + NULL +}; + +static struct kobj_type klp_ktype_top = { + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = klp_top_attrs, +}; + static int klp_init(void) { int ret; @@ -1022,12 +1053,20 @@ static int klp_init(void) if (ret) return ret; - klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); + klp_root_kobj = kzalloc(sizeof(*klp_root_kobj), GFP_KERNEL); if (!klp_root_kobj) { ret = -ENOMEM; goto unregister; } + ret = kobject_init_and_add(klp_root_kobj, &klp_ktype_top, + kernel_kobj, "livepatch"); + if (ret) { + kobject_put(klp_root_kobj); + klp_root_kobj = NULL; + goto unregister; + } + return 0; unregister: -- 1.7.7 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCH 0/5] livepatch: add support on arm64
On 2015/6/2 10:15, AKASHI Takahiro wrote: > On 05/30/2015 09:01 AM, Masami Hiramatsu wrote: >> On 2015/05/28 14:51, Li Bin wrote: >>> This patchset propose a method for gcc -mfentry feature(profile >>> before prologue) implementation for arm64, and propose the livepatch >>> implementation for arm64 based on this feature. >>> The gcc implementation about this feature will be post to the gcc >>> community soon. >>> >>> With this -mfentry feature, the entry of each function like: >>> >>> foo: >>> mov x9, x30 >>> bl __fentry__ >>> mov x30, x9 >>> [prologue] >>> ... >>> >>> The x9 is a callee corruptible register, and the __fentry__ function >>> is responsible to protect all registers, so it can be used to protect >>> the x30. And the added two instructions which is register mov operation >>> have ralatively small impact on performance. >> >> Hm, this implementation looks good to me :) >> This also enables us to KPROBES_ON_FTRACE too. > > Even if x9 is a callee-saved register, there is no way to restore its original > value in setting up a pt_regs in ftrace_reg_caller. Hi, Takahiro AKASHI Firstly, x9 is not a callee-saved but a caller-saved register(or being called corruptible register). Secondly, I think x9 is already protected properly, please reference the patch: [PATCH 1/5] livepatch: ftrace: arm64: Add support for DYNAMIC_FTRACE_WITH_REGS [PATCH 3/5] livepatch: ftrace: arm64: Add support for -mfentry on arm64 > It's not the right thing for KPROBES_ON_FTRACE, is it? > > Saving Link register in stack is not a big deal since the overhead of ftrace > is much bigger. Performance overhead is only one aspect of the problem, and more importantly, even worse is that it would break the arm64 ABI rules. Thanks, Li Bin > > -Takahiro AKASHI > > >> Thanks, >> >>> >>> This patchset has been tested on arm64 platform. 
>>> >>> Li Bin (4): >>>livepatch: ftrace: arm64: Add support for DYNAMIC_FTRACE_WITH_REGS >>>livepatch: ftrace: add ftrace_function_stub_ip function >>>livepatch: ftrace: arm64: Add support for -mfentry on arm64 >>>livepatch: arm64: add support for livepatch on arm64 >>> >>> Xie XiuQi (1): >>>livepatch: arm64: support relocation in a module >>> >>> arch/arm64/Kconfig |5 + >>> arch/arm64/include/asm/ftrace.h|9 + >>> arch/arm64/include/asm/livepatch.h | 45 + >>> arch/arm64/kernel/Makefile |1 + >>> arch/arm64/kernel/arm64ksyms.c |4 + >>> arch/arm64/kernel/entry-ftrace.S | 154 +++- >>> arch/arm64/kernel/ftrace.c | 28 +++- >>> arch/arm64/kernel/livepatch.c | 41 >>> arch/arm64/kernel/module.c | 355 >>> ++-- >>> include/linux/ftrace.h |1 + >>> kernel/livepatch/core.c| 17 ++- >>> kernel/trace/ftrace.c | 32 >>> scripts/recordmcount.pl|2 +- >>> 13 files changed, 508 insertions(+), 186 deletions(-) >>> create mode 100644 arch/arm64/include/asm/livepatch.h >>> create mode 100644 arch/arm64/kernel/livepatch.c >>> >>> -- >>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in >>> the body of a message to majord...@vger.kernel.org >>> More majordomo info at http://vger.kernel.org/majordomo-info.html >>> Please read the FAQ at http://www.tux.org/lkml/ >>> >> >> > -- > To unsubscribe from this list: send the line "unsubscribe live-patching" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > > . > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC PATCH 1/5] livepatch: ftrace: arm64: Add support for DYNAMIC_FTRACE_WITH_REGS
On 2015/5/29 15:14, Paul Bolle wrote: > On Thu, 2015-05-28 at 13:51 +0800, Li Bin wrote: >> --- a/arch/arm64/Kconfig >> +++ b/arch/arm64/Kconfig > >> select HAVE_DYNAMIC_FTRACE >> +select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE > > What's the point of "if HAVE_DYNAMIC_FTRACE" here? That test should > always evaluate to true, because HAVE_DYNAMIC_FTRACE is also selected. > Ah, I'll modify it. Thanks! Li Bin > > Paul Bolle > > > > > . > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 0/4] arm64: add livepatch support
On 2015/4/24 17:27, Jiri Kosina wrote: > On Fri, 24 Apr 2015, AKASHI Takahiro wrote: > >> This patchset enables livepatch support on arm64. >> >> Livepatch was merged in v4.0, and allows replacying a function dynamically >> based on ftrace framework, but it also requires -mfentry option of gcc. >> Currently arm64 gcc doesn't support it, but by adding a helper function to >> ftrace, we will be able to support livepatch on arch's which don't support >> this option. >> >> I submit this patchset as RFC since I'm not quite sure that I'm doing >> in the right way, or we should definitely support -fentry instead. > > I don't have arm64 cross-compiler handy, could you please copy/paste how > does function prologue, generated by gcc -pg on arm64 look like? > The function prologue on arm64 with gcc -pg look like as following: func: stp x29, x30, [sp, -48]! add x29, sp, 0 mov x1, x30 str w0, [x29,28] mov x0, x1 bl _mcount ... Thanks, Li Bin > Thanks, > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] livepatch: match function return value type with prototype
On 2015/5/26 15:32, Jiri Slaby wrote: > -BEGIN PGP SIGNED MESSAGE- > Hash: SHA256 > > On 05/26/2015, 06:46 AM, Minfei Huang wrote: >> On Tue, May 26, 2015 at 10:44 AM, Li Bin >> wrote: >>> The klp_is_module return type should be boolean. >>> >>> Signed-off-by: Li Bin --- >>> kernel/livepatch/core.c |2 +- 1 files changed, 1 >>> insertions(+), 1 deletions(-) >>> >>> diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c >>> index 284e269..30e9339 100644 --- a/kernel/livepatch/core.c +++ >>> b/kernel/livepatch/core.c @@ -78,7 +78,7 @@ static struct klp_ops >>> *klp_find_ops(unsigned long old_addr) >>> >>> static bool klp_is_module(struct klp_object *obj) { - >>> return obj->name; + return !!obj->name; } >> >> Hi, Bin. >> >> Does compile blame the prototype before you convert it? If not, I >> think we do not need to fix it, since there are a lot of places in >> kernel where we return point value as boolean. > > Yes, but the real reason is not that we use it on a lot of places, but > that it is standardized in ¶ 6.3.1.2: > When any scalar value is converted to _Bool, the result is 0 if the > value compares equal to 0; otherwise, the result is 1. > OK, I see, thank you! Li Bin > Along with the definition of scalar types: > Arithmetic types and pointer types are collectively called scalar types. 
> > thanks, > - -- > js > suse labs > -BEGIN PGP SIGNATURE- > Version: GnuPG v2 > > iQIcBAEBCAAGBQJVZCGXAAoJEL0lsQQGtHBJMCEP/AufUwrMFZ7KTD/pWKpjpDlu > fP8eflJ7iBxGtHXe9PGOsthY4kd1mB+kCo+54N1gks9bbnWF6P302hck5zrBHBdH > /wbo7YXhbHicUNAXZUzxfDh3nkKmR96CIeqZhMp2H1UBzmGRKzd6kVSdBwlbJ0/W > ZNcWmiIaALdr7aMEw+qEExV35kUdbJaqUcHKC9in3qQtlzCVQbZ5mCqURe+61ZTL > 9u/Sbf3vB+nJPzyC/8uSVAxF616PiPdgGvxRrrRrRH82JmRaVJFjsRJ1WeMNxOFt > s4gbNByNePmTG3SisqFFKQ6VJYyeEsxkeRKbvL2mW4IlzJjiWCbp6XHcv2/IzR9g > GdIu3Kgy5R0OBXTxMbb5VLVtDRZUYnD/HiOFVLdUiqI6HQ/MtdZGujnVhWAVGIse > BD8T+hTOQyt6Yk7evlEF+REOlAU8jvBvnq3PUfRlUjWso8w0giPx1Re6rQk4k39P > TmfIVrve79n2nR/OejeCAB3xBEDtFbKBg9I+5ONm1gogxz/+3o6mwfMGS8TSdNs9 > l4Kl/dhQhSW3aOmrclUoOZpcAzfTJSNCepTFq6+hKIbKibtnKs63wkLnGyL+XZSE > dEOWBC/B5YNC9VVAm20U8jAtUS5gw3n0NpXFTxOi0nb62IGY+mT+D5pPylVn31Rq > H7FDzgpZ7+lyvx+FDqpv > =UzJg > -END PGP SIGNATURE- > -- > To unsubscribe from this list: send the line "unsubscribe live-patching" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > > . > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCH 5/5] livepatch: arm64: support relocation in a module
From: Xie XiuQi This patch implement klp_write_module_reloc on arm64 platform. Signed-off-by: Xie XiuQi Signed-off-by: Li Bin --- arch/arm64/kernel/livepatch.c |7 +- arch/arm64/kernel/module.c| 355 + 2 files changed, 186 insertions(+), 176 deletions(-) diff --git a/arch/arm64/kernel/livepatch.c b/arch/arm64/kernel/livepatch.c index 2a55532..ad674f0 100644 --- a/arch/arm64/kernel/livepatch.c +++ b/arch/arm64/kernel/livepatch.c @@ -18,8 +18,11 @@ */ #include +#include #include +extern int static_relocate(struct module *mod, unsigned long type, + void * loc, unsigned long value); /** * klp_write_module_reloc() - write a relocation in a module * @mod: module in which the section to be modified is found @@ -33,6 +36,6 @@ int klp_write_module_reloc(struct module *mod, unsigned long type, unsigned long loc, unsigned long value) { - pr_err("lpc_write_module_reloc has not supported now\n"); - return -ENOSYS; + /* Perform the static relocation. */ + return static_relocate(mod, type, (void *)loc, value); } diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 67bf410..7781241 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -193,6 +193,182 @@ static int reloc_insn_imm(enum aarch64_reloc_op op, void *place, u64 val, return 0; } +int static_relocate(struct module *me, unsigned long type, void *loc, + unsigned long val) +{ + int ovf = 0; + bool overflow_check = true; + /* Perform the static relocation. */ + switch (type) { + /* Null relocations. */ + case R_ARM_NONE: + case R_AARCH64_NONE: + ovf = 0; + break; + + /* Data relocations. 
*/ + case R_AARCH64_ABS64: + overflow_check = false; + ovf = reloc_data(RELOC_OP_ABS, loc, val, 64); + break; + case R_AARCH64_ABS32: + ovf = reloc_data(RELOC_OP_ABS, loc, val, 32); + break; + case R_AARCH64_ABS16: + ovf = reloc_data(RELOC_OP_ABS, loc, val, 16); + break; + case R_AARCH64_PREL64: + overflow_check = false; + ovf = reloc_data(RELOC_OP_PREL, loc, val, 64); + break; + case R_AARCH64_PREL32: + ovf = reloc_data(RELOC_OP_PREL, loc, val, 32); + break; + case R_AARCH64_PREL16: + ovf = reloc_data(RELOC_OP_PREL, loc, val, 16); + break; + + /* MOVW instruction relocations. */ + case R_AARCH64_MOVW_UABS_G0_NC: + overflow_check = false; + case R_AARCH64_MOVW_UABS_G0: + ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, + AARCH64_INSN_IMM_16); + break; + case R_AARCH64_MOVW_UABS_G1_NC: + overflow_check = false; + case R_AARCH64_MOVW_UABS_G1: + ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, + AARCH64_INSN_IMM_16); + break; + case R_AARCH64_MOVW_UABS_G2_NC: + overflow_check = false; + case R_AARCH64_MOVW_UABS_G2: + ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, + AARCH64_INSN_IMM_16); + break; + case R_AARCH64_MOVW_UABS_G3: + /* We're using the top bits so we can't overflow. 
*/ + overflow_check = false; + ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 48, + AARCH64_INSN_IMM_16); + break; + case R_AARCH64_MOVW_SABS_G0: + ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 0, + AARCH64_INSN_IMM_MOVNZ); + break; + case R_AARCH64_MOVW_SABS_G1: + ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 16, + AARCH64_INSN_IMM_MOVNZ); + break; + case R_AARCH64_MOVW_SABS_G2: + ovf = reloc_insn_movw(RELOC_OP_ABS, loc, val, 32, + AARCH64_INSN_IMM_MOVNZ); + break; + case R_AARCH64_MOVW_PREL_G0_NC: + overflow_check = false; + ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, + AARCH64_INSN_IMM_MOVK); + break; + case R_AARCH64_MOVW_PREL_G0: + ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 0, + AARCH64_INSN_IMM_MOVNZ); + break; + case R_AARCH64_MOVW_PREL_G1_NC: + overflow_check = false; + ovf = reloc_insn_movw(RELOC_OP_PREL, loc, val, 16, + A
[RFC PATCH 4/5] livepatch: arm64: add support for livepatch on arm64
This patch add support for livepatch on arm64 based on the gcc -mfentry feature and the ftrace DYNAMIC_FTRACE_WITH_REGS feature. Signed-off-by: Li Bin --- arch/arm64/Kconfig |3 ++ arch/arm64/include/asm/livepatch.h | 45 arch/arm64/kernel/Makefile |1 + arch/arm64/kernel/livepatch.c | 38 ++ kernel/livepatch/core.c| 17 + 5 files changed, 99 insertions(+), 5 deletions(-) create mode 100644 arch/arm64/include/asm/livepatch.h create mode 100644 arch/arm64/kernel/livepatch.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7bb2468..8a3845c 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -85,6 +85,7 @@ config ARM64 select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE select HAVE_CONTEXT_TRACKING + select HAVE_LIVEPATCH help ARM 64-bit (AArch64) Linux support. @@ -159,6 +160,8 @@ source "init/Kconfig" source "kernel/Kconfig.freezer" +source "kernel/livepatch/Kconfig" + menu "Platform selection" config ARCH_EXYNOS diff --git a/arch/arm64/include/asm/livepatch.h b/arch/arm64/include/asm/livepatch.h new file mode 100644 index 000..1890413 --- /dev/null +++ b/arch/arm64/include/asm/livepatch.h @@ -0,0 +1,45 @@ +/* + * livepatch.h - arm64-specific Kernel Live Patching Core + * + * Copyright (C) 2014 Li Bin + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#ifndef _ASM_ARM64_LIVEPATCH_H +#define _ASM_ARM64_LIVEPATCH_H + +#include +#include + +#ifdef CONFIG_LIVEPATCH +static inline int klp_check_compiler_support(void) +{ +#ifndef CC_USING_FENTRY + return 1; +#endif + return 0; +} +extern int klp_write_module_reloc(struct module *mod, unsigned long type, + unsigned long loc, unsigned long value); + +static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long pc) +{ + regs->pc = pc; +} +#else +#error Live patching support is disabled; check CONFIG_LIVEPATCH +#endif + +#endif /* _ASM_ARM64_LIVEPATCH_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 426d076..30d307b 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -23,6 +23,7 @@ arm64-obj-$(CONFIG_COMPAT)+= sys32.o kuser32.o signal32.o \ sys_compat.o entry32.o \ ../../arm/kernel/opcodes.o arm64-obj-$(CONFIG_FUNCTION_TRACER)+= ftrace.o entry-ftrace.o +arm64-obj-$(CONFIG_LIVEPATCH) += livepatch.o arm64-obj-$(CONFIG_MODULES)+= arm64ksyms.o module.o arm64-obj-$(CONFIG_SMP)+= smp.o smp_spin_table.o topology.o arm64-obj-$(CONFIG_PERF_EVENTS)+= perf_regs.o diff --git a/arch/arm64/kernel/livepatch.c b/arch/arm64/kernel/livepatch.c new file mode 100644 index 000..2a55532 --- /dev/null +++ b/arch/arm64/kernel/livepatch.c @@ -0,0 +1,38 @@ +/* + * livepatch.c - arm64-specific Kernel Live Patching Core + * + * Copyright (C) 2014 Li Bin + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include +#include + +/** + * klp_write_module_reloc() - write a relocation in a module + * @mod: module in which the section to be modified is found + * @type: ELF relocation type (see asm/elf.h) + * @loc: address that the relocation should be written to + * @value: relocation value (sym address + addend) + * + * This function writes a relocation to the specified location for + * a particular module. + */ +int klp_write_module_reloc(struct module *mod, unsigned long type, +
[RFC PATCH 1/5] livepatch: ftrace: arm64: Add support for DYNAMIC_FTRACE_WITH_REGS
If ftrace_ops is registered with flag FTRACE_OPS_FL_SAVE_REGS, the arch that support DYNAMIC_FTRACE_WITH_REGS will pass a full set of pt_regs to the ftrace_ops callback function, which may read or change the value of the pt_regs and the pt_regs will be restored to support work flow redirection such as kernel live patching. This patch adds DYNAMIC_FTRACE_WITH_REGS feature support for arm64 architecture. Signed-off-by: Li Bin --- arch/arm64/Kconfig |1 + arch/arm64/include/asm/ftrace.h |4 ++ arch/arm64/kernel/entry-ftrace.S | 95 ++ arch/arm64/kernel/ftrace.c | 28 ++- 4 files changed, 126 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7796af4..ea435c9 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -57,6 +57,7 @@ config ARM64 select HAVE_DMA_ATTRS select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE + select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_TRACER diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h index c5534fa..a7722b9 100644 --- a/arch/arm64/include/asm/ftrace.h +++ b/arch/arm64/include/asm/ftrace.h @@ -16,6 +16,10 @@ #define MCOUNT_ADDR((unsigned long)_mcount) #define MCOUNT_INSN_SIZE AARCH64_INSN_SIZE +#ifdef CONFIG_DYNAMIC_FTRACE +#define ARCH_SUPPORTS_FTRACE_OPS 1 +#endif + #ifndef __ASSEMBLY__ #include diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index 08cafc5..fde793b 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -12,6 +12,8 @@ #include #include #include +#include +#include /* * Gcc with -pg will put the following code in the beginning of each function: @@ -50,11 +52,37 @@ mov x29, sp .endm + /* save parameter registers & corruptible registers */ + .macro save_mcount_regs + sub sp, sp, #S_FRAME_SIZE + stp x0, x1, [sp] + stp x2, x3, [sp, #16] + stp x4, x5, [sp, #32] + stp x6, x7, [sp, #48] + 
stp x8, x9, [sp, #64] + stp x10, x11, [sp, #80] + stp x12, x13, [sp, #96] + stp x14, x15, [sp, #112] + .endm + .macro mcount_exit ldp x29, x30, [sp], #16 ret .endm + /* restore parameter registers & corruptible registers */ + .macro restore_mcount_regs + ldp x0, x1, [sp] + ldp x2, x3, [sp, #16] + ldp x4, x5, [sp, #32] + ldp x6, x7, [sp, #48] + ldp x8, x9, [sp, #64] + ldp x10, x11, [sp, #80] + ldp x12, x13, [sp, #96] + ldp x14, x15, [sp, #112] + add sp, sp, #S_FRAME_SIZE + .endm + .macro mcount_adjust_addr rd, rn sub \rd, \rn, #AARCH64_INSN_SIZE .endm @@ -97,6 +125,7 @@ */ ENTRY(_mcount) mcount_enter + save_mcount_regs adrpx0, ftrace_trace_function ldr x2, [x0, #:lo12:ftrace_trace_function] @@ -110,8 +139,10 @@ ENTRY(_mcount) #ifndef CONFIG_FUNCTION_GRAPH_TRACER skip_ftrace_call: // return; + restore_mcount_regs mcount_exit // } #else + restore_mcount_regs mcount_exit // return; // } skip_ftrace_call: @@ -127,6 +158,7 @@ skip_ftrace_call: cmp x0, x2 b.neftrace_graph_caller // ftrace_graph_caller(); + restore_mcount_regs mcount_exit #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ ENDPROC(_mcount) @@ -153,15 +185,20 @@ ENDPROC(_mcount) */ ENTRY(ftrace_caller) mcount_enter + save_mcount_regs + adrpx0, function_trace_op + ldr x2, [x0, #:lo12:function_trace_op] mcount_get_pc0 x0 // function's pc mcount_get_lr x1 // function's lr + mov x3, #0 .global ftrace_call ftrace_call: // tracer(pc, lr); nop // This will be replaced with "bl xxx" // where xxx can be any kind of tracer. +ftrace_return: #ifdef CONFIG_FUNCTION_GRAPH_TRACER .global ftrace_graph_call ftrace_graph_call: // ftrace_graph_caller(); @@ -169,8 +206,65 @@ ftrace_graph_call: // ftrace_graph_caller(); // "b ftrace_graph_caller" #endif + restore_mcount_regs mcount_exit ENDPROC(ftrace_caller) + +ENTRY(ftrace_regs_caller) + mcount_enter + save_mcount_regs + + /* Save the rest of pt_regs */ + stp x16, x17, [sp, #12
[RFC PATCH 3/5] livepatch: ftrace: arm64: Add support for -mfentry on arm64
This patch depends on the compiler's mfentry feature for arm64 that proposed by this patchset. If the kernel is compiled with this feature, the entry of each function like: foo: mov x9, x30 bl __fentry__ mov x30, x9 When -mfentry is used, the call is to '__fentry__' and not '_mcount' and is done before the function's stack frame is set up. So __fentry__ is responsibel to protect parameter registers and corruptible registers. Signed-off-by: Li Bin --- arch/arm64/Kconfig |1 + arch/arm64/include/asm/ftrace.h |5 +++ arch/arm64/kernel/arm64ksyms.c |4 ++ arch/arm64/kernel/entry-ftrace.S | 59 +++-- scripts/recordmcount.pl |2 +- 5 files changed, 66 insertions(+), 5 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index ea435c9..7bb2468 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -60,6 +60,7 @@ config ARM64 select HAVE_DYNAMIC_FTRACE_WITH_REGS if HAVE_DYNAMIC_FTRACE select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FENTRY select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER select HAVE_GENERIC_DMA_COHERENT diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h index a7722b9..08eab52 100644 --- a/arch/arm64/include/asm/ftrace.h +++ b/arch/arm64/include/asm/ftrace.h @@ -13,7 +13,11 @@ #include +#ifdef CC_USING_FENTRY +#define MCOUNT_ADDR((unsigned long)__fentry__) +#else #define MCOUNT_ADDR((unsigned long)_mcount) +#endif #define MCOUNT_INSN_SIZE AARCH64_INSN_SIZE #ifdef CONFIG_DYNAMIC_FTRACE @@ -24,6 +28,7 @@ #include extern void _mcount(unsigned long); +extern void __fentry__(unsigned long); extern void *return_address(unsigned int); struct dyn_arch_ftrace { diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c index a85843d..f0455d3 100644 --- a/arch/arm64/kernel/arm64ksyms.c +++ b/arch/arm64/kernel/arm64ksyms.c @@ -63,5 +63,9 @@ EXPORT_SYMBOL(change_bit); EXPORT_SYMBOL(test_and_change_bit); #ifdef CONFIG_FUNCTION_TRACER +#ifdef CC_USING_FENTRY 
+EXPORT_SYMBOL(__fentry__); +#else EXPORT_SYMBOL(_mcount); #endif +#endif diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index fde793b..18cfe5b 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -93,27 +93,57 @@ ldr \reg, [\reg] .endm + /* for instrumented function's parent */ + .macro fentry_get_parent_fp reg + ldr \reg, [x29] + .endm + /* for instrumented function */ .macro mcount_get_pc0 reg mcount_adjust_addr \reg, x30 .endm + /* for instrumented function */ + .macro fentry_get_pc0 reg + mcount_adjust_addr \reg, x30 + .endm + .macro mcount_get_pc reg ldr \reg, [x29, #8] mcount_adjust_addr \reg, \reg .endm + .macro fentry_get_pc reg + ldr \reg, [x29, #8] + mcount_adjust_addr \reg, \reg + .endm + .macro mcount_get_lr reg ldr \reg, [x29] ldr \reg, [\reg, #8] mcount_adjust_addr \reg, \reg .endm + .macro fentry_get_lr reg, base + ldr \reg, [\base, #72] //S_X9 + mcount_adjust_addr \reg, \reg + .endm + .macro mcount_get_lr_addr reg ldr \reg, [x29] add \reg, \reg, #8 .endm + .macro fentry_get_lr_addr reg, base + add \reg, \base, #72//S_X9 + .endm + +#ifdef CC_USING_FENTRY +#definefunction_hook __fentry__ +#else +#definefunction_hook _mcount +#endif + #ifndef CONFIG_DYNAMIC_FTRACE /* * void _mcount(unsigned long return_address) @@ -123,7 +153,7 @@ * - tracer function to probe instrumented function's entry, * - ftrace_graph_caller to set up an exit hook */ -ENTRY(_mcount) +ENTRY(function_hook) mcount_enter save_mcount_regs @@ -133,8 +163,13 @@ ENTRY(_mcount) cmp x0, x2 // if (ftrace_trace_function b.eqskip_ftrace_call// != ftrace_stub) { +#ifdef CC_USING_FENTRY + fentry_get_pc x0 // function's pc + fentry_get_lr x1, sp // function's lr (= parent's pc) +#else mcount_get_pc x0 // function's pc mcount_get_lr x1 // function's lr (= parent's pc) +#endif blr x2 // (*ftrace_trace_function)(pc, lr); #ifndef CONFIG_FUNCTION_GRAPH_TRACER @@ -161,7 +196,7 @@ skip_ftrace_call: restore_mcount_regs mcount_exit 
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ -ENDPROC(_mcount) +ENDPROC(function_hook
[RFC PATCH 2/5] livepatch: ftrace: add ftrace_function_stub_ip function
The function of ftrace_function_stub_ip is to convert the function address to the ftrace stub calling instruction address. This is needed for the platform that the compiler does not support "profile before prologue" feature and the profile calling instruction is not at the beginning of the function. EXAMPLES: ... stub_ip = ftrace_function_stub_ip(func_addr); ftrace_set_filter_ip(&ftrace_ops, stub_ip, 0, 0); register_ftrace_function(&ftrace_ops); ... Signed-off-by: Li Bin --- include/linux/ftrace.h |1 + kernel/trace/ftrace.c | 32 2 files changed, 33 insertions(+), 0 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1da6029..38a2811 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -360,6 +360,7 @@ struct dyn_ftrace { int ftrace_force_update(void); int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, int remove, int reset); +unsigned long ftrace_function_stub_ip(unsigned long addr); int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, int len, int reset); int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 02bece4..4d8692c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4161,6 +4161,38 @@ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, } EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); +/** + * ftrace_function_stub_ip - get the profile stub calling location by the + * function address. It is useful for the platform that doesn't place the + * function profiling call at the start of the function. + * @addr - the function address to get the stub ip + * + * It returns the corresponding profile stub calling location if founded, else + * return zero. 
+ */ +unsigned long ftrace_function_stub_ip(unsigned long addr) +{ + struct ftrace_page *pg; + struct dyn_ftrace *rec; + unsigned long ret = 0; + + mutex_lock(&ftrace_lock); + + do_for_each_ftrace_rec(pg, rec) { + unsigned long offset; + + if (kallsyms_lookup_size_offset(rec->ip, NULL, &offset) + && addr + offset == rec->ip) { + ret = rec->ip; + goto out_unlock; + } + } while_for_each_ftrace_rec() +out_unlock: + mutex_unlock(&ftrace_lock); + return ret; +} +EXPORT_SYMBOL_GPL(ftrace_function_stub_ip); + static int ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, int reset, int enable) -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC PATCH 0/5] livepatch: add support on arm64
This patchset proposes a method for gcc -mfentry feature (profile before prologue) implementation for arm64, and proposes the livepatch implementation for arm64 based on this feature. The gcc implementation of this feature will be posted to the gcc community soon. With this -mfentry feature, the entry of each function like: foo: mov x9, x30 bl __fentry__ mov x30, x9 [prologue] ... The x9 is a callee corruptible register, and the __fentry__ function is responsible to protect all registers, so it can be used to protect the x30. And the added two instructions which are register mov operations have relatively small impact on performance. This patchset has been tested on the arm64 platform. Li Bin (4): livepatch: ftrace: arm64: Add support for DYNAMIC_FTRACE_WITH_REGS livepatch: ftrace: add ftrace_function_stub_ip function livepatch: ftrace: arm64: Add support for -mfentry on arm64 livepatch: arm64: add support for livepatch on arm64 Xie XiuQi (1): livepatch: arm64: support relocation in a module arch/arm64/Kconfig |5 + arch/arm64/include/asm/ftrace.h|9 + arch/arm64/include/asm/livepatch.h | 45 + arch/arm64/kernel/Makefile |1 + arch/arm64/kernel/arm64ksyms.c |4 + arch/arm64/kernel/entry-ftrace.S | 154 +++- arch/arm64/kernel/ftrace.c | 28 +++- arch/arm64/kernel/livepatch.c | 41 arch/arm64/kernel/module.c | 355 ++-- include/linux/ftrace.h |1 + kernel/livepatch/core.c| 17 ++- kernel/trace/ftrace.c | 32 scripts/recordmcount.pl|2 +- 13 files changed, 508 insertions(+), 186 deletions(-) create mode 100644 arch/arm64/include/asm/livepatch.h create mode 100644 arch/arm64/kernel/livepatch.c -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 0/4] arm64: add livepatch support
On 2015/4/24 14:05, Masami Hiramatsu wrote: > (2015/04/24 12:24), Li Bin wrote: >> On 2015/4/24 10:44, AKASHI Takahiro wrote: >>> This patchset enables livepatch support on arm64. >>> >>> Livepatch was merged in v4.0, and allows replacying a function dynamically >>> based on ftrace framework, but it also requires -mfentry option of gcc. >>> Currently arm64 gcc doesn't support it, but by adding a helper function to >>> ftrace, we will be able to support livepatch on arch's which don't support >>> this option. >>> >> >> This is not correct for the case that the prologue of the old and new >> function >> is different. > > Hmm, is that possible to support -mfentry on arm64? > > Of course we can not call a function directly at the first > instruction of functions on arm, because it can overwrite > link register which stores caller address. However, we can > do "store link register to stack and branch with link" > on arm. That is actually almost same as -mfentry does :), > and that may not depend on the prologue. > Yes, but the method "store link register to stack" will break the arm64 ABI rules, and memory operations may bring performance cost. We have a gcc -mfentry implementation strategy for arm64, and based on this we implement the livepatch for arm64. I will post the patchset for review soon. Thank you, Li Bin > Thank you, > > >> Thanks, >> Li Bin >> >>> I submit this patchset as RFC since I'm not quite sure that I'm doing >>> in the right way, or we should definitely support -fentry instead. >>> >>> Please note that I tested the feature only with livepatch-sample, and >>> the code for DYNAMIC_TRACE_WITH_REGS is still rough-edged. 
>>> >>> To: Steven Rostedt >>> To: Ingo Molnar >>> To: Josh Poimboeuf >>> To: Seth Jennings >>> To: Jiri Kosina >>> To: Vojtech Pavlik >>> To: Catalin Marinas >>> To: Will Deacon >>> >>> AKASHI Takahiro (4): >>> ftrace: add a helper function for livepatch >>> livepatch: adjust a patched function's address >>> arm64: ftrace: add DYNAMIC_TRACE_WITH_REGS version >>> arm64: add livepatch support >>> >>> arch/arm64/Kconfig |4 ++ >>> arch/arm64/include/asm/ftrace.h|4 ++ >>> arch/arm64/include/asm/livepatch.h | 38 +++ >>> arch/arm64/kernel/Makefile |1 + >>> arch/arm64/kernel/entry-ftrace.S | 124 >>> >>> arch/arm64/kernel/ftrace.c | 24 ++- >>> arch/arm64/kernel/livepatch.c | 68 >>> arch/x86/include/asm/livepatch.h |5 ++ >>> include/linux/ftrace.h |2 + >>> include/linux/livepatch.h |2 + >>> kernel/livepatch/core.c| 16 +++-- >>> kernel/trace/ftrace.c | 26 >>> 12 files changed, 309 insertions(+), 5 deletions(-) >>> create mode 100644 arch/arm64/include/asm/livepatch.h >>> create mode 100644 arch/arm64/kernel/livepatch.c >>> >> >> >> > > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] livepatch: match function return value type with prototype
The klp_is_module return type should be boolean. Signed-off-by: Li Bin --- kernel/livepatch/core.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 284e269..30e9339 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -78,7 +78,7 @@ static struct klp_ops *klp_find_ops(unsigned long old_addr) static bool klp_is_module(struct klp_object *obj) { - return obj->name; + return !!obj->name; } static bool klp_is_object_loaded(struct klp_object *obj) -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
livepatch: match function return value type with prototype
The klp_is_module return type should be boolean. Signed-off-by: Li Bin --- kernel/livepatch/core.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 284e269..30e9339 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -78,7 +78,7 @@ static struct klp_ops *klp_find_ops(unsigned long old_addr) static bool klp_is_module(struct klp_object *obj) { - return obj->name; + return !!obj->name; } static bool klp_is_object_loaded(struct klp_object *obj) -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC 0/4] arm64: add livepatch support
On 2015/4/24 10:44, AKASHI Takahiro wrote: > This patchset enables livepatch support on arm64. > > Livepatch was merged in v4.0, and allows replacying a function dynamically > based on ftrace framework, but it also requires -mfentry option of gcc. > Currently arm64 gcc doesn't support it, but by adding a helper function to > ftrace, we will be able to support livepatch on arch's which don't support > this option. > This is not correct for the case that the prologue of the old and new function is different. Thanks, Li Bin > I submit this patchset as RFC since I'm not quite sure that I'm doing > in the right way, or we should definitely support -fentry instead. > > Please note that I tested the feature only with livepatch-sample, and > the code for DYNAMIC_TRACE_WITH_REGS is still rough-edged. > > To: Steven Rostedt > To: Ingo Molnar > To: Josh Poimboeuf > To: Seth Jennings > To: Jiri Kosina > To: Vojtech Pavlik > To: Catalin Marinas > To: Will Deacon > > AKASHI Takahiro (4): > ftrace: add a helper function for livepatch > livepatch: adjust a patched function's address > arm64: ftrace: add DYNAMIC_TRACE_WITH_REGS version > arm64: add livepatch support > > arch/arm64/Kconfig |4 ++ > arch/arm64/include/asm/ftrace.h|4 ++ > arch/arm64/include/asm/livepatch.h | 38 +++ > arch/arm64/kernel/Makefile |1 + > arch/arm64/kernel/entry-ftrace.S | 124 > > arch/arm64/kernel/ftrace.c | 24 ++- > arch/arm64/kernel/livepatch.c | 68 > arch/x86/include/asm/livepatch.h |5 ++ > include/linux/ftrace.h |2 + > include/linux/livepatch.h |2 + > kernel/livepatch/core.c| 16 +++-- > kernel/trace/ftrace.c | 26 > 12 files changed, 309 insertions(+), 5 deletions(-) > create mode 100644 arch/arm64/include/asm/livepatch.h > create mode 100644 arch/arm64/kernel/livepatch.c > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at 
http://www.tux.org/lkml/
[PATCH resend] kprobes: Update Documentation/kprobes.txt
The patch 125e564("Move Kconfig.instrumentation to arch/Kconfig and init/Kconfig") had removed the "Instrumentation Support" menu, and the configurations under this had be moved to "General setup". Update Documentation/kprobes.txt to reflect this change. Signed-off-by: Li Bin Acked-by: Masami Hiramatsu --- Documentation/kprobes.txt |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 1488b65..1f9b3e2 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -305,8 +305,8 @@ architectures: 3. Configuring Kprobes When configuring the kernel using make menuconfig/xconfig/oldconfig, -ensure that CONFIG_KPROBES is set to "y". Under "Instrumentation -Support", look for "Kprobes". +ensure that CONFIG_KPROBES is set to "y". Under "General setup", look +for "Kprobes". So that you can load and unload Kprobes-based instrumentation modules, make sure "Loadable module support" (CONFIG_MODULES) and "Module -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] livepatch: disable/enable_patch manners for interdependent patches
On 2015/1/22 21:05, Josh Poimboeuf wrote: > On Thu, Jan 22, 2015 at 05:54:23PM +0800, Li Bin wrote: >> On 2015/1/22 16:39, Li Bin wrote: >>> On 2015/1/22 11:51, Josh Poimboeuf wrote: >>>> On Thu, Jan 22, 2015 at 08:42:29AM +0800, Li Bin wrote: >>>>> On 2015/1/21 22:08, Jiri Kosina wrote: >>>>>> On Wed, 21 Jan 2015, Li Bin wrote: >>>>>> By this you limit the definition of the patch inter-dependency to just >>>>>> symbols. But that's not the only way how patches can depend on it other >>>>>> -- >>>>>> the dependency can be semantical. >>>>> >>>>> Yes, I agree with you. But I think the other dependencies such as >>>>> semantical >>>>> dependency should be judged by the user, like reverting a patch from git >>>>> repository. >>>>> Right? >>>> >>>> But with live patching, there are two users: the patch creator (who >>>> creates the patch module) and the end user (who loads it on their >>>> system). >>>> >>>> We can assume the patch creator knows what he's doing, but the end user >>>> doesn't always know or care about low level details like patch >>>> dependencies. The easiest and safest way to protect the end user is the >>>> current approach, which assumes that each patch depends on all >>>> previously applied patches. >>> >>> But then, the feature that disable patch dynamically is useless. >>> For example, if user find a bug be introduced by the last patch and disable >>> it directly, the new patch is no longer allowed from now unless enable the >>> old patch firstly but there is a risk window by this way. >>> >> >> Ok, in this case we can unregister the old patch firstly. >> But it seems that the feature that enable/disable patch dynamically indeed >> useless. (Its value is only for the last patch to enable or disable.) > > I wouldn't say it's useless... It's just a patch stack. 
If there's a > bug at the bottom of the stack, you can either: > > 1) push a new patch which does the opposite of the original patch >(similar to how "git revert" adds a new commit); > > 2) or pop everything off the stack and create a new stack to your >liking. > > It doesn't actually prevent you from doing what you want, it just makes > it less convenient (and more safe IMO). > I am not arguing the value of the patch stack manner, but the sysfs attribute 'enabled'. > I suppose an alternative would be to allow the patch creator to specify > patch dependencies (in addition to something like your patch to catch > the obvious dependencies). But dependencies are tricky and I'm not > really convinced that would be worth the added risk and code complexity. Yes, since we can not assume that end users know the low level details, but they have permission to unregister the stacktop patch, and this action will affect the subsequent patch without providing dependencies information. > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] livepatch: disable/enable_patch manners for interdependent patches
On 2015/1/22 16:39, Li Bin wrote: > On 2015/1/22 11:51, Josh Poimboeuf wrote: >> On Thu, Jan 22, 2015 at 08:42:29AM +0800, Li Bin wrote: >>> On 2015/1/21 22:08, Jiri Kosina wrote: >>>> On Wed, 21 Jan 2015, Li Bin wrote: >>>> By this you limit the definition of the patch inter-dependency to just >>>> symbols. But that's not the only way how patches can depend on it other -- >>>> the dependency can be semantical. >>> >>> Yes, I agree with you. But I think the other dependencies such as semantical >>> dependency should be judged by the user, like reverting a patch from git >>> repository. >>> Right? >> >> But with live patching, there are two users: the patch creator (who >> creates the patch module) and the end user (who loads it on their >> system). >> >> We can assume the patch creator knows what he's doing, but the end user >> doesn't always know or care about low level details like patch >> dependencies. The easiest and safest way to protect the end user is the >> current approach, which assumes that each patch depends on all >> previously applied patches. > > But then, the feature that disable patch dynamically is useless. > For example, if user find a bug be introduced by the last patch and disable > it directly, the new patch is no longer allowed from now unless enable the > old patch firstly but there is a risk window by this way. > Ok, in this case we can unregister the old patch firstly. But it seems that the feature that enable/disable patch dynamically indeed useless. (Its value is only for the last patch to enable or disable.) >> > > > -- > To unsubscribe from this list: send the line "unsubscribe live-patching" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > > . 
> -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/2] livepatch: Revert "livepatch: enforce patch stacking semantics"
On 2015/1/22 17:15, Miroslav Benes wrote: > On Thu, 22 Jan 2015, Li Bin wrote: > >> On 2015/1/21 17:07, Li Bin wrote: >>> This reverts commit 83a90bb1345767f0cb96d242fd8b9db44b2b0e17. >>> >>> The method that only allowing the topmost patch on the stack to be >>> enabled or disabled is unreasonable. Such as the following case: >>> >>> - do live patch1 >>> - disable patch1 >>> - do live patch2 //error >>> >>> Now, we will never be able to do new live patch unless disabing the >>> patch1 although there is no dependencies. >>> >> >> Correct the log: >> ... unless disabling the patch1 although ... --> >> ... unless enabling the patch1 firstly although ... > > Yes, but in such situation you can unregister patch1 and proceed with new > live patch. No problem. As Jiri has already written. Or are we missing > something? > Ok, that is before process with new live patch we must unregister the disabled patch1 previously. Is there need some message to avoid confusing the user? Thanks, Li Bin > Regards, > -- > Miroslav Benes > SUSE Labs > -- > To unsubscribe from this list: send the line "unsubscribe live-patching" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > > . > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] livepatch: disable/enable_patch manners for interdependent patches
On 2015/1/22 11:51, Josh Poimboeuf wrote: > On Thu, Jan 22, 2015 at 08:42:29AM +0800, Li Bin wrote: >> On 2015/1/21 22:08, Jiri Kosina wrote: >>> On Wed, 21 Jan 2015, Li Bin wrote: >>> By this you limit the definition of the patch inter-dependency to just >>> symbols. But that's not the only way how patches can depend on it other -- >>> the dependency can be semantical. >> >> Yes, I agree with you. But I think the other dependencies such as semantical >> dependency should be judged by the user, like reverting a patch from git >> repository. >> Right? > > But with live patching, there are two users: the patch creator (who > creates the patch module) and the end user (who loads it on their > system). > > We can assume the patch creator knows what he's doing, but the end user > doesn't always know or care about low level details like patch > dependencies. The easiest and safest way to protect the end user is the > current approach, which assumes that each patch depends on all > previously applied patches. But then, the feature that disable patch dynamically is useless. For example, if user find a bug be introduced by the last patch and disable it directly, the new patch is no longer allowed from now unless enable the old patch firstly but there is a risk window by this way. > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/2] livepatch: Revert "livepatch: enforce patch stacking semantics"
On 2015/1/21 17:07, Li Bin wrote: > This reverts commit 83a90bb1345767f0cb96d242fd8b9db44b2b0e17. > > The method that only allowing the topmost patch on the stack to be > enabled or disabled is unreasonable. Such as the following case: > > - do live patch1 > - disable patch1 > - do live patch2 //error > > Now, we will never be able to do new live patch unless disabing the > patch1 although there is no dependencies. > Correct the log: ... unless disabling the patch1 although ... --> ... unless enabling the patch1 firstly although ... > Signed-off-by: Li Bin > --- > kernel/livepatch/core.c | 10 -- > 1 files changed, 0 insertions(+), 10 deletions(-) > > diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c > index bc05d39..7861ed2 100644 > --- a/kernel/livepatch/core.c > +++ b/kernel/livepatch/core.c > @@ -468,11 +468,6 @@ static int __klp_disable_patch(struct klp_patch *patch) > struct klp_object *obj; > int ret; > > - /* enforce stacking: only the last enabled patch can be disabled */ > - if (!list_is_last(&patch->list, &klp_patches) && > - list_next_entry(patch, list)->state == KLP_ENABLED) > - return -EBUSY; > - > pr_notice("disabling patch '%s'\n", patch->mod->name); > > for (obj = patch->objs; obj->funcs; obj++) { > @@ -529,11 +524,6 @@ static int __klp_enable_patch(struct klp_patch *patch) > if (WARN_ON(patch->state != KLP_DISABLED)) > return -EINVAL; > > - /* enforce stacking: only the first disabled patch can be enabled */ > - if (patch->list.prev != &klp_patches && > - list_prev_entry(patch, list)->state == KLP_DISABLED) > - return -EBUSY; > - > pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n"); > add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK); > > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/2] livepatch: Revert "livepatch: enforce patch stacking semantics"
On 2015/1/21 22:36, Seth Jennings wrote: > On Wed, Jan 21, 2015 at 03:06:38PM +0100, Jiri Kosina wrote: >> On Wed, 21 Jan 2015, Li Bin wrote: >> >>> This reverts commit 83a90bb1345767f0cb96d242fd8b9db44b2b0e17. >>> >>> The method that only allowing the topmost patch on the stack to be >>> enabled or disabled is unreasonable. Such as the following case: >>> >>> - do live patch1 >>> - disable patch1 >>> - do live patch2 //error >>> >>> Now, we will never be able to do new live patch unless disabing the >>> patch1 although there is no dependencies. >> >> Unregistering disabled patch still works and removes it from the list no >> matter the position. >> >> So what exactly is the problem? > >>From a quick glance, it seems that what this set does is it only > enforces the stacking requirements if two patches patch the same > function. > Yes, this patch is only concerning this case that 'multi patches patch the same function' and solve the problem that mentioned previously: foo_unpatched() foo_patch1() foo_patch2() foo_patch3() disable(foo_patch2) disable(foo_patch3) foo_patch1() foo_patch2 is not allowed to be disabled before disable foo_patch3. Thanks, Li Bin > I'm not sure if that is correct logically or correctly implemented by > these patches yet. > > Seth > >> >> -- >> Jiri Kosina >> SUSE Labs > > . > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/2] livepatch: disable/enable_patch manners for interdependent patches
On 2015/1/21 22:08, Jiri Kosina wrote: > On Wed, 21 Jan 2015, Li Bin wrote: > >> for disable_patch: >> The patch is unallowed to be disabled if one patch after has >> dependencies with it and has been enabled. >> >> for enable_patch: >> The patch is unallowed to be enabled if one patch before has >> dependencies with it and has been disabled. >> >> Signed-off-by: Li Bin >> --- >> kernel/livepatch/core.c | 60 >> +++ >> 1 files changed, 60 insertions(+), 0 deletions(-) >> >> diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c >> index 7861ed2..a12a31c 100644 >> --- a/kernel/livepatch/core.c >> +++ b/kernel/livepatch/core.c >> @@ -114,6 +114,21 @@ static bool klp_is_patch_registered(struct klp_patch >> *patch) >> return false; >> } >> >> +static bool klp_func_in_patch(struct klp_func *kfunc, struct klp_patch >> *patch) >> +{ >> +struct klp_object *obj; >> +struct klp_func *func; >> + >> +for (obj = patch->objs; obj->funcs; obj++) { >> +for (func = obj->funcs; func->old_name; func++) { >> +if (kfunc->old_addr == func->old_addr) { >> +return true; >> +} >> +} >> +} >> +return false; >> +} >> + >> static bool klp_initialized(void) >> { >> return klp_root_kobj; >> @@ -466,8 +481,31 @@ unregister: >> static int __klp_disable_patch(struct klp_patch *patch) >> { >> struct klp_object *obj; >> +struct klp_patch *temp; >> +struct klp_func *func; >> int ret; >> >> +/* >> + * the patch is unallowed to be disabled if one patch >> + * after has dependencies with it and has been enabled. 
>> + */ >> +for (temp = list_next_entry(patch, list); >> +&temp->list != &klp_patches; >> +temp = list_next_entry(temp, list)) { >> +if (temp->state != KLP_ENABLED) >> +continue; >> + >> +for (obj = patch->objs; obj->funcs; obj++) { >> +for (func = obj->funcs; func->old_name; func++) { >> +if (klp_func_in_patch(func, temp)) { >> +pr_err("this patch depends on '%s', >> please disable it firstly\n", >> + temp->mod->name); >> +return -EBUSY; >> +} >> +} >> +} >> +} >> + >> pr_notice("disabling patch '%s'\n", patch->mod->name); >> >> for (obj = patch->objs; obj->funcs; obj++) { >> @@ -519,11 +557,33 @@ EXPORT_SYMBOL_GPL(klp_disable_patch); >> static int __klp_enable_patch(struct klp_patch *patch) >> { >> struct klp_object *obj; >> +struct klp_patch *temp; >> +struct klp_func *func; >> int ret; >> >> if (WARN_ON(patch->state != KLP_DISABLED)) >> return -EINVAL; >> >> +/* >> + * the patch is unallowed to be enabled if one patch >> + * before has dependencies with it and has been disabled. >> + */ >> +for (temp = list_first_entry(&klp_patches, struct klp_patch, list); >> +temp != patch; temp = list_next_entry(temp, list)) { >> +if (temp->state != KLP_DISABLED) >> +continue; >> + >> +for (obj = patch->objs; obj->funcs; obj++) { >> +for (func = obj->funcs; func->old_name; func++) { >> +if (klp_func_in_patch(func, temp)) { >> +pr_err("this patch depends on '%s', >> please enable it firstly\n", >> + temp->mod->name); >> +return -EBUSY; > > By this you limit the definition of the patch inter-dependency to just > symbols. But that's not the only way how patches can depend on it other -- > the dependency can be semantical. Yes, I agree with you. But I think the other dependencies such as semantical dependency should be judged by the user, like reverting a patch from git repository. Right? 
Thanks, Li Bin > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2] livepatch: disable/enable_patch manners for interdependent patches
for disable_patch: The patch is unallowed to be disabled if one patch after has dependencies with it and has been enabled. for enable_patch: The patch is unallowed to be enabled if one patch before has dependencies with it and has been disabled. Signed-off-by: Li Bin --- kernel/livepatch/core.c | 60 +++ 1 files changed, 60 insertions(+), 0 deletions(-) diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 7861ed2..a12a31c 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -114,6 +114,21 @@ static bool klp_is_patch_registered(struct klp_patch *patch) return false; } +static bool klp_func_in_patch(struct klp_func *kfunc, struct klp_patch *patch) +{ + struct klp_object *obj; + struct klp_func *func; + + for (obj = patch->objs; obj->funcs; obj++) { + for (func = obj->funcs; func->old_name; func++) { + if (kfunc->old_addr == func->old_addr) { + return true; + } + } + } + return false; +} + static bool klp_initialized(void) { return klp_root_kobj; @@ -466,8 +481,31 @@ unregister: static int __klp_disable_patch(struct klp_patch *patch) { struct klp_object *obj; + struct klp_patch *temp; + struct klp_func *func; int ret; + /* +* the patch is unallowed to be disabled if one patch +* after has dependencies with it and has been enabled. 
+*/ + for (temp = list_next_entry(patch, list); + &temp->list != &klp_patches; + temp = list_next_entry(temp, list)) { + if (temp->state != KLP_ENABLED) + continue; + + for (obj = patch->objs; obj->funcs; obj++) { + for (func = obj->funcs; func->old_name; func++) { + if (klp_func_in_patch(func, temp)) { + pr_err("this patch depends on '%s', please disable it firstly\n", + temp->mod->name); + return -EBUSY; + } + } + } + } + pr_notice("disabling patch '%s'\n", patch->mod->name); for (obj = patch->objs; obj->funcs; obj++) { @@ -519,11 +557,33 @@ EXPORT_SYMBOL_GPL(klp_disable_patch); static int __klp_enable_patch(struct klp_patch *patch) { struct klp_object *obj; + struct klp_patch *temp; + struct klp_func *func; int ret; if (WARN_ON(patch->state != KLP_DISABLED)) return -EINVAL; + /* +* the patch is unallowed to be enabled if one patch +* before has dependencies with it and has been disabled. +*/ + for (temp = list_first_entry(&klp_patches, struct klp_patch, list); + temp != patch; temp = list_next_entry(temp, list)) { + if (temp->state != KLP_DISABLED) + continue; + + for (obj = patch->objs; obj->funcs; obj++) { + for (func = obj->funcs; func->old_name; func++) { + if (klp_func_in_patch(func, temp)) { + pr_err("this patch depends on '%s', please enable it firstly\n", + temp->mod->name); + return -EBUSY; + } + } + } + } + pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n"); add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK); -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/2] livepatch: Revert "livepatch: enforce patch stacking semantics"
This reverts commit 83a90bb1345767f0cb96d242fd8b9db44b2b0e17. The method that only allowing the topmost patch on the stack to be enabled or disabled is unreasonable. Such as the following case: - do live patch1 - disable patch1 - do live patch2 //error Now, we will never be able to do new live patch unless disabing the patch1 although there is no dependencies. Signed-off-by: Li Bin --- kernel/livepatch/core.c | 10 -- 1 files changed, 0 insertions(+), 10 deletions(-) diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index bc05d39..7861ed2 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -468,11 +468,6 @@ static int __klp_disable_patch(struct klp_patch *patch) struct klp_object *obj; int ret; - /* enforce stacking: only the last enabled patch can be disabled */ - if (!list_is_last(&patch->list, &klp_patches) && - list_next_entry(patch, list)->state == KLP_ENABLED) - return -EBUSY; - pr_notice("disabling patch '%s'\n", patch->mod->name); for (obj = patch->objs; obj->funcs; obj++) { @@ -529,11 +524,6 @@ static int __klp_enable_patch(struct klp_patch *patch) if (WARN_ON(patch->state != KLP_DISABLED)) return -EINVAL; - /* enforce stacking: only the first disabled patch can be enabled */ - if (patch->list.prev != &klp_patches && - list_prev_entry(patch, list)->state == KLP_DISABLED) - return -EBUSY; - pr_notice_once("tainting kernel with TAINT_LIVEPATCH\n"); add_taint(TAINT_LIVEPATCH, LOCKDEP_STILL_OK); -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 0/2] disable/enable_patch manners for interdependent patches
Revert commit 83a90bb1345767f0cb96d242fd8b9db44b2b0e17 and introduce disable/enable_patch manners for interdependent patches for disable_patch: The patch is unallowed to be disabled if one patch after has dependencies with it and has been enabled. for enable_patch: The patch is unallowed to be enabled if one patch before has dependencies with it and has been disabled. Li Bin (2): livepatch: Revert "livepatch: enforce patch stacking semantics" livepatch: disable/enable_patch manners for interdependent patches kernel/livepatch/core.c | 66 +- 1 files changed, 58 insertions(+), 8 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: sched: spinlock recursion in sched_rr_get_interval
On 2014/12/26 15:01, Sasha Levin wrote: > On 12/26/2014 01:45 AM, Li Bin wrote: >> On 2014/7/8 4:05, Peter Zijlstra wrote: >>>> On Mon, Jul 07, 2014 at 09:55:43AM -0400, Sasha Levin wrote: >>>>>> I've also had this one, which looks similar: >>>>>> >>>>>> [10375.005884] BUG: spinlock recursion on CPU#0, modprobe/10965 >>>>>> [10375.006573] lock: 0x8803a0fd7740, .magic: dead4ead, .owner: >>>>>> modprobe/10965, .owner_cpu: 15 >>>>>> [10375.007412] CPU: 0 PID: 10965 Comm: modprobe Tainted: GW >>>>>> 3.16.0-rc3-next-20140704-sasha-00023-g26c0906-dirty #765 >>>> >>>> Something's fucked; so we have: >>>> >>>> debug_spin_lock_before() >>>>SPIN_BUG_ON(lock->owner == current, "recursion"); >>>> >> Hello, >> Does ACCESS_ONCE() can help this issue? I have no evidence that its lack is >> responsible for the issue, but I think here need it indeed. Is that right? >> >> SPIN_BUG_ON(ACCESS_ONCE(lock->owner) == current, "recursion"); > > Could you explain a bit more why do you think it's needed? > Oh, just adding ACCESS_ONCE may be not enough, and i think lacking lock protection for reading lock->owner is a risk. In short, the reason of the issue is more like the spinlock debug mechanism, rather than a real spinlock recursion. ... //under no lock protection if (lock->owner == current) //access lock->owner |-spin_dump(lock, "recursion"); |-if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT) //access lock->owner again owner = lock->owner; ... Right, or am I missing something? Thanks, Li Bin > > Thanks, > Sasha > > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: sched: spinlock recursion in sched_rr_get_interval
On 2014/7/8 4:05, Peter Zijlstra wrote: > On Mon, Jul 07, 2014 at 09:55:43AM -0400, Sasha Levin wrote: >> I've also had this one, which looks similar: >> >> [10375.005884] BUG: spinlock recursion on CPU#0, modprobe/10965 >> [10375.006573] lock: 0x8803a0fd7740, .magic: dead4ead, .owner: >> modprobe/10965, .owner_cpu: 15 >> [10375.007412] CPU: 0 PID: 10965 Comm: modprobe Tainted: GW >> 3.16.0-rc3-next-20140704-sasha-00023-g26c0906-dirty #765 > > Something's fucked; so we have: > > debug_spin_lock_before() > SPIN_BUG_ON(lock->owner == current, "recursion"); > Hello, Does ACCESS_ONCE() can help this issue? I have no evidence that its lack is responsible for the issue, but I think here need it indeed. Is that right? SPIN_BUG_ON(ACCESS_ONCE(lock->owner) == current, "recursion"); Thanks, Li Bin > Causing that, _HOWEVER_ look at .owner_cpu and the reporting cpu!! How > can the lock owner, own the lock on cpu 15 and again contend with it on > CPU 0. That's impossible. > > About when-ish did you start seeing things like this? Lemme go stare > hard at recent changes. > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [kpatch] [PATCH] livepatch v7: move x86 specific ftrace handler code to arch/x86
Sorry! Bad format, please ignore this patch. On 2014/12/19 13:37, Li Bin wrote: > The execution flow redirection related implemention in the livepatch > ftrace handler is depended on the specific architecture. This patch > introduces klp_arch_set_pc(like kgdb_arch_set_pc) interface to change > the pt_regs. > > Signed-off-by: Li Bin -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[kpatch] [PATCH] livepatch v7: move x86 specific ftrace handler code to arch/x86
The execution flow redirection related implemention in the livepatch ftrace handler is depended on the specific architecture. This patch introduces klp_arch_set_pc(like kgdb_arch_set_pc) interface to change the pt_regs. Signed-off-by: Li Bin --- arch/x86/include/asm/livepatch.h |5 + kernel/livepatch/core.c |2 +- 2 files changed, 6 insertions(+), 1 deletions(-) diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index c2ae592..4cdec4e 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -21,6 +21,7 @@ #define _ASM_X86_LIVEPATCH_H #include +#include #ifdef CONFIG_LIVE_PATCHING #ifndef CC_USING_FENTRY @@ -29,6 +30,10 @@ extern int klp_write_module_reloc(struct module *mod, unsigned long type, unsigned long loc, unsigned long value); +static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +{ + regs->ip = ip; +} #else #error Live patching support is disabled; check CONFIG_LIVE_PATCHING #endif diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 0004a71..c4c04fd 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -271,7 +271,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, { struct klp_func *func = ops->private; - regs->ip = (unsigned long)func->new_func; + klp_arch_set_pc(regs, (unsigned long)func->new_func); } static int klp_disable_func(struct klp_func *func) -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] ftrace: fix typo in comment
%s/ARCH_SUPPORT_FTARCE_OPS/ARCH_SUPPORTS_FTRACE_OPS/ Signed-off-by: Li Bin --- kernel/trace/ftrace.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 929a733..9473b24 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5102,7 +5102,7 @@ out: * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS. * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved. * An architecture can pass partial regs with ftrace_ops and still - * set the ARCH_SUPPORT_FTARCE_OPS. + * set the ARCH_SUPPORTS_FTRACE_OPS. */ #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[kpatch] [PATCH] livepatch v7: move x86 specific ftrace handler code to arch/x86
The execution flow redirection related implemention in the livepatch ftrace handler is depended on the specific architecture. This patch introduces klp_arch_set_pc(like kgdb_arch_set_pc) interface to change the pt_regs. Signed-off-by: Li Bin --- arch/x86/include/asm/livepatch.h |5 + kernel/livepatch/core.c |2 +- 2 files changed, 6 insertions(+), 1 deletions(-) diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index c2ae592..4cdec4e 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -21,6 +21,7 @@ #define _ASM_X86_LIVEPATCH_H #include +#include #ifdef CONFIG_LIVE_PATCHING #ifndef CC_USING_FENTRY @@ -29,6 +30,10 @@ extern int klp_write_module_reloc(struct module *mod, unsigned long type, unsigned long loc, unsigned long value); +static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +{ + regs->ip = ip; +} #else #error Live patching support is disabled; check CONFIG_LIVE_PATCHING #endif diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 0004a71..c4c04fd 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -271,7 +271,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, { struct klp_func *func = ops->private; - regs->ip = (unsigned long)func->new_func; + klp_arch_set_pc(regs, (unsigned long)func->new_func); } static int klp_disable_func(struct klp_func *func) -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] kprobes: Update Documentation/kprobes.txt
The patch 125e564("Move Kconfig.instrumentation to arch/Kconfig and init/Kconfig") had removed the "Instrumentation Support" menu, and the configurations under this had be moved to "General setup". Update Documentation/kprobes.txt to reflect this change. Signed-off-by: Li Bin --- Documentation/kprobes.txt |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 4bbeca8..ca4a927 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -304,8 +304,8 @@ architectures: 3. Configuring Kprobes When configuring the kernel using make menuconfig/xconfig/oldconfig, -ensure that CONFIG_KPROBES is set to "y". Under "Instrumentation -Support", look for "Kprobes". +ensure that CONFIG_KPROBES is set to "y". Under "General setup", look +for "Kprobes". So that you can load and unload Kprobes-based instrumentation modules, make sure "Loadable module support" (CONFIG_MODULES) and "Module -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Enhancement for PLE handler in KVM
Thanks, Paolo for the comments Understand the requirement to "fix it for all guest OSes". I will investigate the "new hypercall KVM_HC_HALT_AND_YIELD_TO_CPU that takes an APIC id, donates the quantum to that CPU, and puts the originating CPU in halted state. " Regards Bin -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
RE: Enhancement for PLE handler in KVM
Fully agree. It will be a very helpful feature to make ple setting per VM. This feature will provide more flexible control to the VM user. All KVM user will love to have it. The enhancement we proposed is neither overlapping nor conflicting with this feature. The enhancement is targeting to provide the best real time performance to the guest OS. There will be more and more embedded system migrating to KVM. Especially in the telecom industry. And a lot of existing system will be running on top of customized embedded OS which significant different from generic OS (either linux or windows ). Is there any concern regarding to the enhancement we need to address? Or more work need to be done? Regards Bin -Original Message- From: Marcelo Tosatti [mailto:mtosa...@redhat.com] Sent: Friday, March 07, 2014 12:42 PM To: Li, Bin (Bin) Cc: Paolo Bonzini; k...@vger.kernel.org; Jatania, Neel (Neel); linux-kernel@vger.kernel.org Subject: Re: Enhancement for PLE handler in KVM On Fri, Mar 07, 2014 at 02:26:19PM +, Li, Bin (Bin) wrote: > Can we have "per-VM PLE values"? > > My understanding is that the ple values are kvm module setting which applying > to all VMs in the system. > And all vms must be stopped first, then unload kvm-intel, reload kvm-intel > with new ple setting. > > /sbin/modprobe -r kvm-intel > /sbin/modprobe kvm-intel ple_window=16384 > > Regards > Bin Yes, but it can be made per-VM (its a VMCS field). -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
RE: Enhancement for PLE handler in KVM
Can we have "per-VM PLE values"? My understanding is that the ple values are kvm module setting which applying to all VMs in the system. And all vms must be stopped first, then unload kvm-intel, reload kvm-intel with new ple setting. /sbin/modprobe -r kvm-intel /sbin/modprobe kvm-intel ple_window=16384 Regards Bin -Original Message- From: Marcelo Tosatti [mailto:mtosa...@redhat.com] Sent: Thursday, March 06, 2014 10:07 PM To: Li, Bin (Bin) Cc: Paolo Bonzini; k...@vger.kernel.org; Jatania, Neel (Neel); linux-kernel@vger.kernel.org; Peter Zijlstra; Mike Galbraith; Chris Wright; ttr...@redhat.com; Nakajima, Jun; r...@redhat.com Subject: Re: Enhancement for PLE handler in KVM On Wed, Mar 05, 2014 at 09:16:45PM +0000, Li, Bin (Bin) wrote: > Did you also find out here why this is the case? > > Binl: > Yes. The application running in our customized embedded OS is also real time > application which is timing sensitive. > The timing sensitive level varies among the applications. > When I talking about different threshold, the assumption is using default > 4096 ple_window setting. > If I set up the ple_window to 16384 or higher, there will be no problem for > our application. But the finial user could also run linux or window VM on > same hypervisor which would prefer default 4096 setting for linux and windows. Then have per-VM PLE values? > We are looking for a solution to be good for both linux / window and real > time application. > The enhancement patch we proposed will satisfy both linux/window application > and real time embedded applications. > > Paolo -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
RE: Enhancement for PLE handler in KVM
Thanks for the quick response. I have inlined the comments and also added a typical log. Regards Bin -Original Message- From: Paolo Bonzini [mailto:pbonz...@redhat.com] Sent: Wednesday, March 05, 2014 9:49 AM To: Li, Bin (Bin); k...@vger.kernel.org Cc: Jatania, Neel (Neel); linux-kernel@vger.kernel.org; Srivatsa Vaddagiri; Peter Zijlstra; Mike Galbraith; Chris Wright; ttr...@redhat.com; Nakajima, Jun; r...@redhat.com Subject: Re: Enhancement for PLE handler in KVM Il 05/03/2014 15:17, Li, Bin (Bin) ha scritto: > Hello, Paolo, > > We are using a customized embedded SMP OS as guest OS. It is not meaningful > to post the guest OS code. > Also, there is no "performance numbers for common workloads" since there is > no common workloads to compare with. > In our OS, there is still a big kernel lock to protect the kernel. Does this mean that the average spinning time for the spinlock is relatively high compared to Linux or Windows? Binl: Yes. The default setting for ple_window is 4096, which is based on Linux and Windows. In our guest OS, the ple_window needs to be at least 16384 in order to prevent large jitter causing a system reset. With ple_window set to 16384, the biggest jitter in the guest OS is about 24-25ms. > - when the in-correct boosting happens, the vCPU in spin lock will run > longer time on the pCPU and causing the lock holder vCPU having less > time to run on pCPU since they are sharing the on same pCPU. Correct. This is an unfortunate problem in the current implementation of PLE. > Adding hyper call in every kernel enter and kernel exist is expensive. > From the trace log collect from i7 running @ 3.0GHz , the cost per > hyper is <1us. Right, it is around 1500 cycles and 0.4-0.5 us, i.e. approximately 1 us for enter and exit together. This is not too bad for a kernel with a big lock, but not acceptable if you do not have it (as is the case for Linux and Windows). > Regarding to the " paravirtual ticketlock ", we did try the same idea in our > embedded guest OS. 
> We got following results: > > a) We implemented similar approach like linux "paravirtual > ticketlock". The system clock jitter does get reduced a lot. But, the > system clock jitter is still happening at lower rate. In a few hours > system stress test, we still see the big jitter few times. Did you find out why? It could happen if the virtual CPU is scheduled out for a relatively long time. A small number of spinning iterations can then account for a relatively large time. My impression is that you're implementing a paravirtual spinlock, except that you're relying on PLE to decide when to go to sleep. PLE is implemented using the TSC. Can you assume the host TSC is of good quality? If so, perhaps you can try to modify the pv ticketlock algorithm, and use a threshold based on TSC instead of an iteration count? BinL: Our "paravirtual ticket lock" is done in the following way: - when a vCPU is doing a spin loop to get into kernel state, it will set up a counter to track how many times it has gone through the tight loop. If the counter reaches a certain threshold, the vCPU will add a request to the lock holder's work item pipe (a multiple-producer, single-consumer pipe) and the vCPU will run the "hlt" instruction, waiting for the lock holder to wake it up. - TSC is not used in this case. - When the lock holder sees the request, it will send an IPI interrupt to wake up the vCPU which is hlt-ed. - With this implementation, we are able to get the system stress test (a run of a few hours with traffic) completing successfully most of the time. But not all the time. The big jitter still happens during the run, just less frequently. The root cause of the problem is still there. The lock holder vCPU is scheduled out for a relatively long time. We did more investigation into why. a) From the trace-cmd log, we believe the root cause is the PLE handler incorrectly boosting the spin loop vCPU which is stacked with the lock holder vCPU on the same pCPU. 
The sample log we captured is: the lock holder vCPU and the vCPU doing the spin loop are both running on one pCPU. The vCPU in the spin loop is being incorrectly boosted by the PLE handler. And the Linux scheduler could be toggling the two vCPUs on the pCPU at a very high rate. The worst case is when the lock happens to stay between the two vCPUs on the same pCPU. Then we will see the lock holder vCPU scheduled in doing a little bit of real work and the other spin loop vCPU being scheduled in doing its spin loop. And the context switch on the pCPU happens very often. (See more details of a typical trace below.) This causes the real CPU power used for processing real work to become much less than normal (a short period of true overload on this pCPU). And the guest OS stays in a very slow kernel work stage because the lock holder vCPU has much less time to do its real work. The system will get out of