From: chris hyser <chris.hy...@oracle.com> This patch provides support for setting, clearing and copying core scheduling 'task cookies' between threads (PID), processes (TGID), and process groups (PGID).
The value of core scheduling isn't that tasks don't share a core, 'nosmt' can do that. The value lies in exploiting all the sharing opportunities that exist to recover possible lost performance and that requires a degree of flexibility in the API. From a security perspective (and there are others), the thread, process and process group distinction is an existing hierarchical categorization of tasks that reflects many of the security concerns about 'data sharing'. For example, protecting against cache-snooping by a thread that can just read the memory directly isn't all that useful. With this in mind, subcommands to CLEAR/CREATE/SHARE (TO/FROM) provide a mechanism to create, clear and share cookies. CLEAR/CREATE/SHARE_TO specify a target pid with enum pidtype used to specify the scope of the targeted tasks. For example, PIDTYPE_TGID will share the cookie with the process and all of its threads as typically desired in a security scenario. API: prctl(PR_SCHED_CORE_SHARE, PR_SCHED_CORE_CREATE, tgtpid, pidtype, 0) prctl(PR_SCHED_CORE_SHARE, PR_SCHED_CORE_CLEAR, tgtpid, pidtype, 0) prctl(PR_SCHED_CORE_SHARE, PR_SCHED_CORE_SHARE_FROM, srcpid, 0, 0) prctl(PR_SCHED_CORE_SHARE, PR_SCHED_CORE_SHARE_TO, tgtpid, pidtype, 0) where 'tgtpid/srcpid == 0' implies the current process and pidtype is kernel enum pid_type {PIDTYPE_PID, PIDTYPE_TGID, PIDTYPE_PGID, ...}. PIDTYPE_SID, sharing a cookie with an entire session, was considered less useful given the choice to create a new cookie on task exec(). For return values, EINVAL, ENOMEM are what they say. ESRCH means the tgtpid/srcpid was not found. EPERM indicates lack of PTRACE permission access to tgtpid/srcpid. EACCES indicates that a task in the target pidtype group was not updated due to permission. In terms of interaction with the cgroup interface, task cookies are set independently of cgroup core scheduling cookies and thus would allow use for tasks within a container using cgroup cookies. 
Current hard-coded policies are: - a user can clear the cookie of any process they can set a cookie for. Lack of a cookie *might* be a security issue if cookies are being used for that. - on fork of a parent with a cookie, both process and thread child tasks get a copy. - on exec a task with a cookie is given a new cookie Signed-off-by: Chris Hyser <chris.hy...@oracle.com> Signed-off-by: Josh Don <josh...@google.com> --- fs/exec.c | 4 +- include/linux/sched.h | 11 ++ include/linux/sched/task.h | 4 +- include/uapi/linux/prctl.h | 7 ++ kernel/sched/core.c | 11 +- kernel/sched/coretag.c | 196 ++++++++++++++++++++++++++++++- kernel/sched/sched.h | 2 + kernel/sys.c | 7 ++ tools/include/uapi/linux/prctl.h | 7 ++ 9 files changed, 241 insertions(+), 8 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 18594f11c31f..ab0945508b50 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1807,7 +1807,9 @@ static int bprm_execve(struct linux_binprm *bprm, if (IS_ERR(file)) goto out_unmark; - sched_exec(); + retval = sched_exec(); + if (retval) + goto out; bprm->file = file; /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 833f8d682212..075b15392a4a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2184,8 +2184,19 @@ const struct cpumask *sched_trace_rd_span(struct root_domain *rd); #ifdef CONFIG_SCHED_CORE void sched_tsk_free(struct task_struct *tsk); +int sched_core_share_pid(unsigned long flags, pid_t pid, enum pid_type type); +int sched_core_exec(void); #else #define sched_tsk_free(tsk) do { } while (0) +static inline int sched_core_share_pid(unsigned long flags, pid_t pid, enum pid_type type) +{ + return 0; +} + +static inline int sched_core_exec(void) +{ + return 0; +} #endif #endif diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index ef02be869cf2..d0f5b233f092 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -94,9 +94,9 @@ extern void free_task(struct task_struct *tsk); /* sched_exec is called by 
processes performing an exec */ #ifdef CONFIG_SMP -extern void sched_exec(void); +int sched_exec(void); #else -#define sched_exec() {} +static inline int sched_exec(void) { return 0; } #endif static inline struct task_struct *get_task_struct(struct task_struct *t) diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 667f1aed091c..e658dca88f4f 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -255,4 +255,11 @@ struct prctl_mm_map { # define SYSCALL_DISPATCH_FILTER_ALLOW 0 # define SYSCALL_DISPATCH_FILTER_BLOCK 1 +/* Request the scheduler to share a core */ +#define PR_SCHED_CORE_SHARE 60 +# define PR_SCHED_CORE_CLEAR 0 /* clear core_sched cookie of pid */ +# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */ +# define PR_SCHED_CORE_SHARE_FROM 2 /* get core_sched cookie from pid */ +# define PR_SCHED_CORE_SHARE_TO 3 /* push core_sched cookie to pid */ + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1b07687c53d4..3093cb3414c3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4752,11 +4752,17 @@ unsigned long nr_iowait(void) * sched_exec - execve() is a valuable balancing opportunity, because at * this point the task has the smallest effective memory and cache footprint. 
*/ -void sched_exec(void) +int sched_exec(void) { struct task_struct *p = current; unsigned long flags; int dest_cpu; + int ret; + + /* this may change what tasks current can share a core with */ + ret = sched_core_exec(); + if (ret) + return ret; raw_spin_lock_irqsave(&p->pi_lock, flags); dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC); @@ -4768,10 +4774,11 @@ void sched_exec(void) raw_spin_unlock_irqrestore(&p->pi_lock, flags); stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); - return; + return 0; } unlock: raw_spin_unlock_irqrestore(&p->pi_lock, flags); + return 0; } #endif diff --git a/kernel/sched/coretag.c b/kernel/sched/coretag.c index ba73569237f0..550f4975eea2 100644 --- a/kernel/sched/coretag.c +++ b/kernel/sched/coretag.c @@ -155,6 +155,7 @@ static void sched_core_update_cookie(struct task_struct *p, task_rq_unlock(rq, p, &rf); } +/* Per-task interface: task free. */ static void sched_core_free_task_cookie_work(struct work_struct *ws); static unsigned long sched_core_alloc_task_cookie(void) @@ -223,16 +224,205 @@ static inline void sched_core_update_task_cookie(struct task_struct *t, sched_core_update_cookie(t, c, sched_core_task_cookie_type); } -/* - * Called from sched_fork(). 
- */ +static int sched_core_create_cookie(struct task_struct *p) +{ + unsigned long cookie; + + lockdep_assert_held(&sched_core_tasks_mutex); + + cookie = sched_core_alloc_task_cookie(); + if (!cookie) + return -ENOMEM; + + if (p->core_cookie.task_cookie) + sched_core_put_task_cookie(p->core_cookie.task_cookie); + + sched_core_update_task_cookie(p, cookie); + return 0; +} + +static void sched_core_clear_cookie(struct task_struct *p) +{ + lockdep_assert_held(&sched_core_tasks_mutex); + if (p->core_cookie.task_cookie) { + sched_core_put_task_cookie(p->core_cookie.task_cookie); + sched_core_update_task_cookie(p, 0); + } +} + +static unsigned long sched_core_get_copy_cookie(struct task_struct *p) +{ + unsigned long cookie = p->core_cookie.task_cookie; + + lockdep_assert_held(&sched_core_tasks_mutex); + sched_core_get_task_cookie(cookie); + return cookie; +} + +static void sched_core_copy_cookie_frm_to(struct task_struct *ft, struct task_struct *tt) +{ + unsigned long cookie; + + lockdep_assert_held(&sched_core_tasks_mutex); + + /* sharing a 0 cookie is a clear */ + if (!ft->core_cookie.task_cookie) { + sched_core_clear_cookie(tt); + return; + } + + cookie = sched_core_get_copy_cookie(ft); + if (tt->core_cookie.task_cookie) + sched_core_put_task_cookie(tt->core_cookie.task_cookie); + sched_core_update_task_cookie(tt, cookie); +} + +/* Called from prctl interface: PR_SCHED_CORE_SHARE */ +int sched_core_share_pid(unsigned long flags, pid_t pid, enum pid_type type) +{ + struct task_struct *task; + struct task_struct *p; + unsigned long cookie; + struct pid *grp; + int err = 0; + + if (type > PIDTYPE_PGID || flags > PR_SCHED_CORE_SHARE_TO || pid < 0 || + (flags == PR_SCHED_CORE_SHARE_FROM && type != PIDTYPE_PID)) + return -EINVAL; + + rcu_read_lock(); + + if (pid == 0) { + task = current; + } else { + task = find_task_by_vpid(pid); + if (!task) { + rcu_read_unlock(); + return -ESRCH; + } + } + + get_task_struct(task); + + /* Check if this process has the right to modify the 
specified + * process. Use the regular "ptrace_may_access()" checks. + */ + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { + rcu_read_unlock(); + err = -EPERM; + goto out; + } + rcu_read_unlock(); + + mutex_lock(&sched_core_tasks_mutex); + if (type == PIDTYPE_PID) { + if (flags == PR_SCHED_CORE_CREATE) { + err = sched_core_create_cookie(task); + + } else if (flags == PR_SCHED_CORE_CLEAR) { + sched_core_clear_cookie(task); + + } else if (flags == PR_SCHED_CORE_SHARE_FROM) { + sched_core_copy_cookie_frm_to(task, current); + + } else if (flags == PR_SCHED_CORE_SHARE_TO) { + sched_core_copy_cookie_frm_to(current, task); + + } else { + err = -EINVAL; + goto out_unlock; + } + } else { + if (flags == PR_SCHED_CORE_CREATE) { + cookie = sched_core_alloc_task_cookie(); + if (!cookie) { + err = -ENOMEM; + goto out_unlock; + } + + } else if (flags == PR_SCHED_CORE_CLEAR) { + cookie = 0; + } else if (flags == PR_SCHED_CORE_SHARE_TO) { + cookie = sched_core_get_copy_cookie(current); + } else { + err = -EINVAL; + goto out_unlock; + } + + rcu_read_lock(); + if (type == PIDTYPE_TGID) { + grp = task_tgid(task); + } else if (type == PIDTYPE_PGID) { + grp = task_pgrp(task); + } else { + err = -EINVAL; + rcu_read_unlock(); + goto out_unlock; + } + + do_each_pid_thread(grp, type, p) { + /* + * if not allowed, don't do it, but indicate to caller. + * task and current are already good. + */ + if (p == task || p == current || + ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) { + if (cookie) + sched_core_get_task_cookie(cookie); + if (p->core_cookie.task_cookie) + sched_core_put_task_cookie_async(p->core_cookie.task_cookie); + sched_core_update_task_cookie(p, cookie); + } else { + err = -EACCES; + } + } while_each_pid_thread(grp, type, p); + + rcu_read_unlock(); + + /* + * Remove the extra reference we took to the cookie + * (ie. via alloc/copy). 
+ */ + if (cookie) + sched_core_put_task_cookie(cookie); + } +out_unlock: + mutex_unlock(&sched_core_tasks_mutex); + +out: + put_task_struct(task); + return err; +} + +int sched_core_exec(void) +{ + int ret = 0; + + /* absent a policy mech, if task had a cookie, give it a new one */ + if (READ_ONCE(current->core_cookie.task_cookie)) { + mutex_lock(&sched_core_tasks_mutex); + if (current->core_cookie.task_cookie) + ret = sched_core_create_cookie(current); + mutex_unlock(&sched_core_tasks_mutex); + } + return ret; +} + +/* Called from sched_fork() */ int sched_core_fork(struct task_struct *p, unsigned long clone_flags) { /* * Task cookie is ref counted; avoid an uncounted reference. + * If p should have a task cookie, it will be set below. */ __sched_core_set_task_cookie(&p->core_cookie, 0); + if (READ_ONCE(current->core_cookie.task_cookie)) { + mutex_lock(&sched_core_tasks_mutex); + if (current->core_cookie.task_cookie) + sched_core_copy_cookie_frm_to(current, p); + mutex_unlock(&sched_core_tasks_mutex); + } return 0; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5b49cfaa4a53..1be86d9cc58f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1184,6 +1184,8 @@ void sched_core_dequeue(struct rq *rq, struct task_struct *p); void sched_core_get(void); void sched_core_put(void); +int sched_core_share_pid(unsigned long flags, pid_t pid, enum pid_type type); + bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi); int sched_core_cookie_cmp(const struct sched_core_cookie *a, diff --git a/kernel/sys.c b/kernel/sys.c index 2e2e3f378d97..b40243522146 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2534,6 +2534,13 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = set_syscall_user_dispatch(arg2, arg3, arg4, (char __user *) arg5); break; +#ifdef CONFIG_SCHED_CORE + case PR_SCHED_CORE_SHARE: + if (arg5) + return -EINVAL; + error = sched_core_share_pid(arg2, arg3, arg4); + break; +#endif 
default: error = -EINVAL; break; diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index 667f1aed091c..14900c400e74 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -255,4 +255,11 @@ struct prctl_mm_map { # define SYSCALL_DISPATCH_FILTER_ALLOW 0 # define SYSCALL_DISPATCH_FILTER_BLOCK 1 +/* Request the scheduler to share a core */ +#define PR_SCHED_CORE_SHARE 60 +# define PR_SCHED_CORE_CLEAR 0 /* clear core_sched cookie of pid */ +# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */ +# define PR_SCHED_CORE_SHARE_FROM 2 /* get core_sched cookie from pid */ +# define PR_SCHED_CORE_SHARE_TO 3 /* push core_sched cookie to pid */ + #endif /* _LINUX_PRCTL_H */ -- 2.31.0.291.g576ba9dcdaf-goog