[RFC v4] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends may no

2017-11-20 Thread Shawn Landden
See my systemd patch: https://github.com/shawnl/systemd/tree/prctl

Android uses this memory model for all programs, and having it in the
kernel will enable integration with the page cache (not in this
series).

v2
switch to prctl, memcg support

v3
use 
put OOM after constraint checking

v4
ignore memcg OOMs as should have been all along (sry for the noise)
---
 fs/eventpoll.c |  9 +
 fs/proc/array.c|  7 +++
 include/linux/memcontrol.h |  1 +
 include/linux/oom.h|  4 
 include/linux/sched.h  |  1 +
 include/uapi/linux/prctl.h |  4 
 kernel/exit.c  |  1 +
 kernel/sys.c   |  9 +
 mm/oom_kill.c  | 43 +++
 9 files changed, 79 insertions(+)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2fabd19cdeea..5b3f084b22d5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -43,6 +43,8 @@
 #include 
 #include 
 #include 
+#include <linux/wait.h>
+#include <linux/oom.h>
 
 /*
  * LOCKING:
@@ -1761,6 +1763,10 @@ static int ep_poll(struct eventpoll *ep, struct 
epoll_event __user *events,
u64 slack = 0;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
+   DEFINE_WAIT_FUNC(oom_target_wait, oom_target_callback);
+
+   if (current->oom_target)
+   add_wait_queue(oom_target_get_wait(), &oom_target_wait);
 
if (timeout > 0) {
struct timespec64 end_time = ep_set_mstimeout(timeout);
@@ -1850,6 +1856,9 @@ static int ep_poll(struct eventpoll *ep, struct 
epoll_event __user *events,
!(res = ep_send_events(ep, events, maxevents)) && !timed_out)
goto fetch_events;
 
+   if (current->oom_target)
+   remove_wait_queue(oom_target_get_wait(), &oom_target_wait);
+
return res;
 }
 
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 9390032a11e1..1954ae87cb88 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -350,6 +350,12 @@ static inline void task_seccomp(struct seq_file *m, struct 
task_struct *p)
seq_putc(m, '\n');
 }
 
+static inline void task_idle(struct seq_file *m, struct task_struct *p)
+{
+   seq_put_decimal_ull(m, "Idle:\t", p->oom_target);
+   seq_putc(m, '\n');
+}
+
 static inline void task_context_switch_counts(struct seq_file *m,
struct task_struct *p)
 {
@@ -381,6 +387,7 @@ int proc_pid_status(struct seq_file *m, struct 
pid_namespace *ns,
task_sig(m, task);
task_cap(m, task);
task_seccomp(m, task);
+   task_idle(m, task);
task_cpus_allowed(m, task);
cpuset_task_status_allowed(m, task);
task_context_switch_counts(m, task);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 69966c461d1c..471d1d52ae72 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct mem_cgroup;
 struct page;
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 01c91d874a57..88acea9e0a59 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -102,6 +102,10 @@ extern void oom_killer_enable(void);
 
 extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 
+extern void exit_oom_target(void);
+struct wait_queue_head *oom_target_get_wait(void);
+int oom_target_callback(wait_queue_entry_t *wait, unsigned mode, int sync, 
void *key);
+
 /* sysctls */
 extern int sysctl_oom_dump_tasks;
 extern int sysctl_oom_kill_allocating_task;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fdf74f27acf1..51b0e5987e8c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -652,6 +652,7 @@ struct task_struct {
/* disallow userland-initiated cgroup migration */
unsignedno_cgroup_migration:1;
 #endif
+   unsignedoom_target:1;
 
unsigned long   atomic_flags; /* Flags requiring atomic 
access. */
 
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index b640071421f7..94868317c6f2 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -198,4 +198,8 @@ struct prctl_mm_map {
 # define PR_CAP_AMBIENT_LOWER  3
 # define PR_CAP_AMBIENT_CLEAR_ALL  4
 
+#define PR_SET_IDLE        48
+#define PR_GET_IDLE        49
+# define PR_IDLE_MODE_KILLME   1
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index f6cad39f35df..2788fbdae267 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -62,6 +62,7 @@
 #include 
 #include 
 #include 
+#include <linux/oom.h>
 
 #include 
 #include 
diff --git a/kernel/sys.c b/kernel/sys.c
index 524a4cb9bbe2..e1eb049a85e6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2386,6 +2386,15 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, 
unsigned long, arg3,
case PR_GET_FP_MODE:
error = GET_FP_MODE(me);
break;
+   case PR_SET_IDLE:
+   

[RFC v4] It is common for services to be stateless around their main event loop. If a process sets PR_SET_IDLE to PR_IDLE_MODE_KILLME then it signals to the kernel that epoll_wait() and friends may no

2017-11-20 Thread Shawn Landden
See my systemd patch: https://github.com/shawnl/systemd/tree/prctl

Android uses this memory model for all programs, and having it in the
kernel will enable integration with the page cache (not in this
series).

v2
switch to prctl, memcg support

v3
use 
put OOM after constraint checking

v4
ignore memcg OOMs as should have been all along (sry for the noise)
---
 fs/eventpoll.c |  9 +
 fs/proc/array.c|  7 +++
 include/linux/memcontrol.h |  1 +
 include/linux/oom.h|  4 
 include/linux/sched.h  |  1 +
 include/uapi/linux/prctl.h |  4 
 kernel/exit.c  |  1 +
 kernel/sys.c   |  9 +
 mm/oom_kill.c  | 43 +++
 9 files changed, 79 insertions(+)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2fabd19cdeea..5b3f084b22d5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -43,6 +43,8 @@
 #include 
 #include 
 #include 
+#include <linux/wait.h>
+#include <linux/oom.h>
 
 /*
  * LOCKING:
@@ -1761,6 +1763,10 @@ static int ep_poll(struct eventpoll *ep, struct 
epoll_event __user *events,
u64 slack = 0;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
+   DEFINE_WAIT_FUNC(oom_target_wait, oom_target_callback);
+
+   if (current->oom_target)
+   add_wait_queue(oom_target_get_wait(), &oom_target_wait);
 
if (timeout > 0) {
struct timespec64 end_time = ep_set_mstimeout(timeout);
@@ -1850,6 +1856,9 @@ static int ep_poll(struct eventpoll *ep, struct 
epoll_event __user *events,
!(res = ep_send_events(ep, events, maxevents)) && !timed_out)
goto fetch_events;
 
+   if (current->oom_target)
+   remove_wait_queue(oom_target_get_wait(), &oom_target_wait);
+
return res;
 }
 
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 9390032a11e1..1954ae87cb88 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -350,6 +350,12 @@ static inline void task_seccomp(struct seq_file *m, struct 
task_struct *p)
seq_putc(m, '\n');
 }
 
+static inline void task_idle(struct seq_file *m, struct task_struct *p)
+{
+   seq_put_decimal_ull(m, "Idle:\t", p->oom_target);
+   seq_putc(m, '\n');
+}
+
 static inline void task_context_switch_counts(struct seq_file *m,
struct task_struct *p)
 {
@@ -381,6 +387,7 @@ int proc_pid_status(struct seq_file *m, struct 
pid_namespace *ns,
task_sig(m, task);
task_cap(m, task);
task_seccomp(m, task);
+   task_idle(m, task);
task_cpus_allowed(m, task);
cpuset_task_status_allowed(m, task);
task_context_switch_counts(m, task);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 69966c461d1c..471d1d52ae72 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct mem_cgroup;
 struct page;
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 01c91d874a57..88acea9e0a59 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -102,6 +102,10 @@ extern void oom_killer_enable(void);
 
 extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 
+extern void exit_oom_target(void);
+struct wait_queue_head *oom_target_get_wait(void);
+int oom_target_callback(wait_queue_entry_t *wait, unsigned mode, int sync, 
void *key);
+
 /* sysctls */
 extern int sysctl_oom_dump_tasks;
 extern int sysctl_oom_kill_allocating_task;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index fdf74f27acf1..51b0e5987e8c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -652,6 +652,7 @@ struct task_struct {
/* disallow userland-initiated cgroup migration */
unsignedno_cgroup_migration:1;
 #endif
+   unsignedoom_target:1;
 
unsigned long   atomic_flags; /* Flags requiring atomic 
access. */
 
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index b640071421f7..94868317c6f2 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -198,4 +198,8 @@ struct prctl_mm_map {
 # define PR_CAP_AMBIENT_LOWER  3
 # define PR_CAP_AMBIENT_CLEAR_ALL  4
 
+#define PR_SET_IDLE        48
+#define PR_GET_IDLE        49
+# define PR_IDLE_MODE_KILLME   1
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index f6cad39f35df..2788fbdae267 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -62,6 +62,7 @@
 #include 
 #include 
 #include 
+#include <linux/oom.h>
 
 #include 
 #include 
diff --git a/kernel/sys.c b/kernel/sys.c
index 524a4cb9bbe2..e1eb049a85e6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2386,6 +2386,15 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, 
unsigned long, arg3,
case PR_GET_FP_MODE:
error = GET_FP_MODE(me);
break;
+   case PR_SET_IDLE:
+