From: Konstantin Khlebnikov <khlebni...@yandex-team.ru>

inject_delay() pauses the current task before it returns to
userspace, at a point where the kernel holds no locks, so the wait
cannot introduce any priority-inversion problems.

This code abuses the existing task_work machinery and the
'TASK_PARKED' state. Parked tasks are killable and do not contribute
to CPU load.
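
For reference, the task_work pattern this builds on looks roughly
like the sketch below (not part of this patch; 'work' and 'my_func'
are hypothetical names):

        struct callback_head work;      /* normally embedded in a
                                           longer-lived structure */

        /* Queue my_func() to run on the return-to-userspace path,
         * where no kernel locks are held; 'true' flags the task so
         * the work runs before it reaches userspace. */
        init_task_work(&work, my_func);
        task_work_add(current, &work, true);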

Together with percpu_ratelimit, this can be used as follows:

if (percpu_ratelimit_charge(&ratelimit, events))
        inject_delay(percpu_ratelimit_target(&ratelimit));
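
The target is an absolute CLOCK_MONOTONIC timestamp, so a caller can
also build one directly. For illustration only (a hypothetical
caller), delaying the current task by at least 10ms:

        inject_delay(ktime_add_ns(ktime_get(), 10 * NSEC_PER_MSEC));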

Signed-off-by: Konstantin Khlebnikov <khlebni...@yandex-team.ru>
---
 include/linux/sched.h        |    7 ++++
 include/trace/events/sched.h |    7 ++++
 kernel/sched/core.c          |   66 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c          |   12 ++++++++
 4 files changed, 92 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8db31ef..2363918 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1132,6 +1132,7 @@ struct sched_statistics {
        u64                     iowait_sum;
 
        u64                     sleep_start;
+       u64                     delay_start;
        u64                     sleep_max;
        s64                     sum_sleep_runtime;
 
@@ -1662,6 +1663,10 @@ struct task_struct {
        unsigned long timer_slack_ns;
        unsigned long default_timer_slack_ns;
 
+       /* Pause the task until this time before returning to userspace */
+       ktime_t delay_injection_target;
+       struct callback_head delay_injection_work;
+
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
        /* Index of current stored address in ret_stack */
        int curr_ret_stack;
@@ -2277,6 +2282,8 @@ extern void set_curr_task(int cpu, struct task_struct *p);
 
 void yield(void);
 
+extern void inject_delay(ktime_t target);
+
 /*
  * The default (Linux) execution domain.
  */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 30fedaf..d35154e 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -365,6 +365,13 @@ DEFINE_EVENT(sched_stat_template, sched_stat_blocked,
             TP_ARGS(tsk, delay));
 
 /*
+ * Tracepoint for accounting delay-injection
+ */
+DEFINE_EVENT(sched_stat_template, sched_stat_delayed,
+            TP_PROTO(struct task_struct *tsk, u64 delay),
+            TP_ARGS(tsk, delay));
+
+/*
  * Tracepoint for accounting runtime (time the task is executing
  * on a CPU).
  */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c0accc0..7a9d6a1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -65,6 +65,7 @@
 #include <linux/unistd.h>
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
+#include <linux/task_work.h>
 #include <linux/tick.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
@@ -8377,3 +8378,68 @@ void dump_cpu_task(int cpu)
        pr_info("Task dump for CPU %d:\n", cpu);
        sched_show_task(cpu_curr(cpu));
 }
+
+#define DELAY_INJECTION_SLACK_NS       (NSEC_PER_SEC / 50)
+
+static enum hrtimer_restart delay_injection_wakeup(struct hrtimer *timer)
+{
+       struct hrtimer_sleeper *t =
+               container_of(timer, struct hrtimer_sleeper, timer);
+       struct task_struct *task = t->task;
+
+       t->task = NULL;
+       if (task)
+               wake_up_state(task, TASK_PARKED);
+
+       return HRTIMER_NORESTART;
+}
+
+/*
+ * The delayed task sleeps here in the 'P'arked state.
+ */
+static void delay_injection_sleep(struct callback_head *head)
+{
+       struct task_struct *task = current;
+       struct hrtimer_sleeper t;
+
+       head->func = NULL;
+       __set_task_state(task, TASK_WAKEKILL | TASK_PARKED);
+       hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+       hrtimer_set_expires_range_ns(&t.timer, current->delay_injection_target,
+                                    DELAY_INJECTION_SLACK_NS);
+
+       t.timer.function = delay_injection_wakeup;
+       t.task = task;
+
+       hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
+       if (!hrtimer_active(&t.timer))
+               t.task = NULL;
+
+       if (likely(t.task))
+               schedule();
+
+       hrtimer_cancel(&t.timer);
+       destroy_hrtimer_on_stack(&t.timer);
+
+       __set_task_state(task, TASK_RUNNING);
+}
+
+/*
+ * inject_delay - inject a delay before returning to userspace
+ * @target: absolute CLOCK_MONOTONIC timestamp to sleep until;
+ *         the task will not return to userspace before this time
+ */
+void inject_delay(ktime_t target)
+{
+       struct task_struct *task = current;
+
+       if (ktime_after(target, task->delay_injection_target)) {
+               task->delay_injection_target = target;
+               if (!task->delay_injection_work.func) {
+                       init_task_work(&task->delay_injection_work,
+                                       delay_injection_sleep);
+                       task_work_add(task, &task->delay_injection_work, true);
+               }
+       }
+}
+EXPORT_SYMBOL(inject_delay);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 40667cb..2e3269b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2944,6 +2944,15 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
                        account_scheduler_latency(tsk, delta >> 10, 0);
                }
        }
+       if (se->statistics.delay_start) {
+               u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.delay_start;
+
+               if ((s64)delta < 0)
+                       delta = 0;
+
+               se->statistics.delay_start = 0;
+               trace_sched_stat_delayed(tsk, delta);
+       }
 #endif
 }
 
@@ -3095,6 +3104,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
                                se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
                        if (tsk->state & TASK_UNINTERRUPTIBLE)
                                se->statistics.block_start = rq_clock(rq_of(cfs_rq));
+                       if ((tsk->state & TASK_PARKED) &&
+                           tsk->delay_injection_target.tv64)
+                               se->statistics.delay_start = rq_clock(rq_of(cfs_rq));
                }
 #endif
        }
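
For observing the injected delays, the new sched_stat_delayed
tracepoint should be usable like the other sched_stat_* events, e.g.
with standard perf tooling (assuming the patch is applied):

        perf record -e sched:sched_stat_delayed -a -- sleep 10
        perf script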
