On Fri, Jul 28, 2017 at 10:55:32AM +0200, Peter Zijlstra wrote:
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e9785f7aed75..33f34a201255 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2641,8 +2641,18 @@ static struct rq *finish_task_switch(struct task_struct *prev)
>       finish_arch_post_lock_switch();
>  
>       fire_sched_in_preempt_notifiers(current);
> +
> +     /*
> +      * For CONFIG_MEMBARRIER we need a full memory barrier after the
> +      * rq->curr assignment. Not all architectures have one in either
> +      * switch_to() or switch_mm() so we use (and complement) the one
> +      * implied by mmdrop()'s atomic_dec_and_test().
> +      */
>       if (mm)
>               mmdrop(mm);
> +     else if (IS_ENABLED(CONFIG_MEMBARRIER))
> +             smp_mb();
> +
>       if (unlikely(prev_state == TASK_DEAD)) {
>               if (prev->sched_class->task_dead)
>                       prev->sched_class->task_dead(prev);

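(For reference, the full barrier that comment relies on is the one
implied by mmdrop(); roughly, from include/linux/sched/mm.h:

  static inline void mmdrop(struct mm_struct *mm)
  {
          if (unlikely(atomic_dec_and_test(&mm->mm_count)))
                  __mmdrop(mm);
  }

atomic_dec_and_test() is a value-returning atomic RMW, so it implies a
full memory barrier whether or not the count actually hits zero.)
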
Hurm.. so going over this again, I'm not sure this is as good as we
thought it was..


context_switch()

  mm = next->mm;
  oldmm = prev->active_mm;

  if (!mm) {                            // next is a kthread
    next->active_mm = oldmm;
    mmgrab(oldmm);                      // pin oldmm while next borrows it
    enter_lazy_tlb(oldmm, next);
  } else
    switch_mm(oldmm, mm, next);

  if (!prev->mm) {                      // prev was a kthread
    prev->active_mm = NULL;
    rq->prev_mm = oldmm;                // for finish_task_switch()
  }

  /* ... */

  finish_task_switch()

    mm = rq->prev_mm; // oldmm when !prev->mm
    rq->prev_mm = NULL;

    if (mm)
      mmdrop(mm);



That mmdrop() balances the mmgrab() we do when switching to a kernel
thread. Also, it looks to me that kthread->kthread switches do a
superfluous mmgrab() / mmdrop() pair.
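
E.g. tracing a kthread -> kthread switch (prev->mm == next->mm == NULL)
through the above:

  context_switch()
    mm = next->mm;                      // NULL
    next->active_mm = oldmm;            // keep borrowing the same mm
    mmgrab(oldmm);                      // mm_count++
    ...
    prev->active_mm = NULL;
    rq->prev_mm = oldmm;                // since !prev->mm

  finish_task_switch()
    mmdrop(rq->prev_mm);                // mm_count--, straight back down

That is, next could just inherit prev's reference instead, which is
what the "kernel -> kernel transfer active" case below does.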

Something like the below would perhaps optimize and clarify things.

---
 kernel/sched/core.c | 35 ++++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e9785f7aed75..7924b4cc2bff 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2733,12 +2733,8 @@ static __always_inline struct rq *
 context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next, struct rq_flags *rf)
 {
-       struct mm_struct *mm, *oldmm;
-
        prepare_task_switch(rq, prev, next);
 
-       mm = next->mm;
-       oldmm = prev->active_mm;
        /*
         * For paravirt, this is coupled with an exit in switch_to to
         * combine the page table reload and the switch backend into
@@ -2746,16 +2742,29 @@ context_switch(struct rq *rq, struct task_struct *prev,
         */
        arch_start_context_switch(prev);
 
-       if (!mm) {
-               next->active_mm = oldmm;
-               mmgrab(oldmm);
-               enter_lazy_tlb(oldmm, next);
-       } else
-               switch_mm_irqs_off(oldmm, mm, next);
+       /*
+        * kernel -> kernel   transfer active
+        *   user -> kernel   mmgrab()
+        *
+        * kernel ->   user   mmdrop()
+        *   user ->   user   switch_mm()
+        */
+       if (!next->mm) {                                // to kernel
+               next->active_mm = prev->active_mm;
+               enter_lazy_tlb(prev->active_mm, next);
+
+               if (prev->mm)                           // from user
+                       mmgrab(prev->active_mm);
+
+       } else {                                        // to user
+               switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
-       if (!prev->mm) {
-               prev->active_mm = NULL;
-               rq->prev_mm = oldmm;
+               if (!prev->mm) {                        // from kernel
+                       rq->prev_mm = prev->active_mm;
+                       prev->active_mm = NULL;
+
+                       /* will mmdrop() in finish_task_switch(). */
+               }
        }
 
        rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
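
With this, rq->prev_mm is only ever set for kernel -> user switches, so
on the finish_task_switch() side the smp_mb() from your patch above
would have to cover the other three transitions (sketch):

  /* finish_task_switch() */
  mm = rq->prev_mm;                     /* only set for kernel -> user */
  rq->prev_mm = NULL;

  if (mm)
          mmdrop(mm);                   /* barrier via atomic_dec_and_test() */
  else if (IS_ENABLED(CONFIG_MEMBARRIER))
          smp_mb();                     /* user->user, user->kernel, kernel->kernel */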
