On Tue, Nov 17, 2020 at 06:19:48PM -0500, Joel Fernandes (Google) wrote:
> Core-scheduling prevents hyperthreads in usermode from attacking each
> other, but it does not do anything about one of the hyperthreads
> entering the kernel for any reason. This leaves the door open for MDS
> and L1TF attacks with concurrent execution sequences between
> hyperthreads.
> 
> This patch therefore adds support for protecting all syscall and IRQ
> kernel mode entries. Care is taken to track the outermost usermode exit
> and entry using per-cpu counters. In cases where one of the hyperthreads
> enters the kernel, no additional IPIs are sent. Further, IPIs are avoided
> when not needed - example: idle and non-cookie HTs do not need to be
> forced into kernel mode.
> 
> More information about attacks:
> For MDS, it is possible for syscalls, IRQ and softirq handlers to leak
> data to either host or guest attackers. For L1TF, it is possible to leak
> to guest attackers. There is no possible mitigation involving flushing
> of buffers to avoid this since the execution of attacker and victims
> happens concurrently on 2 or more HTs.

Oh gawd; this is horrible...
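
If I'm reading the changelog right, the intended shape is roughly the below
(the entry-code call sites are my guess; only the scheduler side is in this
hunk):

	/* outermost kernel entry (syscall / IRQ) */
	sched_core_unsafe_enter();	/* bump core-wide nest count, kick siblings in */

	/* ... kernel work ... */

	/* outermost return to user */
	sched_core_unsafe_exit();	/* drop our contribution to the core-wide count */
	if (test_thread_flag(TIF_UNSAFE_RET))
		sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK);	/* spin until the siblings have left too */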


> +bool sched_core_wait_till_safe(unsigned long ti_check)
> +{
> +     bool restart = false;
> +     struct rq *rq;
> +     int cpu;
> +
> +     /* We clear the thread flag only at the end, so no need to check for it. */
> +     ti_check &= ~_TIF_UNSAFE_RET;
> +
> +     cpu = smp_processor_id();
> +     rq = cpu_rq(cpu);
> +
> +     if (!sched_core_enabled(rq))
> +             goto ret;
> +
> +     /* Downgrade to allow interrupts to prevent stop_machine lockups.. */
> +     preempt_disable();
> +     local_irq_enable();
> +
> +     /*
> +      * Wait till the core of this HT is not in an unsafe state.
> +      *
> +      * Pair with raw_spin_lock/unlock() in sched_core_unsafe_enter/exit().
> +      */
> +     while (smp_load_acquire(&rq->core->core_unsafe_nest) > 0) {
> +             cpu_relax();
> +             if (READ_ONCE(current_thread_info()->flags) & ti_check) {
> +                     restart = true;
> +                     break;
> +             }
> +     }

What's that ACQUIRE for?

> +
> +     /* Upgrade it back to the expectations of entry code. */
> +     local_irq_disable();
> +     preempt_enable();
> +
> +ret:
> +     if (!restart)
> +             clear_tsk_thread_flag(current, TIF_UNSAFE_RET);
> +
> +     return restart;
> +}

So if TIF_NEED_RESCHED gets set, we'll break out and reschedule, cute.
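
Presumably the exit path then loops over the pending work and comes back here,
something like the below (not in this hunk, so this is a guess):

	while (sched_core_wait_till_safe(EXIT_TO_USER_MODE_WORK)) {
		/* handle NEED_RESCHED, signals, etc., then wait again */
		ti_work = exit_to_user_mode_loop(regs, READ_ONCE(current_thread_info()->flags));
	}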

> +void sched_core_unsafe_enter(void)
> +{
> +     const struct cpumask *smt_mask;
> +     unsigned long flags;
> +     struct rq *rq;
> +     int i, cpu;
> +
> +     if (!static_branch_likely(&sched_core_protect_kernel))
> +             return;
> +
> +     local_irq_save(flags);
> +     cpu = smp_processor_id();
> +     rq = cpu_rq(cpu);
> +     if (!sched_core_enabled(rq))
> +             goto ret;
> +
> +     /* Ensure that on return to user/guest, we check whether to wait. */
> +     if (current->core_cookie)
> +             set_tsk_thread_flag(current, TIF_UNSAFE_RET);
> +
> +     /* Count unsafe_enter() calls received without unsafe_exit() on this CPU. */
> +     rq->core_this_unsafe_nest++;
> +
> +     /*
> +      * Should not nest: enter() should only pair with exit(). Both are done
> +      * during the first entry into kernel and the last exit from kernel.
> +      * Nested kernel entries (such as nested interrupts) will only trigger
> +      * enter() and exit() on the outermost kernel entry and exit.
> +      */
> +     if (WARN_ON_ONCE(rq->core_this_unsafe_nest != 1))
> +             goto ret;
> +
> +     raw_spin_lock(rq_lockp(rq));
> +     smt_mask = cpu_smt_mask(cpu);
> +
> +     /*
> +      * Contribute this CPU's unsafe_enter() to the core-wide unsafe_enter()
> +      * count.  The raw_spin_unlock() release semantics pairs with the nest
> +      * counter's smp_load_acquire() in sched_core_wait_till_safe().
> +      */
> +     WRITE_ONCE(rq->core->core_unsafe_nest, rq->core->core_unsafe_nest + 1);
> +
> +     if (WARN_ON_ONCE(rq->core->core_unsafe_nest == UINT_MAX))
> +             goto unlock;
> +
> +     if (irq_work_is_busy(&rq->core_irq_work)) {
> +             /*
> +              * Do nothing more since we are in an IPI sent from another
> +              * sibling to enforce safety. That sibling would have sent IPIs
> +              * to all of the HTs.
> +              */
> +             goto unlock;
> +     }
> +
> +     /*
> +      * If we are not the first ones on the core to enter core-wide unsafe
> +      * state, do nothing.
> +      */
> +     if (rq->core->core_unsafe_nest > 1)
> +             goto unlock;
> +
> +     /* Do nothing more if the core is not tagged. */
> +     if (!rq->core->core_cookie)
> +             goto unlock;
> +
> +     for_each_cpu(i, smt_mask) {
> +             struct rq *srq = cpu_rq(i);
> +
> +             if (i == cpu || cpu_is_offline(i))
> +                     continue;
> +
> +             if (!srq->curr->mm || is_task_rq_idle(srq->curr))
> +                     continue;
> +
> +             /* Skip if HT is not running a tagged task. */
> +             if (!srq->curr->core_cookie && !srq->core_pick)
> +                     continue;
> +
> +             /*
> +              * Force sibling into the kernel by IPI. If work was already
> +              * pending, no new IPIs are sent. This is Ok since the receiver
> +              * would already be in the kernel, or on its way to it.
> +              */
> +             irq_work_queue_on(&srq->core_irq_work, i);

Why irq_work though? Why not smp_send_reschedule(i)?
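
That is, something like the below; the bare IPI already forces the sibling
into the kernel, and (if I understand the intent) entering the kernel is all
that is needed here:

		/* instead of the irq_work machinery */
		smp_send_reschedule(i);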

> +     }
> +unlock:
> +     raw_spin_unlock(rq_lockp(rq));
> +ret:
> +     local_irq_restore(flags);
> +}
