The preempt_disable/enable tracepoints fire only on the outermost disable <-> enable
transition, i.e., when the preempt counter changes from zero to non-zero and back,
which is correct. But think about this case:

---------------------------- %< ------------------------------
        THREAD                                  IRQ
           |                                     |
preempt_disable() {
    __preempt_count_add(1)
        ------->            smp_apic_timer_interrupt() {
                                preempt_disable()
                                    do not trace (preempt count >= 1)
                                    ....
                                preempt_enable()
                                    do not trace (preempt count >= 1)
                            }
    trace_preempt_disable();
}
---------------------------- >% ------------------------------

The preempt_disable/enable tracepoints inside the IRQ handler are skipped because
the preempt counter is already >= 1, even though the thread's trace_preempt_disable()
has not run yet: the interrupt executes in a window where preemption is already
disabled but not yet reported as such to the tracer.
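
The window comes from the order of operations in preempt_count_add(): the counter
is bumped first, and only then preempt_latency_start() is called, which is what
ends up emitting the preempt_disable event via trace_preempt_off(). Roughly, with
the debug checks omitted:

        void preempt_count_add(int val)
        {
                __preempt_count_add(val);
                /*
                 * An IRQ arriving here already sees preempt_count() >= 1,
                 * so its own disable/enable pair is not traced, while the
                 * event for this thread has not been emitted yet.
                 */
                preempt_latency_start(val);     /* -> trace_preempt_off() */
        }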

It is possible to observe this problem using this kernel module:

        http://bristot.me/files/efficient_verification/wip.tar.gz [*]

and collecting the following trace:

        # cd /sys/kernel/debug/tracing/
        # echo 1 > snapshot
        # cat available_events | grep preempt_ > set_event
        # echo irq_vectors >> /sys/kernel/debug/tracing/set_event
        # insmod wip.ko
        /* wait for a snapshot to be taken */
        # tail -100 snapshot
                [...]
            tail-5572  [001] ....1..  2888.401184: preempt_enable: caller=_raw_spin_unlock_irqrestore+0x2a/0x70 parent=          (null)
            tail-5572  [001] ....1..  2888.401184: preempt_disable: caller=migrate_disable+0x8b/0x1e0 parent=migrate_disable+0x8b/0x1e0
            tail-5572  [001] ....111  2888.401184: preempt_enable: caller=migrate_disable+0x12f/0x1e0 parent=migrate_disable+0x12f/0x1e0
            tail-5572  [001] d..h212  2888.401189: local_timer_entry: vector=236
            tail-5572  [001] dN.h512  2888.401192: process_event: event sched_waking not expected in the state preemptive
            tail-5572  [001] dN.h512  2888.401200: <stack trace>
         => process_event
         => __handle_event
         => ttwu_do_wakeup
         => try_to_wake_up
         => invoke_rcu_core
         => rcu_check_callbacks
         => update_process_times
         => tick_sched_handle
         => tick_sched_timer
         => __hrtimer_run_queues
         => hrtimer_interrupt
         => smp_apic_timer_interrupt
         => apic_timer_interrupt
         => trace_event_raw_event_preemptirq_template
         => trace_preempt_off
         => get_page_from_freelist
         => __alloc_pages_nodemask
         => __handle_mm_fault
         => handle_mm_fault
         => __do_page_fault
         => do_page_fault
         => async_page_fault

To avoid skipping the trace, the change of the preempt counter needs to be "atomic"
with the start/stop of the latency tracing, with respect to interrupts.

Do that by disabling interrupts around the add/start and stop/sub operations.
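
In practice this means grouping the counter update and the tracepoint in helpers
that run with interrupts disabled, which is what the patch below does with
preempt_add_start_latency()/preempt_sub_stop_latency(), e.g.:

        static inline void preempt_add_start_latency(int val)
        {
                unsigned long flags;

                raw_local_irq_save(flags);
                __preempt_count_add(val);
                preempt_latency_start(val);
                raw_local_irq_restore(flags);
        }

With this, an IRQ can no longer observe the new counter value before the
corresponding tracepoint has fired.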

Signed-off-by: Daniel Bristot de Oliveira <[email protected]>
Cc: "Steven Rostedt (VMware)" <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: "Paul E. McKenney" <[email protected]>
Cc: Matthias Kaehlcke <[email protected]>
Cc: "Joel Fernandes (Google)" <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: Yangtao Li <[email protected]>
Cc: Tommaso Cucinotta <[email protected]>
Cc: [email protected]
---
[*] with some luck we will talk about this at the Plumbers.

 kernel/sched/core.c | 48 +++++++++++++++++++++++++++++++++------------
 kernel/softirq.c    |  2 +-
 2 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8c0b414e45dc..be4117d7384f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3163,6 +3163,16 @@ void preempt_latency_start(int val)
        }
 }
 
+static inline void preempt_add_start_latency(int val)
+{
+       unsigned long flags;
+
+       raw_local_irq_save(flags);
+       __preempt_count_add(val);
+       preempt_latency_start(val);
+       raw_local_irq_restore(flags);
+}
+
 void preempt_count_add(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
@@ -3172,7 +3182,7 @@ void preempt_count_add(int val)
        if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
                return;
 #endif
-       __preempt_count_add(val);
+       preempt_add_start_latency(val);
 #ifdef CONFIG_DEBUG_PREEMPT
        /*
         * Spinlock count overflowing soon?
@@ -3180,7 +3190,6 @@ void preempt_count_add(int val)
        DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
                                PREEMPT_MASK - 10);
 #endif
-       preempt_latency_start(val);
 }
 EXPORT_SYMBOL(preempt_count_add);
 NOKPROBE_SYMBOL(preempt_count_add);
@@ -3195,6 +3204,16 @@ void preempt_latency_stop(int val)
                trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
 }
 
+static inline void preempt_sub_stop_latency(int val)
+{
+       unsigned long flags;
+
+       raw_local_irq_save(flags);
+       preempt_latency_stop(val);
+       __preempt_count_sub(val);
+       raw_local_irq_restore(flags);
+}
+
 void preempt_count_sub(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
@@ -3211,8 +3230,7 @@ void preempt_count_sub(int val)
                return;
 #endif
 
-       preempt_latency_stop(val);
-       __preempt_count_sub(val);
+       preempt_sub_stop_latency(val);
 }
 EXPORT_SYMBOL(preempt_count_sub);
 NOKPROBE_SYMBOL(preempt_count_sub);
@@ -3220,6 +3238,16 @@ NOKPROBE_SYMBOL(preempt_count_sub);
 #else
 static inline void preempt_latency_start(int val) { }
 static inline void preempt_latency_stop(int val) { }
+
+static inline void preempt_sub_stop_latency(int val)
+{
+       __preempt_count_sub(val);
+}
+
+static inline void preempt_add_start_latency(int val)
+{
+       __preempt_count_add(val);
+}
 #endif
 
 static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
@@ -3585,11 +3613,9 @@ static void __sched notrace preempt_schedule_common(void)
                 * traced. The other to still record the preemption latency,
                 * which can also be traced by the function tracer.
                 */
-               preempt_disable_notrace();
-               preempt_latency_start(1);
+               preempt_add_start_latency(1);
                __schedule(true);
-               preempt_latency_stop(1);
-               preempt_enable_no_resched_notrace();
+               preempt_sub_stop_latency(1);
 
                /*
                 * Check again in case we missed a preemption opportunity
@@ -3653,8 +3679,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
                 * traced. The other to still record the preemption latency,
                 * which can also be traced by the function tracer.
                 */
-               preempt_disable_notrace();
-               preempt_latency_start(1);
+               preempt_add_start_latency(1);
                /*
                 * Needs preempt disabled in case user_exit() is traced
                 * and the tracer calls preempt_enable_notrace() causing
@@ -3664,8 +3689,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
                __schedule(true);
                exception_exit(prev_ctx);
 
-               preempt_latency_stop(1);
-               preempt_enable_no_resched_notrace();
+               preempt_sub_stop_latency(1);
        } while (need_resched());
 }
 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index c9ad89c3dfed..9c64522ecc76 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -130,9 +130,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
         */
        if (softirq_count() == (cnt & SOFTIRQ_MASK))
                trace_softirqs_off(ip);
-       raw_local_irq_restore(flags);
 
        preempt_latency_start(cnt);
+       raw_local_irq_restore(flags);
 
 }
 EXPORT_SYMBOL(__local_bh_disable_ip);
-- 
2.20.1
