On 14.10.2021 11:20, Cyrill Gorcunov wrote:
> The @load_ve_lock spinlock guards manipulations of @ve_root_list; at
> the same time, calc_load_ve() is executed from irq context, which
> triggers an "invalid wait context" bug:
>
> | [    5.195868] =============================
> | [    5.195877] [ BUG: Invalid wait context ]
> | [    5.195887] 5.14.0.ovz9.10.1 #37 Tainted: G         C X --------- ---
> | [    5.195902] -----------------------------
> | [    5.195911] swapper/0/0 is trying to lock:
> | [    5.196327] ffffffff872d8438 (load_ve_lock){....}-{3:3}, at: calc_load_ve+0x15/0x1c0
> | [    5.196742] other info that might help us debug this:
> | [    5.196807] context-{2:2}
> | [    5.196807] no locks held by swapper/0/0.
> | [    5.196807] stack backtrace:
> | [    5.196807] CPU: 0 PID: 0 Comm: swapper/0 ve: / Tainted: G         C X --------- --- 5.14.0.ovz9.10.1 #37 10.1
> | [    5.196807] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-4.fc34 04/01/2014
> | [    5.196807] Call Trace:
> | [    5.196807]  <IRQ>
> | [    5.196807]  dump_stack_lvl+0x57/0x7d
> | [    5.196807]  __lock_acquire.cold+0x28b/0x2cd
> | [    5.196807]  ? __lock_acquire+0x3b1/0x1f20
> | [    5.196807]  lock_acquire+0xca/0x300
> | [    5.196807]  ? calc_load_ve+0x15/0x1c0
> | [    5.196807]  ? kvm_sched_clock_read+0x14/0x40
> | [    5.196807]  ? sched_clock_local+0xe/0x80
> | [    5.196807]  ? sched_clock_cpu+0xa5/0xc0
> | [    5.196807]  _raw_spin_lock+0x34/0x80
> | [    5.196807]  ? calc_load_ve+0x15/0x1c0
> | [    5.196807]  calc_load_ve+0x15/0x1c0
> | [    5.196807]  tick_do_update_jiffies64+0x115/0x150
> | [    5.196807]  tick_irq_enter+0x6c/0xe0
> | [    5.196807]  irq_enter_rcu+0x79/0x80
> | [    5.196807]  sysvec_apic_timer_interrupt+0x95/0xd0
> | [    5.196807]  </IRQ>
> | [    5.196807]  asm_sysvec_apic_timer_interrupt+0x12/0x20
> | [    5.196807] RIP: 0010:default_idle+0x10/0x20
> | [    5.196807] RSP: 0018:ffffffff87203ea8 EFLAGS: 00000202
> | [    5.196807] RAX: ffffffff86380df0 RBX: 0000000000000000 RCX: 0000000000000001
> | [    5.196807] RDX: 0000000000000000 RSI: ffffffff86ee3980 RDI: ffffffff86e1050e
> | [    5.196807] RBP: ffffffff87260a00 R08: 0000000000000001 R09: 0000000000000001
> | [    5.196807] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000
> | [    5.196807] R13: 0000000000000000 R14: ffffffff87260120 R15: 0000000000000000
> | [    5.196807]  ? mwait_idle+0x70/0x70
> | [    5.196807]  ? mwait_idle+0x70/0x70
> | [    5.196807]  default_idle_call+0x59/0x90
> | [    5.196807]  do_idle+0x217/0x2b0
> | [    5.196807]  cpu_startup_entry+0x19/0x20
> | [    5.196807]  start_kernel+0x997/0x9bc
> | [    5.196807]  ? copy_bootdata+0x18/0x55
> | [    5.196807]  secondary_startup_64_no_verify+0xc2/0xcb
>
> Note that the problem really comes from the rt camp, where spinlocks
> become sleepable and thus can't be used in irq context (on our kernel
> this requires CONFIG_PROVE_RAW_LOCK_NESTING to be set). Since we know
> we're operating in irq context, let's use raw spinlocks instead.
>
> Also, I make the unlock happen earlier, since there is no need to keep
> the lock held once we've finished traversing the @ve_root_list list.
>
> https://jira.sw.ru/browse/PSBM-134756
>
> CC: Kirill Tkhai <ktk...@virtuozzo.com>
> Signed-off-by: Cyrill Gorcunov <gorcu...@gmail.com>
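Not part of the patch, just context for readers following along: under
PREEMPT_RT a spinlock_t becomes a sleeping lock, and
CONFIG_PROVE_RAW_LOCK_NESTING makes lockdep report that nesting even on
non-RT builds, which is exactly the splat above. Only raw_spinlock_t
remains a true spinning lock that is safe to take from hard-irq
context. A minimal sketch of the pattern the patch switches to (the
demo_* names are illustrative, not from the kernel):

#include <linux/spinlock.h>
#include <linux/list.h>

/*
 * raw_spinlock_t keeps spinning even on PREEMPT_RT, so it may be
 * taken from hard-irq context such as the tick path.
 */
static DEFINE_RAW_SPINLOCK(demo_lock);
static LIST_HEAD(demo_list);

struct demo_node {
	struct list_head link;
};

/* Process context: irqsave variant, since the irq path takes the lock too. */
static void demo_add(struct demo_node *node)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&demo_lock, flags);
	list_add(&node->link, &demo_list);
	raw_spin_unlock_irqrestore(&demo_lock, flags);
}

/*
 * Hard-irq context (e.g. the tick): interrupts are already disabled,
 * so the plain lock/unlock pair is enough.
 */
static void demo_walk_from_irq(void)
{
	struct demo_node *node;

	raw_spin_lock(&demo_lock);
	list_for_each_entry(node, &demo_list, link)
		;	/* inspect each node */
	raw_spin_unlock(&demo_lock);
}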
Acked-by: Kirill Tkhai <ktk...@virtuozzo.com>

> ---
>  kernel/sched/core.c    | 10 +++++-----
>  kernel/sched/loadavg.c |  6 +++---
>  2 files changed, 8 insertions(+), 8 deletions(-)
>
> --- vzkernel.orig/kernel/sched/core.c
> +++ vzkernel/kernel/sched/core.c
> @@ -10036,18 +10036,18 @@ static u64 cpu_shares_read_u64(struct cg
>
>  #ifdef CONFIG_VE
>  LIST_HEAD(ve_root_list);
> -DEFINE_SPINLOCK(load_ve_lock);
> +DEFINE_RAW_SPINLOCK(load_ve_lock);
>
>  void link_ve_root_cpu_cgroup(struct cgroup_subsys_state *css)
>  {
>  	struct task_group *tg = css_tg(css);
>  	unsigned long flags;
>
> -	spin_lock_irqsave(&load_ve_lock, flags);
> +	raw_spin_lock_irqsave(&load_ve_lock, flags);
>  	BUG_ON(!(css->flags & CSS_ONLINE));
>  	if (list_empty(&tg->ve_root_list))
>  		list_add(&tg->ve_root_list, &ve_root_list);
> -	spin_unlock_irqrestore(&load_ve_lock, flags);
> +	raw_spin_unlock_irqrestore(&load_ve_lock, flags);
>  }
>
>  void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css)
> @@ -10055,9 +10055,9 @@ void unlink_ve_root_cpu_cg
>  	struct task_group *tg = css_tg(css);
>  	unsigned long flags;
>
> -	spin_lock_irqsave(&load_ve_lock, flags);
> +	raw_spin_lock_irqsave(&load_ve_lock, flags);
>  	list_del_init(&tg->ve_root_list);
> -	spin_unlock_irqrestore(&load_ve_lock, flags);
> +	raw_spin_unlock_irqrestore(&load_ve_lock, flags);
>  }
>  #endif /* CONFIG_VE */
>
> --- vzkernel.orig/kernel/sched/loadavg.c
> +++ vzkernel/kernel/sched/loadavg.c
> @@ -110,7 +110,7 @@ long calc_load_fold_active(struct rq *th
>
>  #ifdef CONFIG_VE
>  extern struct list_head ve_root_list;
> -extern spinlock_t load_ve_lock;
> +extern raw_spinlock_t load_ve_lock;
>
>  void calc_load_ve(void)
>  {
> @@ -122,7 +122,7 @@ void calc_load_ve(void)
>  	 * This is called without jiffies_lock, and here we protect
>  	 * against very rare parallel execution on two or more cpus.
>  	 */
> -	spin_lock(&load_ve_lock);
> +	raw_spin_lock(&load_ve_lock);
>  	list_for_each_entry(tg, &ve_root_list, ve_root_list) {
>  		nr_active = 0;
>  		for_each_possible_cpu(i) {
> @@ -146,6 +146,7 @@ void calc_load_ve(void)
>  		tg->avenrun[1] = calc_load(tg->avenrun[1], EXP_5, nr_active);
>  		tg->avenrun[2] = calc_load(tg->avenrun[2], EXP_15, nr_active);
>  	}
> +	raw_spin_unlock(&load_ve_lock);
>
>  	nr_unint = nr_uninterruptible() * FIXED_1;
>
> @@ -154,7 +155,6 @@ void calc_load_ve(void)
>  	calc_load(kstat_glob.nr_unint_avg[1], EXP_5, nr_unint);
>  	calc_load(kstat_glob.nr_unint_avg[2], EXP_15, nr_unint);
>  	write_seqcount_end(&kstat_glob.nr_unint_avg_seq);
> -	spin_unlock(&load_ve_lock);
>  }
>  #endif /* CONFIG_VE */
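One more note on the early unlock: the nr_unint averaging at the end of
calc_load_ve() is serialized by kstat_glob.nr_unint_avg_seq rather than
by load_ve_lock, so the raw lock can be dropped as soon as the
@ve_root_list walk is done. A sketch of the general shape (the names
below are illustrative, not the kernel's):

#include <linux/spinlock.h>
#include <linux/seqlock.h>

static DEFINE_RAW_SPINLOCK(list_lock);
static seqcount_t stats_seq = SEQCNT_ZERO(stats_seq);
static unsigned long stats_value;

static void update_stats(unsigned long fresh)
{
	raw_spin_lock(&list_lock);
	/* ... walk the data guarded by list_lock ... */
	raw_spin_unlock(&list_lock);	/* done with the list, drop it here */

	/*
	 * The update below is guarded by the seqcount alone, so holding
	 * list_lock across it would only lengthen the critical section.
	 */
	write_seqcount_begin(&stats_seq);
	stats_value = fresh;
	write_seqcount_end(&stats_seq);
}

Holding the raw lock across the seqcount update would keep other CPUs
spinning for no benefit, so narrowing the critical section as the patch
does is the right call.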