________________________________________
From: Uladzislau Rezki <ure...@gmail.com>
Sent: February 4, 2021 22:09
To: Zhang, Qiang
Cc: Uladzislau Rezki; paul...@kernel.org; j...@joelfernandes.org; r...@vger.kernel.org; linux-kernel@vger.kernel.org
Subject: Re: 回复: [PATCH v3] kvfree_rcu: Release page cache under memory pressure
> From: Uladzislau Rezki <ure...@gmail.com>
> Sent: February 2, 2021 3:57
> To: Zhang, Qiang
> Cc: ure...@gmail.com; paul...@kernel.org; j...@joelfernandes.org; r...@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v3] kvfree_rcu: Release page cache under memory pressure
>
> Hello, Zqiang.
>
> > From: Zqiang <qiang.zh...@windriver.com>
> >
> > Add an operation that frees each CPU's existing krcp page cache
> > when the system is under memory pressure.
> >
> > Signed-off-by: Zqiang <qiang.zh...@windriver.com>
> > ---
> >  kernel/rcu/tree.c | 26 ++++++++++++++++++++++++++
> >  1 file changed, 26 insertions(+)
> >
> > diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> > index c1ae1e52f638..644b0f3c7b9f 100644
> > --- a/kernel/rcu/tree.c
> > +++ b/kernel/rcu/tree.c
> > @@ -3571,17 +3571,41 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
> >  }
> >  EXPORT_SYMBOL_GPL(kvfree_call_rcu);
> >
> > +static int free_krc_page_cache(struct kfree_rcu_cpu *krcp)
> > +{
> > +        unsigned long flags;
> > +        struct llist_node *page_list, *pos, *n;
> > +        int freed = 0;
> > +
> > +        raw_spin_lock_irqsave(&krcp->lock, flags);
> > +        page_list = llist_del_all(&krcp->bkvcache);
> > +        krcp->nr_bkv_objs = 0;
> > +        raw_spin_unlock_irqrestore(&krcp->lock, flags);
> > +
> > +        llist_for_each_safe(pos, n, page_list) {
> > +                free_page((unsigned long)pos);
> > +                freed++;
> > +        }
> > +
> > +        return freed;
> > +}
> > +
> >  static unsigned long
> >  kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
> >  {
> >          int cpu;
> >          unsigned long count = 0;
> > +        unsigned long flags;
> >
> >          /* Snapshot count of all CPUs */
> >          for_each_possible_cpu(cpu) {
> >                  struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> >
> >                  count += READ_ONCE(krcp->count);
> > +
> > +                raw_spin_lock_irqsave(&krcp->lock, flags);
> > +                count += krcp->nr_bkv_objs;
> > +                raw_spin_unlock_irqrestore(&krcp->lock, flags);
> >          }
> >
> >          return count;
> > @@ -3598,6 +3622,8 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
> >                  struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
> >
> >                  count = krcp->count;
> > +                count += free_krc_page_cache(krcp);
> > +
> >                  raw_spin_lock_irqsave(&krcp->lock, flags);
> >                  if (krcp->monitor_todo)
> >                          kfree_rcu_drain_unlock(krcp, flags);
> > --
> > 2.17.1
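For context, the two callbacks the patch extends are already wired into
a dedicated kfree_rcu shrinker in kernel/rcu/tree.c. A condensed sketch
of that pre-existing hookup (shrinker API as of the v5.11-era tree, where
register_shrinker() took only the shrinker pointer; comments added here
for illustration):

static struct shrinker kfree_rcu_shrinker = {
        .count_objects = kfree_rcu_shrink_count, /* estimate; with this patch it also counts nr_bkv_objs */
        .scan_objects  = kfree_rcu_shrink_scan,  /* reclaim; with this patch it also drains the page cache */
        .batch = 0,
        .seeks = DEFAULT_SEEKS,
};

/* Registered once at boot, from kfree_rcu_batch_init(): */
if (register_shrinker(&kfree_rcu_shrinker))
        pr_err("Failed to register kfree_rcu() shrinker!\n");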
> Thank you for your patch!
>
> I spent some time looking at how the patch behaves under low-memory
> conditions. To simulate that, I used the "rcuscale" tool with the
> parameters below:
>
> ../rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig CONFIG_NR_CPUS=64 \
>     --bootargs "rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 rcuscale.holdoff=20 \
>     rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" --trust-make
>
> 64 CPUs + 512 MB of memory. In general, my test system was running on
> the edge, sometimes hitting out-of-memory conditions, but it could be
> considered stable with regard to test completion and elapsed time, so
> both were pretty solid.
>
> You can find a comparison on a plot that can be downloaded via:
> wget ftp://vps418301.ovh.net/incoming/release_page_cache_under_low_memory.png
>
> In short, I see that the patched version can lead to longer test
> completion times, whereas the default variant is stable on almost all
> runs. After some analysis and further digging, I came to the
> conclusion that the shrinker's free_krc_page_cache() competes with
> run_page_cache_worker(krcp) running from the kvfree_rcu() context.
>
> That is, during the test the page shrinker is quite active because of
> the low-memory condition. Our callback drains the page cache, whereas
> the kvfree_rcu() path refills it right away, creating a kind of
> vicious circle.
>
> So run_page_cache_worker() should back off for some time when the
> system runs into a low-memory condition or high memory pressure:
>
> diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
> index 7077d73fcb53..446723b9646b 100644
> --- a/kernel/rcu/tree.c
> +++ b/kernel/rcu/tree.c
> @@ -3163,7 +3163,7 @@ struct kfree_rcu_cpu {
>          bool initialized;
>          int count;
>
> -        struct work_struct page_cache_work;
> +        struct delayed_work page_cache_work;
>          atomic_t work_in_progress;
>          struct hrtimer hrtimer;
>
> @@ -3419,7 +3419,7 @@ schedule_page_work_fn(struct hrtimer *t)
>          struct kfree_rcu_cpu *krcp =
>                  container_of(t, struct kfree_rcu_cpu, hrtimer);
>
> -        queue_work(system_highpri_wq, &krcp->page_cache_work);
> +        queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
>          return HRTIMER_NORESTART;
>  }
>
> @@ -3428,7 +3428,7 @@ static void fill_page_cache_func(struct work_struct *work)
>          struct kvfree_rcu_bulk_data *bnode;
>          struct kfree_rcu_cpu *krcp =
>                  container_of(work, struct kfree_rcu_cpu,
> -                        page_cache_work);
> +                        page_cache_work.work);
>          unsigned long flags;
>          bool pushed;
>          int i;
> @@ -3452,15 +3452,22 @@ static void fill_page_cache_func(struct work_struct *work)
>          atomic_set(&krcp->work_in_progress, 0);
>  }
>
> +static bool backoff_page_cache_fill;
> +
>  static void
>  run_page_cache_worker(struct kfree_rcu_cpu *krcp)
>  {
>          if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
>              !atomic_xchg(&krcp->work_in_progress, 1)) {
> -                hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
> -                        HRTIMER_MODE_REL);
> -                krcp->hrtimer.function = schedule_page_work_fn;
> -                hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
> +                if (READ_ONCE(backoff_page_cache_fill)) {
> +                        queue_delayed_work(system_highpri_wq,
> +                                &krcp->page_cache_work, HZ);
> +                        WRITE_ONCE(backoff_page_cache_fill, false);
> +                } else {
> +                        hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC,
> +                                HRTIMER_MODE_REL);
> +                        krcp->hrtimer.function = schedule_page_work_fn;
> +                        hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
> +                }
>          }
>  }
>
> @@ -3644,6 +3651,8 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
>                  raw_spin_unlock_irqrestore(&krcp->lock, flags);
>          }
>
> +        // Low memory condition, limit the page cache worker activity.
> +        WRITE_ONCE(backoff_page_cache_fill, true);
>          return count;
>  }
>
> @@ -4634,7 +4643,7 @@ static void __init kfree_rcu_batch_init(void)
>          }
>
>          INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
> -        INIT_WORK(&krcp->page_cache_work, fill_page_cache_func);
> +        INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
>          krcp->initialized = true;
>  }
>  if (register_shrinker(&kfree_rcu_shrinker))
>
> The patch above fixes it. We should back off the page-fill worker,
> keeping the cache empty until the memory-consumption situation
> normalizes.
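One detail of the diff above that is easy to miss: switching
page_cache_work from struct work_struct to struct delayed_work changes
what the work callback receives. The callback still gets a struct
work_struct pointer, but it now points at the work member embedded in
the delayed_work, which is why the container_of() in
fill_page_cache_func() gains the ".work" step. A minimal standalone
sketch of the pattern (hypothetical my_ctx/my_fill names; the workqueue
API itself is standard):

#include <linux/workqueue.h>

struct my_ctx {
        struct delayed_work dwork; /* embeds a work_struct named .work */
};

static void my_fill(struct work_struct *work)
{
        /* "work" points at dwork.work, hence the two-level member path. */
        struct my_ctx *ctx = container_of(work, struct my_ctx, dwork.work);

        /* Equivalent: container_of(to_delayed_work(work), struct my_ctx, dwork). */
        (void)ctx;
}

/* Arming it:
 *   INIT_DELAYED_WORK(&ctx->dwork, my_fill);
 *   queue_delayed_work(system_highpri_wq, &ctx->dwork, 0);  - run as soon as possible
 *   queue_delayed_work(system_highpri_wq, &ctx->dwork, HZ); - back off ~1 second
 */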
Maybe we can cancel the timer and the work in the kfree_rcu_shrink_count()
function, and then restore the timer after the kfree_rcu_shrink_scan()
function has executed. What do you think?

> I think it is easier to just schedule a periodic delayed work with an
> HZ interval. If the system is in a tight memory situation, that
> periodic work will not be harmful from a CPU-cycles point of view.
> Once things settle, the hrtimer takes over to speed up the
> cache-filling process.

It does look easier. I will resend the patch.

Thanks,
Qiang

> > The other question is: why not call the queue_work() function
> > directly to fill the page cache, instead of going through an
> > hrtimer? What is the purpose of this design?
>
> It is done that way because kvfree_rcu() is supposed to be invocable
> from within the scheduler. That means that if a CPU holds rq->lock,
> queuing the work (or placing it onto the runqueue) would lead to a
> deadlock.
>
> --
> Vlad Rezki
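To make that last point concrete: the trampoline in question is the
pre-existing schedule_page_work_fn() shown in the quoted diff.
hrtimer_start() only enqueues the timer into a per-CPU timer base and
does not need to wake any task, so it is safe under rq->lock;
queue_work(), by contrast, may wake a kworker, which itself takes
rq->lock. The timer callback then runs later in hardirq context, where
rq->lock is no longer held, and queues the work from there. A sketch
condensed from the quoted code (pre-backoff form, comments added for
illustration):

static enum hrtimer_restart schedule_page_work_fn(struct hrtimer *t)
{
        struct kfree_rcu_cpu *krcp =
                container_of(t, struct kfree_rcu_cpu, hrtimer);

        /* Hardirq context, rq->lock not held: queueing work is safe here. */
        queue_work(system_highpri_wq, &krcp->page_cache_work);
        return HRTIMER_NORESTART;
}

/* The arming side, from run_page_cache_worker(), is rq->lock-safe:
 *   hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 *   krcp->hrtimer.function = schedule_page_work_fn;
 *   hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
 */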