On 7/6/2025 11:06 PM, Qi Xi wrote:
> Hi Joel,
>
> After reverting the previous 2 commits and applying the new bugfix, the
> problem is resolved now.
>
> Thanks for your help!
Thanks! I will add your Tested-by tag to the commit and will post for further
review.
- Joel
>
>
> Best regards,
>
> Qi
>
>
> On 2025/7/5 21:12, Joel Fernandes wrote:
>> On Thu, Jul 03, 2025 at 09:04:31AM +0800, Xiongfeng Wang wrote:
>>>
>>> On 2025/7/3 1:24, Joel Fernandes wrote:
>>>>
>>>> On 7/2/2025 6:59 AM, Joel Fernandes wrote:
>>>>>
>>>>> On 7/2/2025 5:14 AM, Qi Xi wrote:
>>>>>> Hi Joel,
>>>>>>
>>>>>> After applying the 2 patches, the problem still exists. Compared to the
>>>>>> previous fixes, which did solve the problem, the difference is
>>>>>> ct_in_irq() in the first patch.
>>>>>>
>>>>>> I am wondering why the "nesting != CT_NESTING_IRQ_NONIDLE" check was added?
>>>>>>
>>>>>>
>>>>>> (previous fix: problem is solved)
>>>>>>
>>>>>> +bool ct_in_irq(void)
>>>>>> +{
>>>>>> + return ct_nmi_nesting() != 0;
>>>>>> +}
>>>>>>
>>>>>> (current fix: problem still exists)
>>>>>>
>>>>>> +bool ct_in_irq(void)
>>>>>> +{
>>>>>> + long nesting = ct_nmi_nesting();
>>>>>> +
>>>>>> + return (nesting && nesting != CT_NESTING_IRQ_NONIDLE);
>>>>>> +}
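
A side note for anyone following the comparison: if I am reading the
context-tracking code right, ct_nmi_nesting() is parked at the sentinel
CT_NESTING_IRQ_NONIDLE while a CPU runs in non-idle task context with no IRQ
in progress, so the two variants disagree exactly when the counter holds that
sentinel. A minimal user-space model (my sketch, assuming the sentinel keeps
its mainline definition of LONG_MAX / 2 + 1) illustrates this:

/*
 * Standalone model of the two ct_in_irq() variants (a sketch, not
 * kernel code). Assumption: CT_NESTING_IRQ_NONIDLE is (LONG_MAX / 2 + 1),
 * the value ct_nmi_nesting() holds in non-idle task context when no
 * IRQ is in progress.
 */
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

#define CT_NESTING_IRQ_NONIDLE (LONG_MAX / 2 + 1)

static long nmi_nesting;	/* stand-in for ct_nmi_nesting() */

static bool ct_in_irq_v1(void)	/* previous fix */
{
	return nmi_nesting != 0;
}

static bool ct_in_irq_v2(void)	/* current fix */
{
	long nesting = nmi_nesting;

	return (nesting && nesting != CT_NESTING_IRQ_NONIDLE);
}

int main(void)
{
	/* Non-idle task context, no IRQ: counter parked at the sentinel. */
	nmi_nesting = CT_NESTING_IRQ_NONIDLE;
	printf("task context: v1=%d v2=%d\n", ct_in_irq_v1(), ct_in_irq_v2());

	/* An IRQ taken from that context bumps the counter past the sentinel. */
	nmi_nesting = CT_NESTING_IRQ_NONIDLE + 2;
	printf("in IRQ:       v1=%d v2=%d\n", ct_in_irq_v1(), ct_in_irq_v2());
	return 0;
}

In other words, the "!= 0" variant treats any non-idle context as in-IRQ,
while the two-condition variant reports true only inside an actual interrupt,
which matches the behavior difference reported above.
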
>>>>> Oh gosh, thanks for spotting that! Indeed, I had changed it to != 0 in
>>>>> the last version but applied an older patch. I will fix it in the tree.
>>>>> Thank you again!
>>>>>
>>>>> Neeraj, would you like this as a separate commit that you can then
>>>>> squash? Or could you fix it up in your tree?
>>>>>
>>>> Qi, Xiongfeng, I am currently working on an alternative fix after
>>>> discussing with the crew. I will keep you posted with the fix, and it
>>>> would be good to get your testing on it once I have it (hopefully in a
>>>> couple of days). Thanks for the report!
>>> Sure, we are glad to help test once we get the fix patch.
>> Could you try the following patch? I tested it and it fixes the issue for me.
>>
>> If you prefer git, then cherry-pick the HEAD commit from:
>>
>> https://git.kernel.org/pub/scm/linux/kernel/git/jfern/linux.git/log/?h=rcu/irq-exit-v2-no-task
>> (cherry-pick sha a58cc91fdca766cb719fb8beee3bb10ffe8e9d58)
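
For example, something like this should work (fetching the branch first so
the sha is available locally):

git fetch https://git.kernel.org/pub/scm/linux/kernel/git/jfern/linux.git rcu/irq-exit-v2-no-task
git cherry-pick a58cc91fdca766cb719fb8beee3bb10ffe8e9d58
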
>>
>> ---8<---
>>
>> From: Joel Fernandes <[email protected]>
>> Subject: [PATCH] rcu: Fix rcu_read_unlock() deadloop due to IRQ work
>>
>> Signed-off-by: Joel Fernandes <[email protected]>
>> ---
>> kernel/rcu/tree.h | 11 ++++++++++-
>> kernel/rcu/tree_plugin.h | 29 ++++++++++++++++++++++-------
>> 2 files changed, 32 insertions(+), 8 deletions(-)
>>
>> diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
>> index 3830c19cf2f6..f8f612269e6e 100644
>> --- a/kernel/rcu/tree.h
>> +++ b/kernel/rcu/tree.h
>> @@ -174,6 +174,15 @@ struct rcu_snap_record {
>> unsigned long jiffies; /* Track jiffies value */
>> };
>> +/*
>> + * The IRQ work (deferred_qs_iw) is used by RCU to get scheduler's
>> attention.
>> + * It can be in one of the following states:
>> + * - DEFER_QS_IDLE: An IRQ work was never scheduled.
>> + * - DEFER_QS_PENDING: An IRQ work was scheduler but never run.
>> + */
>> +#define DEFER_QS_IDLE 0
>> +#define DEFER_QS_PENDING 1
>> +
>> /* Per-CPU data for read-copy update. */
>> struct rcu_data {
>> /* 1) quiescent-state and grace-period handling : */
>> @@ -192,7 +201,7 @@ struct rcu_data {
>> /* during and after the last grace */
>> /* period it is aware of. */
>> struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */
>> - bool defer_qs_iw_pending; /* Scheduler attention pending? */
>> + int defer_qs_iw_pending; /* Scheduler attention pending? */
>> struct work_struct strict_work; /* Schedule readers for strict GPs. */
>> /* 2) batch handling */
>> diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
>> index dd1c156c1759..baf57745b42f 100644
>> --- a/kernel/rcu/tree_plugin.h
>> +++ b/kernel/rcu/tree_plugin.h
>> @@ -486,13 +486,16 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct
>> *t, unsigned long flags)
>> struct rcu_node *rnp;
>> union rcu_special special;
>> + rdp = this_cpu_ptr(&rcu_data);
>> + if (rdp->defer_qs_iw_pending == DEFER_QS_PENDING)
>> + rdp->defer_qs_iw_pending = DEFER_QS_IDLE;
>> +
>> /*
>> * If RCU core is waiting for this CPU to exit its critical section,
>> * report the fact that it has exited. Because irqs are disabled,
>> * t->rcu_read_unlock_special cannot change.
>> */
>> special = t->rcu_read_unlock_special;
>> - rdp = this_cpu_ptr(&rcu_data);
>> if (!special.s && !rdp->cpu_no_qs.b.exp) {
>> local_irq_restore(flags);
>> return;
>> @@ -623,12 +626,24 @@ notrace void rcu_preempt_deferred_qs(struct
>> task_struct *t)
>> */
>> static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
>> {
>> - unsigned long flags;
>> - struct rcu_data *rdp;
>> + volatile unsigned long flags;
>> + struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
>> - rdp = container_of(iwp, struct rcu_data, defer_qs_iw);
>> local_irq_save(flags);
>> - rdp->defer_qs_iw_pending = false;
>> +
>> + /*
>> + * Requeue the IRQ work on the next unlock in the following situation:
>> + * 1. rcu_read_unlock() queues IRQ work (state -> DEFER_QS_PENDING)
>> + * 2. CPU enters a new rcu_read_lock()
>> + * 3. IRQ work runs but cannot report QS due to rcu_preempt_depth() > 0
>> + * 4. rcu_read_unlock() does not re-queue work (state still PENDING)
>> + * 5. Deferred QS reporting does not happen.
>> + */
>> + if (rcu_preempt_depth() > 0) {
>> + WRITE_ONCE(rdp->defer_qs_iw_pending, DEFER_QS_IDLE);
>> + local_irq_restore(flags);
>> + return;
>> + }
>> local_irq_restore(flags);
>> }
>> @@ -675,7 +690,7 @@ static void rcu_read_unlock_special(struct task_struct
>> *t)
>> set_tsk_need_resched(current);
>> set_preempt_need_resched();
>> if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
>> - expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) {
>> + expboost && rdp->defer_qs_iw_pending != DEFER_QS_PENDING && cpu_online(rdp->cpu)) {
>> // Get scheduler to re-evaluate and call hooks.
>> // If !IRQ_WORK, FQS scan will eventually IPI.
>> if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
>> @@ -685,7 +700,7 @@ static void rcu_read_unlock_special(struct task_struct
>> *t)
>> else
>> init_irq_work(&rdp->defer_qs_iw, rcu_preempt_deferred_qs_handler);
>> - rdp->defer_qs_iw_pending = true;
>> + rdp->defer_qs_iw_pending = DEFER_QS_PENDING;
>> irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
>> }
>> }