[PATCHv12 1/4] genirq: Provide a snapshot mechanism for interrupt statistics

2024-03-06 Thread Bitao Hu
The soft lockup detector lacks a mechanism to identify interrupt storms
as root cause of a lockup. To enable this the detector needs a
mechanism to snapshot the interrupt count statistics on a CPU when the
detector observes a potential lockup scenario and compare that against
the interrupt count when it warns about the lockup later on. The number
of interrupts in that period give a hint whether the lockup might be
caused by an interrupt storm.

Instead of having extra storage in the lockup detector and accessing
the internals of the interrupt descriptor directly, convert the per CPU
irq_desc::kstat_irq member to a data structure which contains the
counter plus a snapshot member and provide interfaces to take a
snapshot of all interrupts on the current CPU and to retrieve the delta
of a specific interrupt later on.

Originally-by: Thomas Gleixner 
Signed-off-by: Bitao Hu 
Reviewed-by: Liu Song 
Reviewed-by: Douglas Anderson 
---
 arch/mips/dec/setup.c|  2 +-
 arch/parisc/kernel/smp.c |  2 +-
 arch/powerpc/kvm/book3s_hv_rm_xics.c |  2 +-
 include/linux/irqdesc.h  | 16 ++--
 include/linux/kernel_stat.h  |  8 ++
 kernel/irq/Kconfig   |  4 +++
 kernel/irq/internals.h   |  2 +-
 kernel/irq/irqdesc.c | 38 +++-
 kernel/irq/proc.c|  5 ++--
 scripts/gdb/linux/interrupts.py  |  6 ++---
 10 files changed, 66 insertions(+), 19 deletions(-)

diff --git a/arch/mips/dec/setup.c b/arch/mips/dec/setup.c
index 6c3704f51d0d..87f0a1436bf9 100644
--- a/arch/mips/dec/setup.c
+++ b/arch/mips/dec/setup.c
@@ -756,7 +756,7 @@ void __init arch_init_irq(void)
NULL))
pr_err("Failed to register fpu interrupt\n");
desc_fpu = irq_to_desc(irq_fpu);
-   fpu_kstat_irq = this_cpu_ptr(desc_fpu->kstat_irqs);
+   fpu_kstat_irq = this_cpu_ptr(&desc_fpu->kstat_irqs->cnt);
}
if (dec_interrupt[DEC_IRQ_CASCADE] >= 0) {
if (request_irq(dec_interrupt[DEC_IRQ_CASCADE], no_action,
diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c
index 444154271f23..800eb64e91ad 100644
--- a/arch/parisc/kernel/smp.c
+++ b/arch/parisc/kernel/smp.c
@@ -344,7 +344,7 @@ static int smp_boot_one_cpu(int cpuid, struct task_struct 
*idle)
struct irq_desc *desc = irq_to_desc(i);
 
if (desc && desc->kstat_irqs)
-   *per_cpu_ptr(desc->kstat_irqs, cpuid) = 0;
+   *per_cpu_ptr(desc->kstat_irqs, cpuid) = (struct 
irqstat) { };
}
 #endif
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c 
b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index e42984878503..f2636414d82a 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -837,7 +837,7 @@ static inline void this_cpu_inc_rm(unsigned int __percpu 
*addr)
  */
 static void kvmppc_rm_handle_irq_desc(struct irq_desc *desc)
 {
-   this_cpu_inc_rm(desc->kstat_irqs);
+   this_cpu_inc_rm(&desc->kstat_irqs->cnt);
__this_cpu_inc(kstat.irqs_sum);
 }
 
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index d9451d456a73..fd091c35d572 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -17,6 +17,18 @@ struct irq_desc;
 struct irq_domain;
 struct pt_regs;
 
+/**
+ * struct irqstat - interrupt statistics
+ * @cnt:   real-time interrupt count
+ * @ref:   snapshot of interrupt count
+ */
+struct irqstat {
+   unsigned intcnt;
+#ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT
+   unsigned intref;
+#endif
+};
+
 /**
  * struct irq_desc - interrupt descriptor
  * @irq_common_data:   per irq and chip data passed down to chip functions
@@ -55,7 +67,7 @@ struct pt_regs;
 struct irq_desc {
struct irq_common_data  irq_common_data;
struct irq_data irq_data;
-   unsigned int __percpu   *kstat_irqs;
+   struct irqstat __percpu *kstat_irqs;
irq_flow_handler_t  handle_irq;
struct irqaction*action;/* IRQ action list */
unsigned intstatus_use_accessors;
@@ -119,7 +131,7 @@ extern struct irq_desc irq_desc[NR_IRQS];
 static inline unsigned int irq_desc_kstat_cpu(struct irq_desc *desc,
  unsigned int cpu)
 {
-   return desc->kstat_irqs ? *per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
+   return desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0;
 }
 
 static inline struct irq_desc *irq_data_to_desc(struct irq_data *data)
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 9935f7ecbfb9..9c042c6384bb 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -79,6 +79,14 @@ static inline unsigned int kstat_cpu_softirqs_sum(int cpu)
return sum;
 }
 
+#ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT
+extern void kstat_snapshot_irqs(

Re: [PATCHv12 1/4] genirq: Provide a snapshot mechanism for interrupt statistics

2024-04-09 Thread Thomas Gleixner
On Wed, Mar 06 2024 at 20:52, Bitao Hu wrote:
> The soft lockup detector lacks a mechanism to identify interrupt storms
> as root cause of a lockup. To enable this the detector needs a
> mechanism to snapshot the interrupt count statistics on a CPU when the
> detector observes a potential lockup scenario and compare that against
> the interrupt count when it warns about the lockup later on. The number
> of interrupts in that period give a hint whether the lockup might be
> caused by an interrupt storm.
>
> Instead of having extra storage in the lockup detector and accessing
> the internals of the interrupt descriptor directly, convert the per CPU
> irq_desc::kstat_irq member to a data structure which contains the
> counter plus a snapshot member and provide interfaces to take a
> snapshot of all interrupts on the current CPU and to retrieve the delta
> of a specific interrupt later on.
>
> Originally-by: Thomas Gleixner 
> Signed-off-by: Bitao Hu 
> Reviewed-by: Liu Song 
> Reviewed-by: Douglas Anderson 

This does not apply anymore.

Also can you please split this apart to convert kstat_irqs to a struct
with just the count in it and then add the snapshot mechanics on top.

Thanks,

tglx



Re: [PATCHv12 1/4] genirq: Provide a snapshot mechanism for interrupt statistics

2024-04-09 Thread Bitao Hu

Hi,

On 2024/4/9 17:58, Thomas Gleixner wrote:


This does not apply anymore.

OK, I will update this patch based on the latest kernel code.


Also can you please split this apart to convert kstat_irqs to a struct
with just the count in it and then add the snapshot mechanics on top.


OK, I will split this patch into two. The changelog for the first patch
will be as follows.

genirq: Convert kstat_irqs to a struct

The irq_desc::kstat_irqs member is a per-CPU variable of type int, and
it is only capable of counting. The snapshot mechanism for interrupt
statistics will be added soon, which requires an additional variable to
store snapshot. To facilitate expansion, convert kstat_irqs here to
a struct containing only the count.

By the way, what do you think of my reason for using printk() instead of
pr_crit()? Should I change this part of the code in v13?

Besides, are there any other issues with this set of patches? I hope we
can resolve all points of contention in v13.

Best Regards,

Bitao Hu


Re: [PATCHv12 1/4] genirq: Provide a snapshot mechanism for interrupt statistics

2024-04-10 Thread Thomas Gleixner
On Wed, Apr 10 2024 at 14:45, Bitao Hu wrote:
> On 2024/4/9 17:58, Thomas Gleixner wrote:
> By the way, what do you think of my reason for using printk() instead of
> pr_crit()? Should I change this part of the code in v13?

Either way is fine. Just put a proper explanation into the change log if
you stick with printk().

Thanks,

tglx